seqno_to_time_mapping.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. // Copyright (c) Meta Platforms, Inc. and affiliates.
  2. //
  3. // This source code is licensed under both the GPLv2 (found in the
  4. // COPYING file in the root directory) and Apache 2.0 License
  5. // (found in the LICENSE.Apache file in the root directory).
  6. #pragma once
  7. #include <algorithm>
  8. #include <cinttypes>
  9. #include <cstdint>
  10. #include <deque>
  11. #include <functional>
  12. #include <iterator>
  13. #include <string>
  14. #include "db/dbformat.h"
  15. #include "rocksdb/status.h"
  16. #include "rocksdb/types.h"
  17. namespace ROCKSDB_NAMESPACE {
  18. constexpr uint64_t kUnknownTimeBeforeAll = 0;
  19. constexpr SequenceNumber kUnknownSeqnoBeforeAll = 0;
  20. // Maximum number of entries can be encoded into SST. The data is delta encode
  21. // so the maximum data usage for each SST is < 0.3K
  22. constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100;
  23. // Maximum number of entries per CF. If there's only CF with this feature on,
  24. // the max span divided by this number, so for example, if
  25. // preclude_last_level_data_seconds = 100000 (~1day), then it will sample the
  26. // seqno -> time every 1000 seconds (~17minutes). Then the maximum entry it
  27. // needs is 100.
  28. // When there are multiple CFs having this feature on, the sampling cadence is
  29. // determined by the smallest setting, the capacity is determined the largest
  30. // setting, also it's caped by kMaxSeqnoTimePairsPerCF * 10.
  31. constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
  32. constexpr uint64_t kMaxSeqnoToTimeEntries = kMaxSeqnoTimePairsPerCF * 10;
  33. // SeqnoToTimeMapping stores a sampled mapping from sequence numbers to
  34. // unix times (seconds since epoch). This information provides rough bounds
  35. // between sequence numbers and their write times, but is primarily designed
  36. // for getting a best lower bound on the sequence number of data written no
  37. // later than a specified time.
  38. //
  39. // For ease of sampling, it is assumed that the recorded time in each pair
  40. // comes at or after the sequence number and before the next sequence number,
  41. // so this example:
  42. //
  43. // Seqno: 10, 11, ... 20, 21, ... 30, 31, ...
  44. // Time: ... 500 ... 600 ... 700 ...
  45. //
  46. // would be represented as
  47. // 10 -> 500
  48. // 20 -> 600
  49. // 30 -> 700
  50. //
  51. // In typical operation, the list is in "enforced" operation to maintain
  52. // invariants on sortedness, capacity, and time span of entries. However, some
  53. // operations will put the object into "unenforced" mode where those invariants
  54. // are relaxed until explicitly or implicitly re-enforced (which will sort and
  55. // filter the data).
  56. //
  57. // NOT thread safe - requires external synchronization, except a const
  58. // object allows concurrent reads.
  59. class SeqnoToTimeMapping {
  60. public:
  61. // A simple struct for sequence number to time pair
  62. struct SeqnoTimePair {
  63. SequenceNumber seqno = 0;
  64. uint64_t time = 0;
  65. SeqnoTimePair() = default;
  66. SeqnoTimePair(SequenceNumber _seqno, uint64_t _time)
  67. : seqno(_seqno), time(_time) {}
  68. // Encode to dest string
  69. void Encode(std::string& dest) const;
  70. // Decode the value from input Slice and remove it from the input
  71. Status Decode(Slice& input);
  72. // For delta encoding
  73. SeqnoTimePair ComputeDelta(const SeqnoTimePair& base) const {
  74. return {seqno - base.seqno, time - base.time};
  75. }
  76. // For delta decoding
  77. void ApplyDelta(const SeqnoTimePair& delta_or_base) {
  78. seqno += delta_or_base.seqno;
  79. time += delta_or_base.time;
  80. }
  81. // If another pair can be combined into this one (for optimizing
  82. // normal SeqnoToTimeMapping behavior), then this mapping is modified
  83. // and true is returned, indicating the other mapping can be discarded.
  84. // Otherwise false is returned and nothing is changed.
  85. bool Merge(const SeqnoTimePair& other);
  86. // Ordering used for Sort()
  87. bool operator<(const SeqnoTimePair& other) const {
  88. return std::tie(seqno, time) < std::tie(other.seqno, other.time);
  89. }
  90. bool operator==(const SeqnoTimePair& other) const {
  91. return std::tie(seqno, time) == std::tie(other.seqno, other.time);
  92. }
  93. static bool SeqnoLess(const SeqnoTimePair& a, const SeqnoTimePair& b) {
  94. return a.seqno < b.seqno;
  95. }
  96. static bool TimeLess(const SeqnoTimePair& a, const SeqnoTimePair& b) {
  97. return a.time < b.time;
  98. }
  99. };
  100. // Construct an empty SeqnoToTimeMapping with no limits.
  101. SeqnoToTimeMapping() {}
  102. // ==== Configuration for enforced state ==== //
  103. // Set a time span beyond which old entries can be deleted. Specifically,
  104. // under enforcement mode, the structure will maintian only one entry older
  105. // than the newest entry time minus max_time_span, so that
  106. // GetProximalSeqnoBeforeTime queries back to that time return a good result.
  107. // UINT64_MAX == unlimited. 0 == retain just one latest entry. Returns *this.
  108. SeqnoToTimeMapping& SetMaxTimeSpan(uint64_t max_time_span);
  109. // Set the nominal capacity under enforcement mode. The structure is allowed
  110. // to grow some reasonable fraction larger but will automatically compact
  111. // down to this size. UINT64_MAX == unlimited. 0 == retain nothing.
  112. // Returns *this.
  113. SeqnoToTimeMapping& SetCapacity(uint64_t capacity);
  114. // ==== Modifiers, enforced ==== //
  115. // Adds a series of mappings interpolating from from_seqno->from_time to
  116. // to_seqno->to_time. This can only be called on an empty object and both
  117. // seqno range and time range are inclusive.
  118. void PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno,
  119. uint64_t from_time, uint64_t to_time);
  120. // Append a new entry to the list. The `seqno` should be >= all previous
  121. // entries. This operation maintains enforced mode invariants, and will
  122. // automatically (re-)enter enforced mode if not already in that state.
  123. // Returns false if the entry was merged into the most recent entry
  124. // rather than creating a new entry.
  125. bool Append(SequenceNumber seqno, uint64_t time);
  126. bool Append(std::pair<SequenceNumber, uint64_t> seqno_time_pair) {
  127. return Append(seqno_time_pair.first, seqno_time_pair.second);
  128. }
  129. // Clear all entries and (re-)enter enforced mode if not already in that
  130. // state. Enforced limits are unchanged.
  131. void Clear() {
  132. pairs_.clear();
  133. enforced_ = true;
  134. }
  135. // Enters the "enforced" state if not already in that state, which is
  136. // useful before copying or querying. This will
  137. // * Sort the entries
  138. // * Discard any obsolete entries, which is aided if the caller specifies
  139. // the `now` time so that entries older than now minus the max time span can
  140. // be discarded.
  141. // * Compact the entries to the configured capacity.
  142. // Returns *this.
  143. SeqnoToTimeMapping& Enforce(uint64_t now = 0);
  144. // ==== Modifiers, unenforced ==== //
  145. // Add a new random entry and enter "unenforced" state. Unlike Append(), it
  146. // can be any historical data.
  147. void AddUnenforced(SequenceNumber seqno, uint64_t time);
  148. // Decode and add the entries to this mapping object. Unless starting from
  149. // an empty mapping with no configured enforcement limits, this operation
  150. // enters the unenforced state.
  151. Status DecodeFrom(const std::string& pairs_str);
  152. // Copies entries from the src mapping object to this one, limited to entries
  153. // needed to answer GetProximalTimeBeforeSeqno() queries for the given
  154. // *inclusive* seqno range. The source structure must be in enforced
  155. // state as a precondition. Unless starting with this object as empty mapping
  156. // with no configured enforcement limits, this object enters the unenforced
  157. // state.
  158. void CopyFromSeqnoRange(const SeqnoToTimeMapping& src,
  159. SequenceNumber from_seqno,
  160. SequenceNumber to_seqno = kMaxSequenceNumber);
  161. void CopyFrom(const SeqnoToTimeMapping& src) {
  162. CopyFromSeqnoRange(src, kUnknownSeqnoBeforeAll, kMaxSequenceNumber);
  163. }
  164. // ==== Accessors ==== //
  165. // Given a sequence number, return the best (largest / newest) known time
  166. // that is no later than the write time of that given sequence number.
  167. // If no such specific time is known, returns kUnknownTimeBeforeAll.
  168. // Using the example in the class comment above,
  169. // GetProximalTimeBeforeSeqno(10) -> kUnknownTimeBeforeAll
  170. // GetProximalTimeBeforeSeqno(11) -> 500
  171. // GetProximalTimeBeforeSeqno(20) -> 500
  172. // GetProximalTimeBeforeSeqno(21) -> 600
  173. // Because this is a const operation depending on sortedness, the structure
  174. // must be in enforced state as a precondition.
  175. uint64_t GetProximalTimeBeforeSeqno(SequenceNumber seqno) const;
  176. // Given a time, return the best (largest) sequence number whose write time
  177. // is no later than that given time. If no such specific sequence number is
  178. // known, returns kUnknownSeqnoBeforeAll. Using the example in the class
  179. // comment above,
  180. // GetProximalSeqnoBeforeTime(499) -> kUnknownSeqnoBeforeAll
  181. // GetProximalSeqnoBeforeTime(500) -> 10
  182. // GetProximalSeqnoBeforeTime(599) -> 10
  183. // GetProximalSeqnoBeforeTime(600) -> 20
  184. // Because this is a const operation depending on sortedness, the structure
  185. // must be in enforced state as a precondition.
  186. SequenceNumber GetProximalSeqnoBeforeTime(uint64_t time) const;
  187. // Given current time, the configured `preserve_internal_time_seconds`, and
  188. // `preclude_last_level_data_seconds`, find the relevant cutoff sequence
  189. // numbers for tiering.
  190. void GetCurrentTieringCutoffSeqnos(
  191. uint64_t current_time, uint64_t preserve_internal_time_seconds,
  192. uint64_t preclude_last_level_data_seconds,
  193. SequenceNumber* preserve_time_min_seqno,
  194. SequenceNumber* preclude_last_level_min_seqno) const;
  195. // Encode to a binary string by appending to `dest`.
  196. // Because this is a const operation depending on sortedness, the structure
  197. // must be in enforced state as a precondition.
  198. void EncodeTo(std::string& dest) const;
  199. // Return the number of entries
  200. size_t Size() const { return pairs_.size(); }
  201. uint64_t GetCapacity() const { return capacity_; }
  202. // If the internal list is empty
  203. bool Empty() const { return pairs_.empty(); }
  204. // return the string for user message
  205. // Note: Not efficient, okay for print
  206. std::string ToHumanString() const;
  207. #ifndef NDEBUG
  208. const SeqnoTimePair& TEST_GetLastEntry() const { return pairs_.back(); }
  209. const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const {
  210. return pairs_;
  211. }
  212. bool TEST_IsEnforced() const { return enforced_; }
  213. #endif
  214. private:
  215. uint64_t max_time_span_ = UINT64_MAX;
  216. uint64_t capacity_ = UINT64_MAX;
  217. std::deque<SeqnoTimePair> pairs_;
  218. // Whether this object is in the "enforced" state. Between calls to public
  219. // functions, enforced_==true means that
  220. // * `pairs_` is sorted
  221. // * The capacity limit (non-strict) is met
  222. // * The time span limit is met
  223. // However, some places within the implementation (Append()) will temporarily
  224. // violate those last two conditions while enforced_==true. See also the
  225. // Enforce*() and Sort*() private functions below.
  226. bool enforced_ = true;
  227. void EnforceMaxTimeSpan(uint64_t now = 0);
  228. void EnforceCapacity(bool strict);
  229. void SortAndMerge();
  230. using pair_const_iterator =
  231. std::deque<SeqnoToTimeMapping::SeqnoTimePair>::const_iterator;
  232. pair_const_iterator FindGreaterTime(uint64_t time) const;
  233. pair_const_iterator FindGreaterSeqno(SequenceNumber seqno) const;
  234. pair_const_iterator FindGreaterEqSeqno(SequenceNumber seqno) const;
  235. };
  236. // A struct to help combining settings across column families
  237. struct MinAndMaxPreserveSeconds {
  238. uint64_t min_preserve_seconds = std::numeric_limits<uint64_t>::max();
  239. uint64_t max_preserve_seconds = std::numeric_limits<uint64_t>::min();
  240. MinAndMaxPreserveSeconds() = default;
  241. template <class CFOpts>
  242. explicit MinAndMaxPreserveSeconds(const CFOpts& opts) {
  243. Combine(opts);
  244. }
  245. bool IsEnabled() const {
  246. return min_preserve_seconds != std::numeric_limits<uint64_t>::max();
  247. }
  248. // Incorporate another CF's settings into the result. If preserve/preclude are
  249. // disabled for this CF, they are excluded from the result.
  250. template <class CFOpts>
  251. void Combine(const CFOpts& opts) {
  252. uint64_t preserve_seconds = std::max(opts.preserve_internal_time_seconds,
  253. opts.preclude_last_level_data_seconds);
  254. if (preserve_seconds > 0) {
  255. min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
  256. max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
  257. }
  258. }
  259. // Choose how many seconds between mapping samples
  260. uint64_t GetRecodingCadence() const {
  261. if (IsEnabled()) {
  262. // round up to 1 when the time_duration is smaller than
  263. // kMaxSeqnoTimePairsPerCF
  264. return (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
  265. kMaxSeqnoTimePairsPerCF;
  266. } else {
  267. // disabled
  268. return 0;
  269. }
  270. }
  271. };
  272. // === Utility methods used for TimedPut === //
  273. // Pack a value Slice and a unix write time into buffer `buf` and return a Slice
  274. // for the packed value backed by `buf`.
  275. Slice PackValueAndWriteTime(const Slice& value, uint64_t unix_write_time,
  276. std::string* buf);
  277. // Pack a value Slice and a sequence number into buffer `buf` and return a Slice
  278. // for the packed value backed by `buf`.
  279. Slice PackValueAndSeqno(const Slice& value, SequenceNumber seqno,
  280. std::string* buf);
  281. // Parse a packed value to get the write time.
  282. uint64_t ParsePackedValueForWriteTime(const Slice& value);
  283. // Parse a packed value to get the value and the write time. The unpacked value
  284. // Slice is backed up by the same memory backing up `value`.
  285. std::tuple<Slice, uint64_t> ParsePackedValueWithWriteTime(const Slice& value);
  286. // Parse a packed value to get the sequence number.
  287. SequenceNumber ParsePackedValueForSeqno(const Slice& value);
  288. // Parse a packed value to get the value and the sequence number. The unpacked
  289. // value Slice is backed up by the same memory backing up `value`.
  290. std::tuple<Slice, SequenceNumber> ParsePackedValueWithSeqno(const Slice& value);
  291. // Parse a packed value to get the value. The unpacked value Slice is backed up
  292. // by the same memory backing up `value`.
  293. Slice ParsePackedValueForValue(const Slice& value);
  294. } // namespace ROCKSDB_NAMESPACE