seqno_to_time_mapping.cc 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. // Copyright (c) Meta Platforms, Inc. and affiliates.
  2. //
  3. // This source code is licensed under both the GPLv2 (found in the
  4. // COPYING file in the root directory) and Apache 2.0 License
  5. // (found in the LICENSE.Apache file in the root directory).
  6. #include "db/seqno_to_time_mapping.h"
  7. #include <algorithm>
  8. #include <cassert>
  9. #include <cstdint>
  10. #include <deque>
  11. #include <functional>
  12. #include <queue>
  13. #include <vector>
  14. #include "db/version_edit.h"
  15. #include "util/string_util.h"
  16. namespace ROCKSDB_NAMESPACE {
  17. SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterTime(
  18. uint64_t time) const {
  19. assert(enforced_);
  20. return std::upper_bound(pairs_.cbegin(), pairs_.cend(),
  21. SeqnoTimePair{0, time}, SeqnoTimePair::TimeLess);
  22. }
  23. SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterEqSeqno(
  24. SequenceNumber seqno) const {
  25. assert(enforced_);
  26. return std::lower_bound(pairs_.cbegin(), pairs_.cend(),
  27. SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess);
  28. }
  29. SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterSeqno(
  30. SequenceNumber seqno) const {
  31. assert(enforced_);
  32. return std::upper_bound(pairs_.cbegin(), pairs_.cend(),
  33. SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess);
  34. }
  35. uint64_t SeqnoToTimeMapping::GetProximalTimeBeforeSeqno(
  36. SequenceNumber seqno) const {
  37. assert(enforced_);
  38. // Find the last entry with a seqno strictly less than the given seqno.
  39. // First, find the first entry >= the given seqno (or end)
  40. auto it = FindGreaterEqSeqno(seqno);
  41. if (it == pairs_.cbegin()) {
  42. return kUnknownTimeBeforeAll;
  43. }
  44. // Then return data from previous.
  45. it--;
  46. return it->time;
  47. }
  48. SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime(
  49. uint64_t time) const {
  50. assert(enforced_);
  51. // Find the last entry with a time <= the given time.
  52. // First, find the first entry > the given time (or end).
  53. auto it = FindGreaterTime(time);
  54. if (it == pairs_.cbegin()) {
  55. return kUnknownSeqnoBeforeAll;
  56. }
  57. // Then return data from previous.
  58. --it;
  59. return it->seqno;
  60. }
  61. void SeqnoToTimeMapping::GetCurrentTieringCutoffSeqnos(
  62. uint64_t current_time, uint64_t preserve_internal_time_seconds,
  63. uint64_t preclude_last_level_data_seconds,
  64. SequenceNumber* preserve_time_min_seqno,
  65. SequenceNumber* preclude_last_level_min_seqno) const {
  66. uint64_t preserve_time_duration = std::max(preserve_internal_time_seconds,
  67. preclude_last_level_data_seconds);
  68. if (preserve_time_duration <= 0) {
  69. return;
  70. }
  71. uint64_t preserve_time = current_time > preserve_time_duration
  72. ? current_time - preserve_time_duration
  73. : 0;
  74. // GetProximalSeqnoBeforeTime tells us the last seqno known to have been
  75. // written at or before the given time. + 1 to get the minimum we should
  76. // preserve without excluding anything that might have been written on or
  77. // after the given time.
  78. if (preserve_time_min_seqno) {
  79. *preserve_time_min_seqno = GetProximalSeqnoBeforeTime(preserve_time) + 1;
  80. }
  81. if (preclude_last_level_data_seconds > 0 && preclude_last_level_min_seqno) {
  82. uint64_t preclude_last_level_time =
  83. current_time > preclude_last_level_data_seconds
  84. ? current_time - preclude_last_level_data_seconds
  85. : 0;
  86. *preclude_last_level_min_seqno =
  87. GetProximalSeqnoBeforeTime(preclude_last_level_time) + 1;
  88. }
  89. }
  90. void SeqnoToTimeMapping::EnforceMaxTimeSpan(uint64_t now) {
  91. assert(enforced_); // at least sorted
  92. uint64_t cutoff_time;
  93. if (pairs_.size() <= 1) {
  94. return;
  95. }
  96. if (now > 0) {
  97. if (now < max_time_span_) {
  98. // Nothing eligible to prune / avoid underflow
  99. return;
  100. }
  101. cutoff_time = now - max_time_span_;
  102. } else {
  103. const auto& last = pairs_.back();
  104. if (last.time < max_time_span_) {
  105. // Nothing eligible to prune / avoid underflow
  106. return;
  107. }
  108. cutoff_time = last.time - max_time_span_;
  109. }
  110. // Keep one entry <= cutoff_time
  111. while (pairs_.size() >= 2 && pairs_[0].time <= cutoff_time &&
  112. pairs_[1].time <= cutoff_time) {
  113. pairs_.pop_front();
  114. }
  115. }
  116. void SeqnoToTimeMapping::EnforceCapacity(bool strict) {
  117. assert(enforced_); // at least sorted
  118. uint64_t strict_cap = capacity_;
  119. if (strict_cap == 0) {
  120. pairs_.clear();
  121. return;
  122. }
  123. // Treat cap of 1 as 2 to work with the below algorithm (etc.)
  124. if (strict_cap == 1) {
  125. strict_cap = 2;
  126. }
  127. // When !strict, allow being over nominal capacity by a modest fraction.
  128. uint64_t effective_cap = strict_cap + (strict ? 0 : strict_cap / 8);
  129. if (effective_cap < strict_cap) {
  130. // Correct overflow
  131. effective_cap = UINT64_MAX;
  132. }
  133. if (pairs_.size() <= effective_cap) {
  134. return;
  135. }
  136. // The below algorithm expects at least one removal candidate between first
  137. // and last.
  138. assert(pairs_.size() >= 3);
  139. size_t to_remove_count = pairs_.size() - strict_cap;
  140. struct RemovalCandidate {
  141. uint64_t new_time_gap;
  142. std::deque<SeqnoTimePair>::iterator it;
  143. RemovalCandidate(uint64_t _new_time_gap,
  144. std::deque<SeqnoTimePair>::iterator _it)
  145. : new_time_gap(_new_time_gap), it(_it) {}
  146. bool operator>(const RemovalCandidate& other) const {
  147. if (new_time_gap == other.new_time_gap) {
  148. // If same gap, treat the newer entry as less attractive
  149. // for removal (like larger gap)
  150. return it->seqno > other.it->seqno;
  151. }
  152. return new_time_gap > other.new_time_gap;
  153. }
  154. };
  155. // A priority queue of best removal candidates (smallest time gap remaining
  156. // after removal)
  157. using RC = RemovalCandidate;
  158. using PQ = std::priority_queue<RC, std::vector<RC>, std::greater<RC>>;
  159. PQ pq;
  160. // Add all the candidates (not including first and last)
  161. {
  162. auto it = pairs_.begin();
  163. assert(it->time != kUnknownTimeBeforeAll);
  164. uint64_t prev_prev_time = it->time;
  165. ++it;
  166. assert(it->time != kUnknownTimeBeforeAll);
  167. auto prev_it = it;
  168. ++it;
  169. while (it != pairs_.end()) {
  170. assert(it->time != kUnknownTimeBeforeAll);
  171. uint64_t gap = it->time - prev_prev_time;
  172. pq.emplace(gap, prev_it);
  173. prev_prev_time = prev_it->time;
  174. prev_it = it;
  175. ++it;
  176. }
  177. }
  178. // Greedily remove the best candidate, iteratively
  179. while (to_remove_count > 0) {
  180. assert(!pq.empty());
  181. // Remove the candidate with smallest gap
  182. auto rc = pq.top();
  183. pq.pop();
  184. // NOTE: priority_queue does not support updating an existing element,
  185. // but we can work around that because the gap tracked in pq is only
  186. // going to be better than actuality, and we can detect and adjust
  187. // when a better-than-actual gap is found.
  188. // Determine actual time gap if this entry is removed (zero entries are
  189. // marked for deletion)
  190. auto it = rc.it + 1;
  191. uint64_t after_time = it->time;
  192. while (after_time == kUnknownTimeBeforeAll) {
  193. assert(it != pairs_.end());
  194. ++it;
  195. after_time = it->time;
  196. }
  197. it = rc.it - 1;
  198. uint64_t before_time = it->time;
  199. while (before_time == kUnknownTimeBeforeAll) {
  200. assert(it != pairs_.begin());
  201. --it;
  202. before_time = it->time;
  203. }
  204. // Check whether the gap is still valid (or needs to be recomputed)
  205. if (rc.new_time_gap == after_time - before_time) {
  206. // Mark the entry as removed
  207. rc.it->time = kUnknownTimeBeforeAll;
  208. --to_remove_count;
  209. } else {
  210. // Insert a replacement up-to-date removal candidate
  211. pq.emplace(after_time - before_time, rc.it);
  212. }
  213. }
  214. // Collapse away entries marked for deletion
  215. auto from_it = pairs_.begin();
  216. auto to_it = from_it;
  217. for (; from_it != pairs_.end(); ++from_it) {
  218. if (from_it->time != kUnknownTimeBeforeAll) {
  219. if (from_it != to_it) {
  220. *to_it = *from_it;
  221. }
  222. ++to_it;
  223. }
  224. }
  225. // Erase slots freed up
  226. pairs_.erase(to_it, pairs_.end());
  227. assert(pairs_.size() == strict_cap);
  228. }
  229. bool SeqnoToTimeMapping::SeqnoTimePair::Merge(const SeqnoTimePair& other) {
  230. assert(seqno <= other.seqno);
  231. if (seqno == other.seqno) {
  232. // Favoring GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno
  233. // by keeping the older time. For example, consider nothing has been
  234. // written to the DB in some time.
  235. time = std::min(time, other.time);
  236. return true;
  237. } else if (time == other.time) {
  238. // Favoring GetProximalSeqnoBeforeTime over GetProximalTimeBeforeSeqno
  239. // by keeping the newer seqno. For example, when a burst of writes ages
  240. // out, we want the cutoff to be the newest seqno from that burst.
  241. seqno = std::max(seqno, other.seqno);
  242. return true;
  243. } else if (time > other.time) {
  244. assert(seqno < other.seqno);
  245. // Need to resolve an inconsistency (clock drift? very rough time?).
  246. // Given the direction that entries are supposed to err, trust the earlier
  247. // time entry as more reliable, and this choice ensures we don't
  248. // accidentally throw out an entry within our time span.
  249. *this = other;
  250. return true;
  251. } else {
  252. // Not merged
  253. return false;
  254. }
  255. }
  256. void SeqnoToTimeMapping::SortAndMerge() {
  257. assert(!enforced_);
  258. if (!pairs_.empty()) {
  259. std::sort(pairs_.begin(), pairs_.end());
  260. auto from_it = pairs_.begin();
  261. auto to_it = from_it;
  262. for (++from_it; from_it != pairs_.end(); ++from_it) {
  263. if (to_it->Merge(*from_it)) {
  264. // Merged with last entry
  265. } else {
  266. // Copy into next entry
  267. *++to_it = *from_it;
  268. }
  269. }
  270. // Erase slots freed up from merging
  271. pairs_.erase(to_it + 1, pairs_.end());
  272. }
  273. // Mark as "at least sorted"
  274. enforced_ = true;
  275. }
  276. SeqnoToTimeMapping& SeqnoToTimeMapping::SetMaxTimeSpan(uint64_t max_time_span) {
  277. max_time_span_ = max_time_span;
  278. if (enforced_) {
  279. EnforceMaxTimeSpan();
  280. }
  281. return *this;
  282. }
  283. SeqnoToTimeMapping& SeqnoToTimeMapping::SetCapacity(uint64_t capacity) {
  284. capacity_ = capacity;
  285. if (enforced_) {
  286. EnforceCapacity(/*strict=*/true);
  287. }
  288. return *this;
  289. }
  290. SeqnoToTimeMapping& SeqnoToTimeMapping::Enforce(uint64_t now) {
  291. if (!enforced_) {
  292. SortAndMerge();
  293. assert(enforced_);
  294. EnforceMaxTimeSpan(now);
  295. } else if (now > 0) {
  296. EnforceMaxTimeSpan(now);
  297. }
  298. EnforceCapacity(/*strict=*/true);
  299. return *this;
  300. }
  301. void SeqnoToTimeMapping::AddUnenforced(SequenceNumber seqno, uint64_t time) {
  302. if (seqno == 0) {
  303. return;
  304. }
  305. enforced_ = false;
  306. pairs_.emplace_back(seqno, time);
  307. }
  308. // The encoded format is:
  309. // [num_of_entries][[seqno][time],[seqno][time],...]
  310. // ^ ^
  311. // var_int delta_encoded (var_int)
  312. // Except empty string is used for empty mapping. This means the encoding
  313. // doesn't fully form a prefix code, but that is OK for applications like
  314. // TableProperties.
  315. void SeqnoToTimeMapping::EncodeTo(std::string& dest) const {
  316. assert(enforced_);
  317. // Can use empty string for empty mapping
  318. if (pairs_.empty()) {
  319. return;
  320. }
  321. // Encode number of entries
  322. PutVarint64(&dest, pairs_.size());
  323. SeqnoTimePair base;
  324. for (auto& cur : pairs_) {
  325. assert(base < cur);
  326. // Delta encode each entry
  327. SeqnoTimePair val = cur.ComputeDelta(base);
  328. base = cur;
  329. val.Encode(dest);
  330. }
  331. }
  332. namespace {
  333. Status DecodeImpl(Slice& input,
  334. std::deque<SeqnoToTimeMapping::SeqnoTimePair>& pairs) {
  335. if (input.empty()) {
  336. return Status::OK();
  337. }
  338. uint64_t count;
  339. if (!GetVarint64(&input, &count)) {
  340. return Status::Corruption("Invalid sequence number time size");
  341. }
  342. SeqnoToTimeMapping::SeqnoTimePair base;
  343. for (uint64_t i = 0; i < count; i++) {
  344. SeqnoToTimeMapping::SeqnoTimePair val;
  345. Status s = val.Decode(input);
  346. if (!s.ok()) {
  347. return s;
  348. }
  349. val.ApplyDelta(base);
  350. pairs.emplace_back(val);
  351. base = val;
  352. }
  353. if (!input.empty()) {
  354. return Status::Corruption(
  355. "Extra bytes at end of sequence number time mapping");
  356. }
  357. return Status::OK();
  358. }
  359. } // namespace
  360. Status SeqnoToTimeMapping::DecodeFrom(const std::string& pairs_str) {
  361. size_t orig_size = pairs_.size();
  362. Slice input(pairs_str);
  363. Status s = DecodeImpl(input, pairs_);
  364. if (!s.ok()) {
  365. // Roll back in case of corrupted data
  366. pairs_.resize(orig_size);
  367. } else if (orig_size > 0 || max_time_span_ < UINT64_MAX ||
  368. capacity_ < UINT64_MAX) {
  369. enforced_ = false;
  370. }
  371. return s;
  372. }
  373. void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
  374. PutVarint64Varint64(&dest, seqno, time);
  375. }
  376. Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
  377. if (!GetVarint64(&input, &seqno)) {
  378. return Status::Corruption("Invalid sequence number");
  379. }
  380. if (!GetVarint64(&input, &time)) {
  381. return Status::Corruption("Invalid time");
  382. }
  383. return Status::OK();
  384. }
  385. void SeqnoToTimeMapping::CopyFromSeqnoRange(const SeqnoToTimeMapping& src,
  386. SequenceNumber from_seqno,
  387. SequenceNumber to_seqno) {
  388. bool orig_empty = Empty();
  389. auto src_it = src.FindGreaterEqSeqno(from_seqno);
  390. // Allow nonsensical ranges like [1000, 0] which might show up e.g. for
  391. // an SST file with no entries.
  392. auto src_it_end =
  393. to_seqno < from_seqno ? src_it : src.FindGreaterSeqno(to_seqno);
  394. // To best answer GetProximalTimeBeforeSeqno(from_seqno) we need an entry
  395. // with a seqno before that (if available)
  396. if (src_it != src.pairs_.begin()) {
  397. --src_it;
  398. }
  399. assert(src_it <= src_it_end);
  400. std::copy(src_it, src_it_end, std::back_inserter(pairs_));
  401. if (!orig_empty || max_time_span_ < UINT64_MAX || capacity_ < UINT64_MAX) {
  402. enforced_ = false;
  403. }
  404. }
  405. bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
  406. if (capacity_ == 0) {
  407. return false;
  408. }
  409. bool added = false;
  410. if (seqno == 0) {
  411. // skip seq number 0, which may have special meaning, like zeroed out data
  412. // TODO: consider changing?
  413. } else if (pairs_.empty()) {
  414. enforced_ = true;
  415. pairs_.emplace_back(seqno, time);
  416. // skip normal enforced check below
  417. return true;
  418. } else {
  419. auto& last = pairs_.back();
  420. // We can attempt to merge with the last entry if the new entry sorts with
  421. // it.
  422. if (last.seqno <= seqno) {
  423. bool merged = last.Merge({seqno, time});
  424. if (!merged) {
  425. if (enforced_ && (seqno <= last.seqno || time <= last.time)) {
  426. // Out of order append should not happen, except in case of clock
  427. // reset
  428. assert(false);
  429. } else {
  430. pairs_.emplace_back(seqno, time);
  431. added = true;
  432. }
  433. }
  434. } else if (!enforced_) {
  435. // Treat like AddUnenforced and fix up below
  436. pairs_.emplace_back(seqno, time);
  437. added = true;
  438. } else {
  439. // Out of order append attempted
  440. assert(false);
  441. }
  442. }
  443. // Similar to Enforce() but not quite
  444. if (!enforced_) {
  445. SortAndMerge();
  446. assert(enforced_);
  447. }
  448. EnforceMaxTimeSpan();
  449. EnforceCapacity(/*strict=*/false);
  450. return added;
  451. }
  452. void SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
  453. SequenceNumber to_seqno,
  454. uint64_t from_time, uint64_t to_time) {
  455. assert(Empty());
  456. assert(from_seqno > 0);
  457. assert(to_seqno > from_seqno);
  458. assert(from_time > kUnknownTimeBeforeAll);
  459. assert(to_time >= from_time);
  460. // TODO: smartly limit this to max_capacity_ representative samples
  461. for (auto i = from_seqno; i <= to_seqno; i++) {
  462. uint64_t t = from_time + (to_time - from_time) * (i - from_seqno) /
  463. (to_seqno - from_seqno);
  464. pairs_.emplace_back(i, t);
  465. }
  466. }
  467. std::string SeqnoToTimeMapping::ToHumanString() const {
  468. std::string ret;
  469. for (const auto& seq_time : pairs_) {
  470. AppendNumberTo(&ret, seq_time.seqno);
  471. ret.append("->");
  472. AppendNumberTo(&ret, seq_time.time);
  473. ret.append(",");
  474. }
  475. return ret;
  476. }
  477. Slice PackValueAndWriteTime(const Slice& value, uint64_t unix_write_time,
  478. std::string* buf) {
  479. buf->assign(value.data(), value.size());
  480. PutFixed64(buf, unix_write_time);
  481. return Slice(*buf);
  482. }
  483. Slice PackValueAndSeqno(const Slice& value, SequenceNumber seqno,
  484. std::string* buf) {
  485. buf->assign(value.data(), value.size());
  486. PutFixed64(buf, seqno);
  487. return Slice(*buf);
  488. }
  489. uint64_t ParsePackedValueForWriteTime(const Slice& value) {
  490. assert(value.size() >= sizeof(uint64_t));
  491. Slice write_time_slice(value.data() + value.size() - sizeof(uint64_t),
  492. sizeof(uint64_t));
  493. uint64_t write_time;
  494. [[maybe_unused]] auto res = GetFixed64(&write_time_slice, &write_time);
  495. assert(res);
  496. return write_time;
  497. }
  498. std::tuple<Slice, uint64_t> ParsePackedValueWithWriteTime(const Slice& value) {
  499. return std::make_tuple(Slice(value.data(), value.size() - sizeof(uint64_t)),
  500. ParsePackedValueForWriteTime(value));
  501. }
  502. SequenceNumber ParsePackedValueForSeqno(const Slice& value) {
  503. assert(value.size() >= sizeof(SequenceNumber));
  504. Slice seqno_slice(value.data() + value.size() - sizeof(uint64_t),
  505. sizeof(uint64_t));
  506. SequenceNumber seqno;
  507. [[maybe_unused]] auto res = GetFixed64(&seqno_slice, &seqno);
  508. assert(res);
  509. return seqno;
  510. }
  511. std::tuple<Slice, SequenceNumber> ParsePackedValueWithSeqno(
  512. const Slice& value) {
  513. return std::make_tuple(
  514. Slice(value.data(), value.size() - sizeof(SequenceNumber)),
  515. ParsePackedValueForSeqno(value));
  516. }
  517. Slice ParsePackedValueForValue(const Slice& value) {
  518. assert(value.size() >= sizeof(uint64_t));
  519. return Slice(value.data(), value.size() - sizeof(uint64_t));
  520. }
  521. } // namespace ROCKSDB_NAMESPACE