range_tombstone_fragmenter.cc 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. // Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "db/range_tombstone_fragmenter.h"
  6. #include <algorithm>
  7. #include <functional>
  8. #include <set>
  9. #include <stdio.h>
  10. #include <cinttypes>
  11. #include "util/autovector.h"
  12. #include "util/kv_map.h"
  13. #include "util/vector_iterator.h"
  14. namespace ROCKSDB_NAMESPACE {
// Builds a fragmented range tombstone list from a stream of unfragmented
// tombstones. If the input iterator already yields tombstones sorted by
// internal key, they are fragmented directly; otherwise all tombstones are
// copied out, sorted (by VectorIterator), and then fragmented.
FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
    std::unique_ptr<InternalIterator> unfragmented_tombstones,
    const InternalKeyComparator& icmp, bool for_compaction,
    const std::vector<SequenceNumber>& snapshots) {
  if (unfragmented_tombstones == nullptr) {
    return;
  }
  bool is_sorted = true;
  int num_tombstones = 0;
  InternalKey pinned_last_start_key;
  Slice last_start_key;
  // First pass: check whether the tombstones arrive sorted by start
  // (internal) key, counting them as we go so we can reserve space below.
  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
       unfragmented_tombstones->Next(), num_tombstones++) {
    if (num_tombstones > 0 &&
        icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
      is_sorted = false;
      break;
    }
    if (unfragmented_tombstones->IsKeyPinned()) {
      last_start_key = unfragmented_tombstones->key();
    } else {
      // The key slice may be invalidated by Next(); keep a pinned copy so the
      // comparison above remains valid on the following iteration.
      pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
      last_start_key = pinned_last_start_key.Encode();
    }
  }
  if (is_sorted) {
    FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
                       snapshots);
    return;
  }

  // Sort the tombstones before fragmenting them.
  std::vector<std::string> keys, values;
  keys.reserve(num_tombstones);
  values.reserve(num_tombstones);
  // Second pass: copy every key/value out of the source iterator so the
  // sorted copies own their data independently of the iterator's lifetime.
  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
       unfragmented_tombstones->Next()) {
    keys.emplace_back(unfragmented_tombstones->key().data(),
                      unfragmented_tombstones->key().size());
    values.emplace_back(unfragmented_tombstones->value().data(),
                        unfragmented_tombstones->value().size());
  }
  // VectorIterator implicitly sorts by key during construction.
  auto iter = std::unique_ptr<VectorIterator>(
      new VectorIterator(std::move(keys), std::move(values), &icmp));
  FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
}
// Splits the sorted unfragmented tombstones into non-overlapping fragments
// stored in tombstones_, with each fragment's sequence numbers stored (in
// descending order) in a contiguous slice of tombstone_seqs_. When
// for_compaction is true, seqnums not visible to any snapshot are dropped;
// otherwise all seqnums are preserved for reads.
void FragmentedRangeTombstoneList::FragmentTombstones(
    std::unique_ptr<InternalIterator> unfragmented_tombstones,
    const InternalKeyComparator& icmp, bool for_compaction,
    const std::vector<SequenceNumber>& snapshots) {
  Slice cur_start_key(nullptr, 0);
  auto cmp = ParsedInternalKeyComparator(&icmp);

  // Stores the end keys and sequence numbers of range tombstones with a start
  // key less than or equal to cur_start_key. Provides an ordering by end key
  // for use in flush_current_tombstones.
  std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);

  // Given the next start key in unfragmented_tombstones,
  // flush_current_tombstones writes every tombstone fragment that starts
  // and ends with a key before next_start_key, and starts with a key greater
  // than or equal to cur_start_key.
  auto flush_current_tombstones = [&](const Slice& next_start_key) {
    auto it = cur_end_keys.begin();
    bool reached_next_start_key = false;
    for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
      Slice cur_end_key = it->user_key;
      if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
        // Empty tombstone.
        continue;
      }
      if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
        // All of the end keys in [it, cur_end_keys.end()) are after
        // next_start_key, so the tombstones they represent can be used in
        // fragments that start with keys greater than or equal to
        // next_start_key. However, the end keys we already passed will not be
        // used in any more tombstone fragments.
        //
        // Remove the fully fragmented tombstones and stop iteration after a
        // final round of flushing to preserve the tombstones we can create more
        // fragments from.
        reached_next_start_key = true;
        cur_end_keys.erase(cur_end_keys.begin(), it);
        cur_end_key = next_start_key;
      }

      // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
      // should not overlap with the last-flushed tombstone fragment.
      assert(tombstones_.empty() ||
             icmp.user_comparator()->Compare(tombstones_.back().end_key,
                                             cur_start_key) <= 0);

      // Sort the sequence numbers of the tombstones being fragmented in
      // descending order, and then flush them in that order.
      autovector<SequenceNumber> seqnums_to_flush;
      for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
        seqnums_to_flush.push_back(flush_it->sequence);
      }
      std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
                std::greater<SequenceNumber>());

      // The fragment's seqnums occupy [start_idx, end_idx) in tombstone_seqs_;
      // because start_idx is always the current size, adjacent fragments'
      // ranges are contiguous.
      size_t start_idx = tombstone_seqs_.size();
      size_t end_idx = start_idx + seqnums_to_flush.size();

      if (for_compaction) {
        // Drop all tombstone seqnums that are not preserved by a snapshot.
        SequenceNumber next_snapshot = kMaxSequenceNumber;
        for (auto seq : seqnums_to_flush) {
          if (seq <= next_snapshot) {
            // This seqnum is visible by a lower snapshot.
            tombstone_seqs_.push_back(seq);
            seq_set_.insert(seq);
            auto upper_bound_it =
                std::lower_bound(snapshots.begin(), snapshots.end(), seq);
            if (upper_bound_it == snapshots.begin()) {
              // This seqnum is the topmost one visible by the earliest
              // snapshot. None of the seqnums below it will be visible, so we
              // can skip them.
              break;
            }
            next_snapshot = *std::prev(upper_bound_it);
          }
        }
        // Fewer seqnums may have been kept than planned; shrink the range.
        end_idx = tombstone_seqs_.size();
      } else {
        // The fragmentation is being done for reads, so preserve all seqnums.
        tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
                               seqnums_to_flush.end());
        seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end());
      }

      assert(start_idx < end_idx);
      tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx);

      cur_start_key = cur_end_key;
    }
    if (!reached_next_start_key) {
      // There is a gap between the last flushed tombstone fragment and
      // the next tombstone's start key. Remove all the end keys in
      // the working set, since we have fully fragmented their corresponding
      // tombstones.
      cur_end_keys.clear();
    }
    cur_start_key = next_start_key;
  };

  pinned_iters_mgr_.StartPinning();

  bool no_tombstones = true;
  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
       unfragmented_tombstones->Next()) {
    const Slice& ikey = unfragmented_tombstones->key();
    Slice tombstone_start_key = ExtractUserKey(ikey);
    SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
    if (!unfragmented_tombstones->IsKeyPinned()) {
      // Copy unpinned keys into pinned_slices_ so the stored fragments can
      // safely reference them after the iterator advances.
      pinned_slices_.emplace_back(tombstone_start_key.data(),
                                  tombstone_start_key.size());
      tombstone_start_key = pinned_slices_.back();
    }
    no_tombstones = false;

    Slice tombstone_end_key = unfragmented_tombstones->value();
    if (!unfragmented_tombstones->IsValuePinned()) {
      pinned_slices_.emplace_back(tombstone_end_key.data(),
                                  tombstone_end_key.size());
      tombstone_end_key = pinned_slices_.back();
    }
    if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
                                     cur_start_key, tombstone_start_key) != 0) {
      // The start key has changed. Flush all tombstones that start before
      // this new start key.
      flush_current_tombstones(tombstone_start_key);
    }
    cur_start_key = tombstone_start_key;

    cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
  }
  if (!cur_end_keys.empty()) {
    // Flush the remaining working set using the largest end key as the
    // final boundary.
    ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
    flush_current_tombstones(last_end_key.user_key);
  }

  if (!no_tombstones) {
    // Keep the source iterator alive: pinned fragments may still reference
    // memory it owns.
    pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
                                  false /* arena */);
  }
}
  189. bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
  190. SequenceNumber upper) const {
  191. auto seq_it = seq_set_.lower_bound(lower);
  192. return seq_it != seq_set_.end() && *seq_it <= upper;
  193. }
// Constructs an iterator over `tombstones` without taking ownership; the
// caller must guarantee the list outlives this iterator. Only fragments with
// seqnums in [_lower_bound, _upper_bound] are visible through it.
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
    const FragmentedRangeTombstoneList* tombstones,
    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
    SequenceNumber _lower_bound)
    : tombstone_start_cmp_(icmp.user_comparator()),
      tombstone_end_cmp_(icmp.user_comparator()),
      icmp_(&icmp),
      ucmp_(icmp.user_comparator()),
      tombstones_(tombstones),
      upper_bound_(_upper_bound),
      lower_bound_(_lower_bound) {
  assert(tombstones_ != nullptr);
  // Start unpositioned; callers must call a Seek* method before reading.
  Invalidate();
}
// Same as the raw-pointer constructor, but shares ownership of the tombstone
// list via tombstones_ref_, keeping it alive for this iterator's lifetime.
FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
    const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
    SequenceNumber _lower_bound)
    : tombstone_start_cmp_(icmp.user_comparator()),
      tombstone_end_cmp_(icmp.user_comparator()),
      icmp_(&icmp),
      ucmp_(icmp.user_comparator()),
      tombstones_ref_(tombstones),
      tombstones_(tombstones_ref_.get()),
      upper_bound_(_upper_bound),
      lower_bound_(_lower_bound) {
  assert(tombstones_ != nullptr);
  // Start unpositioned; callers must call a Seek* method before reading.
  Invalidate();
}
  223. void FragmentedRangeTombstoneIterator::SeekToFirst() {
  224. pos_ = tombstones_->begin();
  225. seq_pos_ = tombstones_->seq_begin();
  226. }
  227. void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
  228. if (tombstones_->empty()) {
  229. Invalidate();
  230. return;
  231. }
  232. pos_ = tombstones_->begin();
  233. seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
  234. tombstones_->seq_iter(pos_->seq_end_idx),
  235. upper_bound_, std::greater<SequenceNumber>());
  236. ScanForwardToVisibleTombstone();
  237. }
  238. void FragmentedRangeTombstoneIterator::SeekToLast() {
  239. pos_ = std::prev(tombstones_->end());
  240. seq_pos_ = std::prev(tombstones_->seq_end());
  241. }
  242. void FragmentedRangeTombstoneIterator::SeekToTopLast() {
  243. if (tombstones_->empty()) {
  244. Invalidate();
  245. return;
  246. }
  247. pos_ = std::prev(tombstones_->end());
  248. seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
  249. tombstones_->seq_iter(pos_->seq_end_idx),
  250. upper_bound_, std::greater<SequenceNumber>());
  251. ScanBackwardToVisibleTombstone();
  252. }
  253. void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
  254. if (tombstones_->empty()) {
  255. Invalidate();
  256. return;
  257. }
  258. SeekToCoveringTombstone(target);
  259. ScanForwardToVisibleTombstone();
  260. }
  261. void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
  262. if (tombstones_->empty()) {
  263. Invalidate();
  264. return;
  265. }
  266. SeekForPrevToCoveringTombstone(target);
  267. ScanBackwardToVisibleTombstone();
  268. }
// Positions pos_ at the first fragment whose end key is after `target` --
// the only candidate that could cover target -- and seq_pos_ at that
// fragment's topmost seqnum visible at upper_bound_. Note the chosen fragment
// may still start after target; callers check that separately.
void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
    const Slice& target) {
  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
                          tombstone_end_cmp_);
  if (pos_ == tombstones_->end()) {
    // All tombstones end before target.
    seq_pos_ = tombstones_->seq_end();
    return;
  }
  // Seqnums are stored in descending order within the fragment, so a
  // lower_bound with greater<> yields the largest seqnum <= upper_bound_.
  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                              tombstones_->seq_iter(pos_->seq_end_idx),
                              upper_bound_, std::greater<SequenceNumber>());
}
// Positions pos_ at the last fragment whose start key is at or before
// `target`, and seq_pos_ at that fragment's topmost seqnum visible at
// upper_bound_. Invalidates the iterator if every fragment starts after
// target.
void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
    const Slice& target) {
  if (tombstones_->empty()) {
    Invalidate();
    return;
  }
  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
                          tombstone_start_cmp_);
  if (pos_ == tombstones_->begin()) {
    // All tombstones start after target.
    Invalidate();
    return;
  }
  // Step back to the last fragment starting at or before target.
  --pos_;
  // Seqnums are stored in descending order within the fragment, so a
  // lower_bound with greater<> yields the largest seqnum <= upper_bound_.
  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                              tombstones_->seq_iter(pos_->seq_end_idx),
                              upper_bound_, std::greater<SequenceNumber>());
}
// Advances pos_ forward until it points at a fragment with a seqnum visible
// in [lower_bound_, upper_bound_], invalidating the iterator if none exists.
// A fragment has no visible seqnum when seq_pos_ ran off its seqnum range
// (nothing <= upper_bound_) or the best candidate is below lower_bound_.
void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
  while (pos_ != tombstones_->end() &&
         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
          *seq_pos_ < lower_bound_)) {
    ++pos_;
    if (pos_ == tombstones_->end()) {
      Invalidate();
      return;
    }
    // Topmost seqnum of the new fragment visible at upper_bound_ (seqnums
    // are stored in descending order within a fragment).
    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                                tombstones_->seq_iter(pos_->seq_end_idx),
                                upper_bound_, std::greater<SequenceNumber>());
  }
}
// Moves pos_ backward until it points at a fragment with a seqnum visible in
// [lower_bound_, upper_bound_], invalidating the iterator if none exists.
// The pos_ != end() check also serves as the loop's "already invalid" guard.
void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
  while (pos_ != tombstones_->end() &&
         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
          *seq_pos_ < lower_bound_)) {
    if (pos_ == tombstones_->begin()) {
      Invalidate();
      return;
    }
    --pos_;
    // Topmost seqnum of the new fragment visible at upper_bound_ (seqnums
    // are stored in descending order within a fragment).
    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                                tombstones_->seq_iter(pos_->seq_end_idx),
                                upper_bound_, std::greater<SequenceNumber>());
  }
}
// Advances to the next (fragment, seqnum) pair, ignoring visibility bounds.
// Adjacent fragments' seqnum ranges are contiguous in tombstone_seqs_ (each
// fragment's start_idx is the previous end_idx), so when seq_pos_ steps past
// the current fragment's range it already points at the next fragment's
// first seqnum; only pos_ needs to advance.
void FragmentedRangeTombstoneIterator::Next() {
  ++seq_pos_;
  if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
    ++pos_;
  }
}
  334. void FragmentedRangeTombstoneIterator::TopNext() {
  335. ++pos_;
  336. if (pos_ == tombstones_->end()) {
  337. return;
  338. }
  339. seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
  340. tombstones_->seq_iter(pos_->seq_end_idx),
  341. upper_bound_, std::greater<SequenceNumber>());
  342. ScanForwardToVisibleTombstone();
  343. }
// Steps back to the previous (fragment, seqnum) pair, ignoring visibility
// bounds; invalidates the iterator when already at the first seqnum.
void FragmentedRangeTombstoneIterator::Prev() {
  if (seq_pos_ == tombstones_->seq_begin()) {
    Invalidate();
    return;
  }
  --seq_pos_;
  // Because fragments' seqnum ranges are contiguous in tombstone_seqs_,
  // index (seq_start_idx - 1) is the previous fragment's last seqnum; if we
  // just crossed onto it (or pos_ was at end()), move pos_ back one fragment.
  if (pos_ == tombstones_->end() ||
      seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
    --pos_;
  }
}
// Steps back to the previous fragment, selecting its topmost seqnum visible
// at upper_bound_, then scans backward past fragments with no visible seqnum.
void FragmentedRangeTombstoneIterator::TopPrev() {
  if (pos_ == tombstones_->begin()) {
    Invalidate();
    return;
  }
  --pos_;
  // Seqnums are stored in descending order within the fragment, so a
  // lower_bound with greater<> yields the largest seqnum <= upper_bound_.
  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
                              tombstones_->seq_iter(pos_->seq_end_idx),
                              upper_bound_, std::greater<SequenceNumber>());
  ScanBackwardToVisibleTombstone();
}
  366. bool FragmentedRangeTombstoneIterator::Valid() const {
  367. return tombstones_ != nullptr && pos_ != tombstones_->end();
  368. }
  369. SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
  370. const Slice& target_user_key) {
  371. SeekToCoveringTombstone(target_user_key);
  372. return ValidPos() && ucmp_->Compare(start_key(), target_user_key) <= 0 ? seq()
  373. : 0;
  374. }
  375. std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
  376. FragmentedRangeTombstoneIterator::SplitBySnapshot(
  377. const std::vector<SequenceNumber>& snapshots) {
  378. std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
  379. splits;
  380. SequenceNumber lower = 0;
  381. SequenceNumber upper;
  382. for (size_t i = 0; i <= snapshots.size(); i++) {
  383. if (i >= snapshots.size()) {
  384. upper = kMaxSequenceNumber;
  385. } else {
  386. upper = snapshots[i];
  387. }
  388. if (tombstones_->ContainsRange(lower, upper)) {
  389. splits.emplace(upper, std::unique_ptr<FragmentedRangeTombstoneIterator>(
  390. new FragmentedRangeTombstoneIterator(
  391. tombstones_, *icmp_, upper, lower)));
  392. }
  393. lower = upper + 1;
  394. }
  395. return splits;
  396. }
  397. } // namespace ROCKSDB_NAMESPACE