db_iter.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <stdint.h>
  11. #include <string>
  12. #include "db/db_impl/db_impl.h"
  13. #include "db/dbformat.h"
  14. #include "db/range_del_aggregator.h"
  15. #include "memory/arena.h"
  16. #include "options/cf_options.h"
  17. #include "rocksdb/db.h"
  18. #include "rocksdb/iterator.h"
  19. #include "table/iterator_wrapper.h"
  20. #include "util/autovector.h"
  21. namespace ROCKSDB_NAMESPACE {
  22. // This file declares the factory functions of DBIter, in its original form
  23. // or a wrapped form with class ArenaWrappedDBIter, which is defined here.
  24. // Class DBIter, which is declared and implemented inside db_iter.cc, is
  25. // an iterator that converts internal keys (yielded by an InternalIterator)
  26. // that were live at the specified sequence number into appropriate user
  27. // keys.
  28. // Each internal key consists of a user key, a sequence number, and a value
  29. // type. DBIter deals with multiple key versions, tombstones, merge operands,
  30. // etc, and exposes an Iterator.
  31. // For example, DBIter may wrap following InternalIterator:
  32. // user key: AAA value: v3 seqno: 100 type: Put
  33. // user key: AAA value: v2 seqno: 97 type: Put
  34. // user key: AAA value: v1 seqno: 95 type: Put
  35. // user key: BBB value: v1 seqno: 90 type: Put
  36. // user key: BBC value: N/A seqno: 98 type: Delete
  37. // user key: BBC value: v1 seqno: 95 type: Put
  38. // If the snapshot passed in is 102, then the DBIter is expected to
  39. // expose the following iterator:
  40. // key: AAA value: v3
  41. // key: BBB value: v1
  42. // If the snapshot passed in is 96, then it should expose:
  43. // key: AAA value: v1
  44. // key: BBB value: v1
  45. // key: BBC value: v1
  46. //
  47. // Memtables and sstables that make the DB representation contain
  48. // (userkey,seq,type) => uservalue entries. DBIter
  49. // combines multiple entries for the same userkey found in the DB
  50. // representation into a single entry while accounting for sequence
  51. // numbers, deletion markers, overwrites, etc.
  52. class DBIter final : public Iterator {
  53. public:
  54. // The following is grossly complicated. TODO: clean it up
  55. // Which direction is the iterator currently moving?
  56. // (1) When moving forward:
  57. // (1a) if current_entry_is_merged_ = false, the internal iterator is
  58. // positioned at the exact entry that yields this->key(), this->value()
  59. // (1b) if current_entry_is_merged_ = true, the internal iterator is
  60. // positioned immediately after the last entry that contributed to the
  61. // current this->value(). That entry may or may not have key equal to
  62. // this->key().
  63. // (2) When moving backwards, the internal iterator is positioned
  64. // just before all entries whose user key == this->key().
  65. enum Direction { kForward, kReverse };
  66. // LocalStatistics contain Statistics counters that will be aggregated per
  67. // each iterator instance and then will be sent to the global statistics when
  68. // the iterator is destroyed.
  69. //
  70. // The purpose of this approach is to avoid perf regression happening
  71. // when multiple threads bump the atomic counters from a DBIter::Next().
  72. struct LocalStatistics {
  73. explicit LocalStatistics() { ResetCounters(); }
  74. void ResetCounters() {
  75. next_count_ = 0;
  76. next_found_count_ = 0;
  77. prev_count_ = 0;
  78. prev_found_count_ = 0;
  79. bytes_read_ = 0;
  80. skip_count_ = 0;
  81. }
  82. void BumpGlobalStatistics(Statistics* global_statistics) {
  83. RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
  84. RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
  85. RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
  86. RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
  87. RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
  88. RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
  89. PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
  90. ResetCounters();
  91. }
  92. // Map to Tickers::NUMBER_DB_NEXT
  93. uint64_t next_count_;
  94. // Map to Tickers::NUMBER_DB_NEXT_FOUND
  95. uint64_t next_found_count_;
  96. // Map to Tickers::NUMBER_DB_PREV
  97. uint64_t prev_count_;
  98. // Map to Tickers::NUMBER_DB_PREV_FOUND
  99. uint64_t prev_found_count_;
  100. // Map to Tickers::ITER_BYTES_READ
  101. uint64_t bytes_read_;
  102. // Map to Tickers::NUMBER_ITER_SKIP
  103. uint64_t skip_count_;
  104. };
  105. DBIter(Env* _env, const ReadOptions& read_options,
  106. const ImmutableCFOptions& cf_options,
  107. const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
  108. InternalIterator* iter, SequenceNumber s, bool arena_mode,
  109. uint64_t max_sequential_skip_in_iterations,
  110. ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
  111. bool allow_blob);
  112. // No copying allowed
  113. DBIter(const DBIter&) = delete;
  114. void operator=(const DBIter&) = delete;
  115. ~DBIter() override {
  116. // Release pinned data if any
  117. if (pinned_iters_mgr_.PinningEnabled()) {
  118. pinned_iters_mgr_.ReleasePinnedData();
  119. }
  120. RecordTick(statistics_, NO_ITERATOR_DELETED);
  121. ResetInternalKeysSkippedCounter();
  122. local_stats_.BumpGlobalStatistics(statistics_);
  123. iter_.DeleteIter(arena_mode_);
  124. }
  125. void SetIter(InternalIterator* iter) {
  126. assert(iter_.iter() == nullptr);
  127. iter_.Set(iter);
  128. iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
  129. }
  130. ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; }
  131. bool Valid() const override { return valid_; }
  132. Slice key() const override {
  133. assert(valid_);
  134. if (start_seqnum_ > 0) {
  135. return saved_key_.GetInternalKey();
  136. } else {
  137. return saved_key_.GetUserKey();
  138. }
  139. }
  140. Slice value() const override {
  141. assert(valid_);
  142. if (current_entry_is_merged_) {
  143. // If pinned_value_ is set then the result of merge operator is one of
  144. // the merge operands and we should return it.
  145. return pinned_value_.data() ? pinned_value_ : saved_value_;
  146. } else if (direction_ == kReverse) {
  147. return pinned_value_;
  148. } else {
  149. return iter_.value();
  150. }
  151. }
  152. Status status() const override {
  153. if (status_.ok()) {
  154. return iter_.status();
  155. } else {
  156. assert(!valid_);
  157. return status_;
  158. }
  159. }
  160. bool IsBlob() const {
  161. assert(valid_ && (allow_blob_ || !is_blob_));
  162. return is_blob_;
  163. }
  164. Status GetProperty(std::string prop_name, std::string* prop) override;
  165. void Next() final override;
  166. void Prev() final override;
  167. void Seek(const Slice& target) final override;
  168. void SeekForPrev(const Slice& target) final override;
  169. void SeekToFirst() final override;
  170. void SeekToLast() final override;
  171. Env* env() const { return env_; }
  172. void set_sequence(uint64_t s) {
  173. sequence_ = s;
  174. if (read_callback_) {
  175. read_callback_->Refresh(s);
  176. }
  177. }
  178. void set_valid(bool v) { valid_ = v; }
  179. private:
  180. // For all methods in this block:
  181. // PRE: iter_->Valid() && status_.ok()
  182. // Return false if there was an error, and status() is non-ok, valid_ = false;
  183. // in this case callers would usually stop what they were doing and return.
  184. bool ReverseToForward();
  185. bool ReverseToBackward();
  186. // Set saved_key_ to the seek key to target, with proper sequence number set.
  187. // It might get adjusted if the seek key is smaller than iterator lower bound.
  188. void SetSavedKeyToSeekTarget(const Slice& target);
  189. // Set saved_key_ to the seek key to target, with proper sequence number set.
  190. // It might get adjusted if the seek key is larger than iterator upper bound.
  191. void SetSavedKeyToSeekForPrevTarget(const Slice& target);
  192. bool FindValueForCurrentKey();
  193. bool FindValueForCurrentKeyUsingSeek();
  194. bool FindUserKeyBeforeSavedKey();
  195. // If `skipping_saved_key` is true, the function will keep iterating until it
  196. // finds a user key that is larger than `saved_key_`.
  197. // If `prefix` is not null, the iterator needs to stop when all keys for the
  198. // prefix are exhausted and the interator is set to invalid.
  199. bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
  200. // Internal implementation of FindNextUserEntry().
  201. bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
  202. bool ParseKey(ParsedInternalKey* key);
  203. bool MergeValuesNewToOld();
  204. // If prefix is not null, we need to set the iterator to invalid if no more
  205. // entry can be found within the prefix.
  206. void PrevInternal(const Slice* prefix);
  207. bool TooManyInternalKeysSkipped(bool increment = true);
  208. bool IsVisible(SequenceNumber sequence);
  209. // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
  210. // is called
  211. void TempPinData() {
  212. if (!pin_thru_lifetime_) {
  213. pinned_iters_mgr_.StartPinning();
  214. }
  215. }
  216. // Release blocks pinned by TempPinData()
  217. void ReleaseTempPinnedData() {
  218. if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
  219. pinned_iters_mgr_.ReleasePinnedData();
  220. }
  221. }
  222. inline void ClearSavedValue() {
  223. if (saved_value_.capacity() > 1048576) {
  224. std::string empty;
  225. swap(empty, saved_value_);
  226. } else {
  227. saved_value_.clear();
  228. }
  229. }
  230. inline void ResetInternalKeysSkippedCounter() {
  231. local_stats_.skip_count_ += num_internal_keys_skipped_;
  232. if (valid_) {
  233. local_stats_.skip_count_--;
  234. }
  235. num_internal_keys_skipped_ = 0;
  236. }
  237. bool expect_total_order_inner_iter() {
  238. assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
  239. return expect_total_order_inner_iter_;
  240. }
  241. const SliceTransform* prefix_extractor_;
  242. Env* const env_;
  243. Logger* logger_;
  244. UserComparatorWrapper user_comparator_;
  245. const MergeOperator* const merge_operator_;
  246. IteratorWrapper iter_;
  247. ReadCallback* read_callback_;
  248. // Max visible sequence number. It is normally the snapshot seq unless we have
  249. // uncommitted data in db as in WriteUnCommitted.
  250. SequenceNumber sequence_;
  251. IterKey saved_key_;
  252. // Reusable internal key data structure. This is only used inside one function
  253. // and should not be used across functions. Reusing this object can reduce
  254. // overhead of calling construction of the function if creating it each time.
  255. ParsedInternalKey ikey_;
  256. std::string saved_value_;
  257. Slice pinned_value_;
  258. // for prefix seek mode to support prev()
  259. Statistics* statistics_;
  260. uint64_t max_skip_;
  261. uint64_t max_skippable_internal_keys_;
  262. uint64_t num_internal_keys_skipped_;
  263. const Slice* iterate_lower_bound_;
  264. const Slice* iterate_upper_bound_;
  265. // The prefix of the seek key. It is only used when prefix_same_as_start_
  266. // is true and prefix extractor is not null. In Next() or Prev(), current keys
  267. // will be checked against this prefix, so that the iterator can be
  268. // invalidated if the keys in this prefix has been exhausted. Set it using
  269. // SetUserKey() and use it using GetUserKey().
  270. IterKey prefix_;
  271. Status status_;
  272. Direction direction_;
  273. bool valid_;
  274. bool current_entry_is_merged_;
  275. // True if we know that the current entry's seqnum is 0.
  276. // This information is used as that the next entry will be for another
  277. // user key.
  278. bool is_key_seqnum_zero_;
  279. const bool prefix_same_as_start_;
  280. // Means that we will pin all data blocks we read as long the Iterator
  281. // is not deleted, will be true if ReadOptions::pin_data is true
  282. const bool pin_thru_lifetime_;
  283. // Expect the inner iterator to maintain a total order.
  284. // prefix_extractor_ must be non-NULL if the value is false.
  285. const bool expect_total_order_inner_iter_;
  286. bool allow_blob_;
  287. bool is_blob_;
  288. bool arena_mode_;
  289. // List of operands for merge operator.
  290. MergeContext merge_context_;
  291. ReadRangeDelAggregator range_del_agg_;
  292. LocalStatistics local_stats_;
  293. PinnedIteratorsManager pinned_iters_mgr_;
  294. #ifdef ROCKSDB_LITE
  295. ROCKSDB_FIELD_UNUSED
  296. #endif
  297. DBImpl* db_impl_;
  298. #ifdef ROCKSDB_LITE
  299. ROCKSDB_FIELD_UNUSED
  300. #endif
  301. ColumnFamilyData* cfd_;
  302. // for diff snapshots we want the lower bound on the seqnum;
  303. // if this value > 0 iterator will return internal keys
  304. SequenceNumber start_seqnum_;
  305. };
  306. // Return a new iterator that converts internal keys (yielded by
  307. // "*internal_iter") that were live at the specified `sequence` number
  308. // into appropriate user keys.
  309. extern Iterator* NewDBIterator(
  310. Env* env, const ReadOptions& read_options,
  311. const ImmutableCFOptions& cf_options,
  312. const MutableCFOptions& mutable_cf_options,
  313. const Comparator* user_key_comparator, InternalIterator* internal_iter,
  314. const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
  315. ReadCallback* read_callback, DBImpl* db_impl = nullptr,
  316. ColumnFamilyData* cfd = nullptr, bool allow_blob = false);
  317. } // namespace ROCKSDB_NAMESPACE