multiget_context.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  6. #include <algorithm>
  7. #include <array>
  8. #include <string>
  9. #include "db/dbformat.h"
  10. #include "db/lookup_key.h"
  11. #include "db/merge_context.h"
  12. #include "rocksdb/env.h"
  13. #include "rocksdb/options.h"
  14. #include "rocksdb/statistics.h"
  15. #include "rocksdb/types.h"
  16. #include "util/async_file_reader.h"
  17. #include "util/autovector.h"
  18. #include "util/math.h"
  19. #include "util/single_thread_executor.h"
  20. namespace ROCKSDB_NAMESPACE {
  21. class GetContext;
  22. class PinnableWideColumns;
  23. struct KeyContext {
  24. const Slice* key;
  25. LookupKey* lkey;
  26. Slice ukey_with_ts;
  27. Slice ukey_without_ts;
  28. Slice ikey;
  29. ColumnFamilyHandle* column_family;
  30. Status* s;
  31. MergeContext merge_context;
  32. SequenceNumber max_covering_tombstone_seq;
  33. bool key_exists;
  34. bool is_blob_index;
  35. void* cb_arg;
  36. PinnableSlice* value;
  37. PinnableWideColumns* columns;
  38. std::string* timestamp;
  39. GetContext* get_context;
  40. KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key,
  41. PinnableSlice* val, PinnableWideColumns* cols, std::string* ts,
  42. Status* stat)
  43. : key(&user_key),
  44. lkey(nullptr),
  45. column_family(col_family),
  46. s(stat),
  47. max_covering_tombstone_seq(0),
  48. key_exists(false),
  49. is_blob_index(false),
  50. cb_arg(nullptr),
  51. value(val),
  52. columns(cols),
  53. timestamp(ts),
  54. get_context(nullptr) {}
  55. };
  56. // The MultiGetContext class is a container for the sorted list of keys that
  57. // we need to lookup in a batch. Its main purpose is to make batch execution
  58. // easier by allowing various stages of the MultiGet lookups to operate on
  59. // subsets of keys, potentially non-contiguous. In order to accomplish this,
  60. // it defines the following classes -
  61. //
  62. // MultiGetContext::Range
  63. // MultiGetContext::Range::Iterator
  64. // MultiGetContext::Range::IteratorWrapper
  65. //
  66. // Here is an example of how this can be used -
  67. //
  68. // {
  69. // MultiGetContext ctx(...);
  70. // MultiGetContext::Range range = ctx.GetMultiGetRange();
  71. //
  72. // // Iterate to determine some subset of the keys
  73. // MultiGetContext::Range::Iterator start = range.begin();
  74. // MultiGetContext::Range::Iterator end = ...;
  75. //
  76. // // Make a new range with a subset of keys
  77. // MultiGetContext::Range subrange(range, start, end);
  78. //
  79. // // Define an auxillary vector, if needed, to hold additional data for
  80. // // each key
  81. // std::array<Foo, MultiGetContext::MAX_BATCH_SIZE> aux;
  82. //
  83. // // Iterate over the subrange and the auxillary vector simultaneously
  84. // MultiGetContext::Range::Iterator iter = subrange.begin();
  85. // for (; iter != subrange.end(); ++iter) {
  86. // KeyContext& key = *iter;
  87. // Foo& aux_key = aux_iter[iter.index()];
  88. // ...
  89. // }
  90. // }
  91. class MultiGetContext {
  92. public:
  93. // Limit the number of keys in a batch to this number. Benchmarks show that
  94. // there is negligible benefit for batches exceeding this. Keeping this < 32
  95. // simplifies iteration, as well as reduces the amount of stack allocations
  96. // that need to be performed
  97. static const int MAX_BATCH_SIZE = 32;
  98. // A bitmask of at least MAX_BATCH_SIZE - 1 bits, so that
  99. // Mask{1} << MAX_BATCH_SIZE is well defined
  100. using Mask = uint64_t;
  101. static_assert(MAX_BATCH_SIZE < sizeof(Mask) * 8);
  102. MultiGetContext(autovector<KeyContext*, MAX_BATCH_SIZE>* sorted_keys,
  103. size_t begin, size_t num_keys, SequenceNumber snapshot,
  104. const ReadOptions& read_opts, FileSystem* fs,
  105. Statistics* stats)
  106. : num_keys_(num_keys),
  107. value_mask_(0),
  108. value_size_(0),
  109. lookup_key_ptr_(reinterpret_cast<LookupKey*>(lookup_key_stack_buf))
  110. #if USE_COROUTINES
  111. ,
  112. reader_(fs, stats),
  113. executor_(reader_)
  114. #endif // USE_COROUTINES
  115. {
  116. (void)fs;
  117. (void)stats;
  118. assert(num_keys <= MAX_BATCH_SIZE);
  119. if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) {
  120. lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]);
  121. lookup_key_ptr_ = reinterpret_cast<LookupKey*>(lookup_key_heap_buf.get());
  122. }
  123. for (size_t iter = 0;
  124. iter < num_keys_ && /* suppress a warning */ iter < MAX_BATCH_SIZE;
  125. ++iter) {
  126. // autovector may not be contiguous storage, so make a copy
  127. sorted_keys_[iter] = (*sorted_keys)[begin + iter];
  128. sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter])
  129. LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp);
  130. sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key();
  131. sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey(
  132. sorted_keys_[iter]->lkey->user_key(),
  133. read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size());
  134. sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key();
  135. sorted_keys_[iter]->timestamp = (*sorted_keys)[begin + iter]->timestamp;
  136. sorted_keys_[iter]->get_context =
  137. (*sorted_keys)[begin + iter]->get_context;
  138. }
  139. }
  140. ~MultiGetContext() {
  141. for (size_t i = 0; i < num_keys_; ++i) {
  142. lookup_key_ptr_[i].~LookupKey();
  143. }
  144. }
  145. #if USE_COROUTINES
  146. SingleThreadExecutor& executor() { return executor_; }
  147. AsyncFileReader& reader() { return reader_; }
  148. #endif // USE_COROUTINES
  149. private:
  150. static const int MAX_LOOKUP_KEYS_ON_STACK = 16;
  151. alignas(
  152. alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) *
  153. MAX_LOOKUP_KEYS_ON_STACK];
  154. std::array<KeyContext*, MAX_BATCH_SIZE> sorted_keys_;
  155. size_t num_keys_;
  156. Mask value_mask_;
  157. uint64_t value_size_;
  158. std::unique_ptr<char[]> lookup_key_heap_buf;
  159. LookupKey* lookup_key_ptr_;
  160. #if USE_COROUTINES
  161. AsyncFileReader reader_;
  162. SingleThreadExecutor executor_;
  163. #endif // USE_COROUTINES
  164. public:
  165. // MultiGetContext::Range - Specifies a range of keys, by start and end index,
  166. // from the parent MultiGetContext. Each range contains a bit vector that
  167. // indicates whether the corresponding keys need to be processed or skipped.
  168. // A Range object can be copy constructed, and the new object inherits the
  169. // original Range's bit vector. This is useful for progressively skipping
  170. // keys as the lookup goes through various stages. For example, when looking
  171. // up keys in the same SST file, a Range is created excluding keys not
  172. // belonging to that file. A new Range is then copy constructed and individual
  173. // keys are skipped based on bloom filter lookup.
  174. class Range {
  175. public:
  176. // MultiGetContext::Range::Iterator - A forward iterator that iterates over
  177. // non-skippable keys in a Range, as well as keys whose final value has been
  178. // found. The latter is tracked by MultiGetContext::value_mask_
  179. class Iterator {
  180. public:
  181. // -- iterator traits
  182. using self_type = Iterator;
  183. using value_type = KeyContext;
  184. using reference = KeyContext&;
  185. using pointer = KeyContext*;
  186. using difference_type = int;
  187. using iterator_category = std::forward_iterator_tag;
  188. Iterator(const Range* range, size_t idx)
  189. : range_(range), ctx_(range->ctx_), index_(idx) {
  190. while (index_ < range_->end_ &&
  191. (Mask{1} << index_) &
  192. (range_->ctx_->value_mask_ | range_->skip_mask_ |
  193. range_->invalid_mask_))
  194. index_++;
  195. }
  196. Iterator(const Iterator&) = default;
  197. Iterator(const Iterator& other, const Range* range)
  198. : range_(range), ctx_(other.ctx_), index_(other.index_) {
  199. assert(range->ctx_ == other.ctx_);
  200. }
  201. Iterator& operator=(const Iterator&) = default;
  202. Iterator& operator++() {
  203. while (++index_ < range_->end_ &&
  204. (Mask{1} << index_) &
  205. (range_->ctx_->value_mask_ | range_->skip_mask_ |
  206. range_->invalid_mask_)) {
  207. // empty loop body
  208. }
  209. return *this;
  210. }
  211. bool operator==(Iterator other) const {
  212. assert(range_->ctx_ == other.range_->ctx_);
  213. return index_ == other.index_;
  214. }
  215. bool operator!=(Iterator other) const {
  216. assert(range_->ctx_ == other.range_->ctx_);
  217. return index_ != other.index_;
  218. }
  219. KeyContext& operator*() {
  220. assert(index_ < range_->end_ && index_ >= range_->start_);
  221. return *(ctx_->sorted_keys_[index_]);
  222. }
  223. KeyContext* operator->() {
  224. assert(index_ < range_->end_ && index_ >= range_->start_);
  225. return ctx_->sorted_keys_[index_];
  226. }
  227. size_t index() { return index_; }
  228. private:
  229. friend Range;
  230. const Range* range_;
  231. const MultiGetContext* ctx_;
  232. size_t index_;
  233. };
  234. Range(const Range& mget_range, const Iterator& first,
  235. const Iterator& last) {
  236. ctx_ = mget_range.ctx_;
  237. if (first == last) {
  238. // This means create an empty range based on mget_range. So just
  239. // set start_ and and_ to the same value
  240. start_ = mget_range.start_;
  241. end_ = start_;
  242. } else {
  243. start_ = first.index_;
  244. end_ = last.index_;
  245. }
  246. skip_mask_ = mget_range.skip_mask_;
  247. invalid_mask_ = mget_range.invalid_mask_;
  248. assert(start_ < 64);
  249. assert(end_ < 64);
  250. }
  251. Range() = default;
  252. Iterator begin() const { return Iterator(this, start_); }
  253. Iterator end() const { return Iterator(this, end_); }
  254. bool empty() const { return RemainingMask() == 0; }
  255. void SkipIndex(size_t index) { skip_mask_ |= Mask{1} << index; }
  256. void SkipKey(const Iterator& iter) { SkipIndex(iter.index_); }
  257. bool IsKeySkipped(const Iterator& iter) const {
  258. return skip_mask_ & (Mask{1} << iter.index_);
  259. }
  260. // Update the value_mask_ in MultiGetContext so its
  261. // immediately reflected in all the Range Iterators
  262. void MarkKeyDone(Iterator& iter) {
  263. ctx_->value_mask_ |= (Mask{1} << iter.index_);
  264. }
  265. bool CheckKeyDone(Iterator& iter) const {
  266. return ctx_->value_mask_ & (Mask{1} << iter.index_);
  267. }
  268. uint64_t KeysLeft() const { return BitsSetToOne(RemainingMask()); }
  269. void AddSkipsFrom(const Range& other) {
  270. assert(ctx_ == other.ctx_);
  271. skip_mask_ |= other.skip_mask_;
  272. }
  273. uint64_t GetValueSize() { return ctx_->value_size_; }
  274. void AddValueSize(uint64_t value_size) { ctx_->value_size_ += value_size; }
  275. MultiGetContext* context() const { return ctx_; }
  276. Range Suffix(const Range& other) const {
  277. size_t other_last = other.FindLastRemaining();
  278. size_t my_last = FindLastRemaining();
  279. if (my_last > other_last) {
  280. return Range(*this, Iterator(this, other_last),
  281. Iterator(this, my_last));
  282. } else {
  283. return Range(*this, begin(), begin());
  284. }
  285. }
  286. // The += operator expands the number of keys in this range. The expansion
  287. // is always to the right, i.e start of the additional range >= end of
  288. // current range. There should be no overlap. Any skipped keys in rhs are
  289. // marked as invalid in the invalid_mask_.
  290. Range& operator+=(const Range& rhs) {
  291. assert(rhs.start_ >= end_);
  292. // Check for non-overlapping ranges and adjust invalid_mask_ accordingly
  293. if (end_ < rhs.start_) {
  294. invalid_mask_ |= RangeMask(end_, rhs.start_);
  295. skip_mask_ |= RangeMask(end_, rhs.start_);
  296. }
  297. start_ = std::min<size_t>(start_, rhs.start_);
  298. end_ = std::max<size_t>(end_, rhs.end_);
  299. skip_mask_ |= rhs.skip_mask_ & RangeMask(rhs.start_, rhs.end_);
  300. invalid_mask_ |= (rhs.invalid_mask_ | rhs.skip_mask_) &
  301. RangeMask(rhs.start_, rhs.end_);
  302. assert(start_ < 64);
  303. assert(end_ < 64);
  304. return *this;
  305. }
  306. // The -= operator removes keys from this range. The removed keys should
  307. // come from a range completely overlapping the current range. The removed
  308. // keys are marked invalid in the invalid_mask_.
  309. Range& operator-=(const Range& rhs) {
  310. assert(start_ <= rhs.start_ && end_ >= rhs.end_);
  311. skip_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) &
  312. RangeMask(rhs.start_, rhs.end_);
  313. invalid_mask_ |= (~rhs.skip_mask_ | rhs.invalid_mask_) &
  314. RangeMask(rhs.start_, rhs.end_);
  315. return *this;
  316. }
  317. // Return a complement of the current range
  318. Range operator~() {
  319. Range res = *this;
  320. res.skip_mask_ = ~skip_mask_ & RangeMask(start_, end_);
  321. return res;
  322. }
  323. private:
  324. friend MultiGetContext;
  325. MultiGetContext* ctx_;
  326. size_t start_;
  327. size_t end_;
  328. Mask skip_mask_;
  329. Mask invalid_mask_;
  330. Range(MultiGetContext* ctx, size_t num_keys)
  331. : ctx_(ctx),
  332. start_(0),
  333. end_(num_keys),
  334. skip_mask_(0),
  335. invalid_mask_(0) {
  336. assert(num_keys < 64);
  337. }
  338. static Mask RangeMask(size_t start, size_t end) {
  339. return (((Mask{1} << (end - start)) - 1) << start);
  340. }
  341. Mask RemainingMask() const {
  342. return (((Mask{1} << end_) - 1) & ~((Mask{1} << start_) - 1) &
  343. ~(ctx_->value_mask_ | skip_mask_));
  344. }
  345. size_t FindLastRemaining() const {
  346. Mask mask = RemainingMask();
  347. size_t index = (mask >>= start_) ? start_ : 0;
  348. while (mask >>= 1) {
  349. index++;
  350. }
  351. return index;
  352. }
  353. };
  354. // Return the initial range that encompasses all the keys in the batch
  355. Range GetMultiGetRange() { return Range(this, num_keys_); }
  356. };
  357. } // namespace ROCKSDB_NAMESPACE