index_builder.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #pragma once
  10. #include <assert.h>
  11. #include <cinttypes>
  12. #include <list>
  13. #include <string>
  14. #include <unordered_map>
  15. #include "rocksdb/comparator.h"
  16. #include "table/block_based/block_based_table_factory.h"
  17. #include "table/block_based/block_builder.h"
  18. #include "table/format.h"
  19. namespace ROCKSDB_NAMESPACE {
  20. // The interface for building index.
  21. // Instruction for adding a new concrete IndexBuilder:
  22. // 1. Create a subclass instantiated from IndexBuilder.
  23. // 2. Add a new entry associated with that subclass in TableOptions::IndexType.
  24. // 3. Add a create function for the new subclass in CreateIndexBuilder.
  25. // Note: we can devise more advanced design to simplify the process for adding
  26. // new subclass, which will, on the other hand, increase the code complexity and
  27. // catch unwanted attention from readers. Given that we won't add/change
  28. // indexes frequently, it makes sense to just embrace a more straightforward
  29. // design that just works.
  30. class IndexBuilder {
  31. public:
  32. static IndexBuilder* CreateIndexBuilder(
  33. BlockBasedTableOptions::IndexType index_type,
  34. const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
  35. const InternalKeySliceTransform* int_key_slice_transform,
  36. const bool use_value_delta_encoding,
  37. const BlockBasedTableOptions& table_opt);
  38. // Index builder will construct a set of blocks which contain:
  39. // 1. One primary index block.
  40. // 2. (Optional) a set of metablocks that contains the metadata of the
  41. // primary index.
  42. struct IndexBlocks {
  43. Slice index_block_contents;
  44. std::unordered_map<std::string, Slice> meta_blocks;
  45. };
  46. explicit IndexBuilder(const InternalKeyComparator* comparator)
  47. : comparator_(comparator) {}
  48. virtual ~IndexBuilder() {}
  49. // Add a new index entry to index block.
  50. // To allow further optimization, we provide `last_key_in_current_block` and
  51. // `first_key_in_next_block`, based on which the specific implementation can
  52. // determine the best index key to be used for the index block.
  53. // Called before the OnKeyAdded() call for first_key_in_next_block.
  54. // @last_key_in_current_block: this parameter maybe overridden with the value
  55. // "substitute key".
  56. // @first_key_in_next_block: it will be nullptr if the entry being added is
  57. // the last one in the table
  58. //
  59. // REQUIRES: Finish() has not yet been called.
  60. virtual void AddIndexEntry(std::string* last_key_in_current_block,
  61. const Slice* first_key_in_next_block,
  62. const BlockHandle& block_handle) = 0;
  63. // This method will be called whenever a key is added. The subclasses may
  64. // override OnKeyAdded() if they need to collect additional information.
  65. virtual void OnKeyAdded(const Slice& /*key*/) {}
  66. // Inform the index builder that all entries has been written. Block builder
  67. // may therefore perform any operation required for block finalization.
  68. //
  69. // REQUIRES: Finish() has not yet been called.
  70. inline Status Finish(IndexBlocks* index_blocks) {
  71. // Throw away the changes to last_partition_block_handle. It has no effect
  72. // on the first call to Finish anyway.
  73. BlockHandle last_partition_block_handle;
  74. return Finish(index_blocks, last_partition_block_handle);
  75. }
  76. // This override of Finish can be utilized to build the 2nd level index in
  77. // PartitionIndexBuilder.
  78. //
  79. // index_blocks will be filled with the resulting index data. If the return
  80. // value is Status::InComplete() then it means that the index is partitioned
  81. // and the callee should keep calling Finish until Status::OK() is returned.
  82. // In that case, last_partition_block_handle is pointer to the block written
  83. // with the result of the last call to Finish. This can be utilized to build
  84. // the second level index pointing to each block of partitioned indexes. The
  85. // last call to Finish() that returns Status::OK() populates index_blocks with
  86. // the 2nd level index content.
  87. virtual Status Finish(IndexBlocks* index_blocks,
  88. const BlockHandle& last_partition_block_handle) = 0;
  89. // Get the size for index block. Must be called after ::Finish.
  90. virtual size_t IndexSize() const = 0;
  91. virtual bool seperator_is_key_plus_seq() { return true; }
  92. protected:
  93. const InternalKeyComparator* comparator_;
  94. // Set after ::Finish is called
  95. size_t index_size_ = 0;
  96. };
  97. // This index builder builds space-efficient index block.
  98. //
  99. // Optimizations:
  100. // 1. Made block's `block_restart_interval` to be 1, which will avoid linear
  101. // search when doing index lookup (can be disabled by setting
  102. // index_block_restart_interval).
  103. // 2. Shorten the key length for index block. Other than honestly using the
  104. // last key in the data block as the index key, we instead find a shortest
  105. // substitute key that serves the same function.
  106. class ShortenedIndexBuilder : public IndexBuilder {
  107. public:
  108. explicit ShortenedIndexBuilder(
  109. const InternalKeyComparator* comparator,
  110. const int index_block_restart_interval, const uint32_t format_version,
  111. const bool use_value_delta_encoding,
  112. BlockBasedTableOptions::IndexShorteningMode shortening_mode,
  113. bool include_first_key)
  114. : IndexBuilder(comparator),
  115. index_block_builder_(index_block_restart_interval,
  116. true /*use_delta_encoding*/,
  117. use_value_delta_encoding),
  118. index_block_builder_without_seq_(index_block_restart_interval,
  119. true /*use_delta_encoding*/,
  120. use_value_delta_encoding),
  121. use_value_delta_encoding_(use_value_delta_encoding),
  122. include_first_key_(include_first_key),
  123. shortening_mode_(shortening_mode) {
  124. // Making the default true will disable the feature for old versions
  125. seperator_is_key_plus_seq_ = (format_version <= 2);
  126. }
  127. virtual void OnKeyAdded(const Slice& key) override {
  128. if (include_first_key_ && current_block_first_internal_key_.empty()) {
  129. current_block_first_internal_key_.assign(key.data(), key.size());
  130. }
  131. }
  132. virtual void AddIndexEntry(std::string* last_key_in_current_block,
  133. const Slice* first_key_in_next_block,
  134. const BlockHandle& block_handle) override {
  135. if (first_key_in_next_block != nullptr) {
  136. if (shortening_mode_ !=
  137. BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
  138. comparator_->FindShortestSeparator(last_key_in_current_block,
  139. *first_key_in_next_block);
  140. }
  141. if (!seperator_is_key_plus_seq_ &&
  142. comparator_->user_comparator()->Compare(
  143. ExtractUserKey(*last_key_in_current_block),
  144. ExtractUserKey(*first_key_in_next_block)) == 0) {
  145. seperator_is_key_plus_seq_ = true;
  146. }
  147. } else {
  148. if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
  149. kShortenSeparatorsAndSuccessor) {
  150. comparator_->FindShortSuccessor(last_key_in_current_block);
  151. }
  152. }
  153. auto sep = Slice(*last_key_in_current_block);
  154. assert(!include_first_key_ || !current_block_first_internal_key_.empty());
  155. IndexValue entry(block_handle, current_block_first_internal_key_);
  156. std::string encoded_entry;
  157. std::string delta_encoded_entry;
  158. entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
  159. if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
  160. entry.EncodeTo(&delta_encoded_entry, include_first_key_,
  161. &last_encoded_handle_);
  162. } else {
  163. // If it's the first block, or delta encoding is disabled,
  164. // BlockBuilder::Add() below won't use delta-encoded slice.
  165. }
  166. last_encoded_handle_ = block_handle;
  167. const Slice delta_encoded_entry_slice(delta_encoded_entry);
  168. index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice);
  169. if (!seperator_is_key_plus_seq_) {
  170. index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry,
  171. &delta_encoded_entry_slice);
  172. }
  173. current_block_first_internal_key_.clear();
  174. }
  175. using IndexBuilder::Finish;
  176. virtual Status Finish(
  177. IndexBlocks* index_blocks,
  178. const BlockHandle& /*last_partition_block_handle*/) override {
  179. if (seperator_is_key_plus_seq_) {
  180. index_blocks->index_block_contents = index_block_builder_.Finish();
  181. } else {
  182. index_blocks->index_block_contents =
  183. index_block_builder_without_seq_.Finish();
  184. }
  185. index_size_ = index_blocks->index_block_contents.size();
  186. return Status::OK();
  187. }
  188. virtual size_t IndexSize() const override { return index_size_; }
  189. virtual bool seperator_is_key_plus_seq() override {
  190. return seperator_is_key_plus_seq_;
  191. }
  192. friend class PartitionedIndexBuilder;
  193. private:
  194. BlockBuilder index_block_builder_;
  195. BlockBuilder index_block_builder_without_seq_;
  196. const bool use_value_delta_encoding_;
  197. bool seperator_is_key_plus_seq_;
  198. const bool include_first_key_;
  199. BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
  200. BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
  201. std::string current_block_first_internal_key_;
  202. };
  203. // HashIndexBuilder contains a binary-searchable primary index and the
  204. // metadata for secondary hash index construction.
  205. // The metadata for hash index consists two parts:
  206. // - a metablock that compactly contains a sequence of prefixes. All prefixes
  207. // are stored consectively without any metadata (like, prefix sizes) being
  208. // stored, which is kept in the other metablock.
  209. // - a metablock contains the metadata of the prefixes, including prefix size,
  210. // restart index and number of block it spans. The format looks like:
  211. //
  212. // +-----------------+---------------------------+---------------------+
  213. // <=prefix 1
  214. // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
  215. // +-----------------+---------------------------+---------------------+
  216. // <=prefix 2
  217. // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
  218. // +-----------------+---------------------------+---------------------+
  219. // | |
  220. // | .... |
  221. // | |
  222. // +-----------------+---------------------------+---------------------+
  223. // <=prefix n
  224. // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
  225. // +-----------------+---------------------------+---------------------+
  226. //
  227. // The reason of separating these two metablocks is to enable the efficiently
  228. // reuse the first metablock during hash index construction without unnecessary
  229. // data copy or small heap allocations for prefixes.
  230. class HashIndexBuilder : public IndexBuilder {
  231. public:
  232. explicit HashIndexBuilder(
  233. const InternalKeyComparator* comparator,
  234. const SliceTransform* hash_key_extractor,
  235. int index_block_restart_interval, int format_version,
  236. bool use_value_delta_encoding,
  237. BlockBasedTableOptions::IndexShorteningMode shortening_mode)
  238. : IndexBuilder(comparator),
  239. primary_index_builder_(comparator, index_block_restart_interval,
  240. format_version, use_value_delta_encoding,
  241. shortening_mode, /* include_first_key */ false),
  242. hash_key_extractor_(hash_key_extractor) {}
  243. virtual void AddIndexEntry(std::string* last_key_in_current_block,
  244. const Slice* first_key_in_next_block,
  245. const BlockHandle& block_handle) override {
  246. ++current_restart_index_;
  247. primary_index_builder_.AddIndexEntry(last_key_in_current_block,
  248. first_key_in_next_block, block_handle);
  249. }
  250. virtual void OnKeyAdded(const Slice& key) override {
  251. auto key_prefix = hash_key_extractor_->Transform(key);
  252. bool is_first_entry = pending_block_num_ == 0;
  253. // Keys may share the prefix
  254. if (is_first_entry || pending_entry_prefix_ != key_prefix) {
  255. if (!is_first_entry) {
  256. FlushPendingPrefix();
  257. }
  258. // need a hard copy otherwise the underlying data changes all the time.
  259. // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
  260. // copy.
  261. pending_entry_prefix_ = key_prefix.ToString();
  262. pending_block_num_ = 1;
  263. pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
  264. } else {
  265. // entry number increments when keys share the prefix reside in
  266. // different data blocks.
  267. auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
  268. assert(last_restart_index <= current_restart_index_);
  269. if (last_restart_index != current_restart_index_) {
  270. ++pending_block_num_;
  271. }
  272. }
  273. }
  274. virtual Status Finish(
  275. IndexBlocks* index_blocks,
  276. const BlockHandle& last_partition_block_handle) override {
  277. if (pending_block_num_ != 0) {
  278. FlushPendingPrefix();
  279. }
  280. primary_index_builder_.Finish(index_blocks, last_partition_block_handle);
  281. index_blocks->meta_blocks.insert(
  282. {kHashIndexPrefixesBlock.c_str(), prefix_block_});
  283. index_blocks->meta_blocks.insert(
  284. {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
  285. return Status::OK();
  286. }
  287. virtual size_t IndexSize() const override {
  288. return primary_index_builder_.IndexSize() + prefix_block_.size() +
  289. prefix_meta_block_.size();
  290. }
  291. virtual bool seperator_is_key_plus_seq() override {
  292. return primary_index_builder_.seperator_is_key_plus_seq();
  293. }
  294. private:
  295. void FlushPendingPrefix() {
  296. prefix_block_.append(pending_entry_prefix_.data(),
  297. pending_entry_prefix_.size());
  298. PutVarint32Varint32Varint32(
  299. &prefix_meta_block_,
  300. static_cast<uint32_t>(pending_entry_prefix_.size()),
  301. pending_entry_index_, pending_block_num_);
  302. }
  303. ShortenedIndexBuilder primary_index_builder_;
  304. const SliceTransform* hash_key_extractor_;
  305. // stores a sequence of prefixes
  306. std::string prefix_block_;
  307. // stores the metadata of prefixes
  308. std::string prefix_meta_block_;
  309. // The following 3 variables keeps unflushed prefix and its metadata.
  310. // The details of block_num and entry_index can be found in
  311. // "block_hash_index.{h,cc}"
  312. uint32_t pending_block_num_ = 0;
  313. uint32_t pending_entry_index_ = 0;
  314. std::string pending_entry_prefix_;
  315. uint64_t current_restart_index_ = 0;
  316. };
  317. /**
  318. * IndexBuilder for two-level indexing. Internally it creates a new index for
  319. * each partition and Finish then in order when Finish is called on it
  320. * continiously until Status::OK() is returned.
  321. *
  322. * The format on the disk would be I I I I I I IP where I is block containing a
  323. * partition of indexes built using ShortenedIndexBuilder and IP is a block
  324. * containing a secondary index on the partitions, built using
  325. * ShortenedIndexBuilder.
  326. */
  327. class PartitionedIndexBuilder : public IndexBuilder {
  328. public:
  329. static PartitionedIndexBuilder* CreateIndexBuilder(
  330. const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator,
  331. const bool use_value_delta_encoding,
  332. const BlockBasedTableOptions& table_opt);
  333. explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
  334. const BlockBasedTableOptions& table_opt,
  335. const bool use_value_delta_encoding);
  336. virtual ~PartitionedIndexBuilder();
  337. virtual void AddIndexEntry(std::string* last_key_in_current_block,
  338. const Slice* first_key_in_next_block,
  339. const BlockHandle& block_handle) override;
  340. virtual Status Finish(
  341. IndexBlocks* index_blocks,
  342. const BlockHandle& last_partition_block_handle) override;
  343. virtual size_t IndexSize() const override { return index_size_; }
  344. size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
  345. size_t NumPartitions() const;
  346. inline bool ShouldCutFilterBlock() {
  347. // Current policy is to align the partitions of index and filters
  348. if (cut_filter_block) {
  349. cut_filter_block = false;
  350. return true;
  351. }
  352. return false;
  353. }
  354. std::string& GetPartitionKey() { return sub_index_last_key_; }
  355. // Called when an external entity (such as filter partition builder) request
  356. // cutting the next partition
  357. void RequestPartitionCut();
  358. virtual bool seperator_is_key_plus_seq() override {
  359. return seperator_is_key_plus_seq_;
  360. }
  361. bool get_use_value_delta_encoding() { return use_value_delta_encoding_; }
  362. private:
  363. // Set after ::Finish is called
  364. size_t top_level_index_size_ = 0;
  365. // Set after ::Finish is called
  366. size_t partition_cnt_ = 0;
  367. void MakeNewSubIndexBuilder();
  368. struct Entry {
  369. std::string key;
  370. std::unique_ptr<ShortenedIndexBuilder> value;
  371. };
  372. std::list<Entry> entries_; // list of partitioned indexes and their keys
  373. BlockBuilder index_block_builder_; // top-level index builder
  374. BlockBuilder index_block_builder_without_seq_; // same for user keys
  375. // the active partition index builder
  376. ShortenedIndexBuilder* sub_index_builder_;
  377. // the last key in the active partition index builder
  378. std::string sub_index_last_key_;
  379. std::unique_ptr<FlushBlockPolicy> flush_policy_;
  380. // true if Finish is called once but not complete yet.
  381. bool finishing_indexes = false;
  382. const BlockBasedTableOptions& table_opt_;
  383. bool seperator_is_key_plus_seq_;
  384. bool use_value_delta_encoding_;
  385. // true if an external entity (such as filter partition builder) request
  386. // cutting the next partition
  387. bool partition_cut_requested_ = true;
  388. // true if it should cut the next filter partition block
  389. bool cut_filter_block = false;
  390. BlockHandle last_encoded_handle_;
  391. };
  392. } // namespace ROCKSDB_NAMESPACE