plain_table_index.h 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  6. #ifndef ROCKSDB_LITE
  7. #include <string>
  8. #include <vector>
  9. #include "db/dbformat.h"
  10. #include "memory/arena.h"
  11. #include "monitoring/histogram.h"
  12. #include "options/cf_options.h"
  13. #include "rocksdb/options.h"
  14. namespace ROCKSDB_NAMESPACE {
  15. // The file contains two classes PlainTableIndex and PlainTableIndexBuilder
  16. // The two classes implement the index format of PlainTable.
  17. // For descripton of PlainTable format, see comments of class
  18. // PlainTableFactory
  19. //
  20. //
  21. // PlainTableIndex contains buckets size of index_size_, each is a
  22. // 32-bit integer. The lower 31 bits contain an offset value (explained below)
  23. // and the first bit of the integer indicates type of the offset.
  24. //
  25. // +--------------+------------------------------------------------------+
  26. // | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
  27. // +--------------+------------------------------------------------------+
  28. //
  29. // Explanation for the "flag bit":
  30. //
  31. // 0 indicates that the bucket contains only one prefix (no conflict when
  32. // hashing this prefix), whose first row starts from this offset of the
  33. // file.
  34. // 1 indicates that the bucket contains more than one prefixes, or there
  35. // are too many rows for one prefix so we need a binary search for it. In
  36. // this case, the offset indicates the offset of sub_index_ holding the
  37. // binary search indexes of keys for those rows. Those binary search indexes
  38. // are organized in this way:
  39. //
  40. // The first 4 bytes, indicate how many indexes (N) are stored after it. After
  41. // it, there are N 32-bit integers, each points of an offset of the file,
  42. // which
  43. // points to starting of a row. Those offsets need to be guaranteed to be in
  44. // ascending order so the keys they are pointing to are also in ascending
  45. // order
  46. // to make sure we can use them to do binary searches. Below is visual
  47. // presentation of a bucket.
  48. //
  49. // <begin>
  50. // number_of_records: varint32
  51. // record 1 file offset: fixedint32
  52. // record 2 file offset: fixedint32
  53. // ....
  54. // record N file offset: fixedint32
  55. // <end>
  56. // The class loads the index block from a PlainTable SST file, and executes
  57. // the index lookup.
  58. // The class is used by PlainTableReader class.
  59. class PlainTableIndex {
  60. public:
  61. enum IndexSearchResult {
  62. kNoPrefixForBucket = 0,
  63. kDirectToFile = 1,
  64. kSubindex = 2
  65. };
  66. explicit PlainTableIndex(Slice data) { InitFromRawData(data); }
  67. PlainTableIndex()
  68. : index_size_(0),
  69. sub_index_size_(0),
  70. num_prefixes_(0),
  71. index_(nullptr),
  72. sub_index_(nullptr) {}
  73. // The function that executes the lookup the hash table.
  74. // The hash key is `prefix_hash`. The function fills the hash bucket
  75. // content in `bucket_value`, which is up to the caller to interpret.
  76. IndexSearchResult GetOffset(uint32_t prefix_hash,
  77. uint32_t* bucket_value) const;
  78. // Initialize data from `index_data`, which points to raw data for
  79. // index stored in the SST file.
  80. Status InitFromRawData(Slice index_data);
  81. // Decode the sub index for specific hash bucket.
  82. // The `offset` is the value returned as `bucket_value` by GetOffset()
  83. // and is only valid when the return value is `kSubindex`.
  84. // The return value is the pointer to the starting address of the
  85. // sub-index. `upper_bound` is filled with the value indicating how many
  86. // entries the sub-index has.
  87. const char* GetSubIndexBasePtrAndUpperBound(uint32_t offset,
  88. uint32_t* upper_bound) const {
  89. const char* index_ptr = &sub_index_[offset];
  90. return GetVarint32Ptr(index_ptr, index_ptr + 4, upper_bound);
  91. }
  92. uint32_t GetIndexSize() const { return index_size_; }
  93. uint32_t GetSubIndexSize() const { return sub_index_size_; }
  94. uint32_t GetNumPrefixes() const { return num_prefixes_; }
  95. static const uint64_t kMaxFileSize = (1u << 31) - 1;
  96. static const uint32_t kSubIndexMask = 0x80000000;
  97. static const size_t kOffsetLen = sizeof(uint32_t);
  98. private:
  99. uint32_t index_size_;
  100. uint32_t sub_index_size_;
  101. uint32_t num_prefixes_;
  102. uint32_t* index_;
  103. char* sub_index_;
  104. };
  105. // PlainTableIndexBuilder is used to create plain table index.
  106. // After calling Finish(), it returns Slice, which is usually
  107. // used either to initialize PlainTableIndex or
  108. // to save index to sst file.
  109. // For more details about the index, please refer to:
  110. // https://github.com/facebook/rocksdb/wiki/PlainTable-Format
  111. // #wiki-in-memory-index-format
  112. // The class is used by PlainTableBuilder class.
  113. class PlainTableIndexBuilder {
  114. public:
  115. PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions,
  116. const SliceTransform* prefix_extractor,
  117. size_t index_sparseness, double hash_table_ratio,
  118. size_t huge_page_tlb_size)
  119. : arena_(arena),
  120. ioptions_(ioptions),
  121. record_list_(kRecordsPerGroup),
  122. is_first_record_(true),
  123. due_index_(false),
  124. num_prefixes_(0),
  125. num_keys_per_prefix_(0),
  126. prev_key_prefix_hash_(0),
  127. index_sparseness_(index_sparseness),
  128. index_size_(0),
  129. sub_index_size_(0),
  130. prefix_extractor_(prefix_extractor),
  131. hash_table_ratio_(hash_table_ratio),
  132. huge_page_tlb_size_(huge_page_tlb_size) {}
  133. void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset);
  134. Slice Finish();
  135. uint32_t GetTotalSize() const {
  136. return VarintLength(index_size_) + VarintLength(num_prefixes_) +
  137. PlainTableIndex::kOffsetLen * index_size_ + sub_index_size_;
  138. }
  139. static const std::string kPlainTableIndexBlock;
  140. private:
  141. struct IndexRecord {
  142. uint32_t hash; // hash of the prefix
  143. uint32_t offset; // offset of a row
  144. IndexRecord* next;
  145. };
  146. // Helper class to track all the index records
  147. class IndexRecordList {
  148. public:
  149. explicit IndexRecordList(size_t num_records_per_group)
  150. : kNumRecordsPerGroup(num_records_per_group),
  151. current_group_(nullptr),
  152. num_records_in_current_group_(num_records_per_group) {}
  153. ~IndexRecordList() {
  154. for (size_t i = 0; i < groups_.size(); i++) {
  155. delete[] groups_[i];
  156. }
  157. }
  158. void AddRecord(uint32_t hash, uint32_t offset);
  159. size_t GetNumRecords() const {
  160. return (groups_.size() - 1) * kNumRecordsPerGroup +
  161. num_records_in_current_group_;
  162. }
  163. IndexRecord* At(size_t index) {
  164. return &(groups_[index / kNumRecordsPerGroup]
  165. [index % kNumRecordsPerGroup]);
  166. }
  167. private:
  168. IndexRecord* AllocateNewGroup() {
  169. IndexRecord* result = new IndexRecord[kNumRecordsPerGroup];
  170. groups_.push_back(result);
  171. return result;
  172. }
  173. // Each group in `groups_` contains fix-sized records (determined by
  174. // kNumRecordsPerGroup). Which can help us minimize the cost if resizing
  175. // occurs.
  176. const size_t kNumRecordsPerGroup;
  177. IndexRecord* current_group_;
  178. // List of arrays allocated
  179. std::vector<IndexRecord*> groups_;
  180. size_t num_records_in_current_group_;
  181. };
  182. void AllocateIndex();
  183. // Internal helper function to bucket index record list to hash buckets.
  184. void BucketizeIndexes(std::vector<IndexRecord*>* hash_to_offsets,
  185. std::vector<uint32_t>* entries_per_bucket);
  186. // Internal helper class to fill the indexes and bloom filters to internal
  187. // data structures.
  188. Slice FillIndexes(const std::vector<IndexRecord*>& hash_to_offsets,
  189. const std::vector<uint32_t>& entries_per_bucket);
  190. Arena* arena_;
  191. const ImmutableCFOptions ioptions_;
  192. HistogramImpl keys_per_prefix_hist_;
  193. IndexRecordList record_list_;
  194. bool is_first_record_;
  195. bool due_index_;
  196. uint32_t num_prefixes_;
  197. uint32_t num_keys_per_prefix_;
  198. uint32_t prev_key_prefix_hash_;
  199. size_t index_sparseness_;
  200. uint32_t index_size_;
  201. uint32_t sub_index_size_;
  202. const SliceTransform* prefix_extractor_;
  203. double hash_table_ratio_;
  204. size_t huge_page_tlb_size_;
  205. std::string prev_key_prefix_;
  206. static const size_t kRecordsPerGroup = 256;
  207. };
  208. }; // namespace ROCKSDB_NAMESPACE
  209. #endif // ROCKSDB_LITE