block_prefix_index.cc 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "table/block_based/block_prefix_index.h"
  6. #include <vector>
  7. #include "memory/arena.h"
  8. #include "rocksdb/comparator.h"
  9. #include "rocksdb/slice.h"
  10. #include "rocksdb/slice_transform.h"
  11. #include "util/coding.h"
  12. #include "util/hash.h"
  13. namespace ROCKSDB_NAMESPACE {
  14. inline uint32_t Hash(const Slice& s) {
  15. return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
  16. }
  17. inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
  18. return Hash(prefix) % num_buckets;
  19. }
  20. // The prefix block index is simply a bucket array, with each entry pointing to
  21. // the blocks that span the prefixes hashed to this bucket.
  22. //
  23. // To reduce memory footprint, if there is only one block per bucket, the entry
  24. // stores the block id directly. If there are more than one blocks per bucket,
  25. // because of hash collision or a single prefix spanning multiple blocks,
  26. // the entry points to an array of block ids. The block array is an array of
  27. // uint32_t's. The first uint32_t indicates the total number of blocks, followed
  28. // by the block ids.
  29. //
  30. // To differentiate the two cases, the high order bit of the entry indicates
  31. // whether it is a 'pointer' into a separate block array.
  32. // 0x7FFFFFFF is reserved for empty bucket.
  33. const uint32_t kNoneBlock = 0x7FFFFFFF;
  34. const uint32_t kBlockArrayMask = 0x80000000;
  35. inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
  36. inline bool IsBlockId(uint32_t block_id) {
  37. return (block_id & kBlockArrayMask) == 0;
  38. }
  39. inline uint32_t DecodeIndex(uint32_t block_id) {
  40. uint32_t index = block_id ^ kBlockArrayMask;
  41. assert(index < kBlockArrayMask);
  42. return index;
  43. }
  44. inline uint32_t EncodeIndex(uint32_t index) {
  45. assert(index < kBlockArrayMask);
  46. return index | kBlockArrayMask;
  47. }
  48. // temporary storage for prefix information during index building
  49. struct PrefixRecord {
  50. Slice prefix;
  51. uint32_t start_block;
  52. uint32_t end_block;
  53. uint32_t num_blocks;
  54. PrefixRecord* next;
  55. };
  56. class BlockPrefixIndex::Builder {
  57. public:
  58. explicit Builder(const SliceTransform* internal_prefix_extractor)
  59. : internal_prefix_extractor_(internal_prefix_extractor) {}
  60. void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
  61. PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
  62. arena_.AllocateAligned(sizeof(PrefixRecord)));
  63. record->prefix = key_prefix;
  64. record->start_block = start_block;
  65. record->end_block = start_block + num_blocks - 1;
  66. record->num_blocks = num_blocks;
  67. prefixes_.push_back(record);
  68. }
  69. BlockPrefixIndex* Finish() {
  70. // For now, use roughly 1:1 prefix to bucket ratio.
  71. uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
  72. // Collect prefix records that hash to the same bucket, into a single
  73. // linklist.
  74. std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
  75. std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
  76. for (PrefixRecord* current : prefixes_) {
  77. uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
  78. // merge the prefix block span if the first block of this prefix is
  79. // connected to the last block of the previous prefix.
  80. PrefixRecord* prev = prefixes_per_bucket[bucket];
  81. if (prev) {
  82. assert(current->start_block >= prev->end_block);
  83. auto distance = current->start_block - prev->end_block;
  84. if (distance <= 1) {
  85. prev->end_block = current->end_block;
  86. prev->num_blocks = prev->end_block - prev->start_block + 1;
  87. num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
  88. continue;
  89. }
  90. }
  91. current->next = prev;
  92. prefixes_per_bucket[bucket] = current;
  93. num_blocks_per_bucket[bucket] += current->num_blocks;
  94. }
  95. // Calculate the block array buffer size
  96. uint32_t total_block_array_entries = 0;
  97. for (uint32_t i = 0; i < num_buckets; i++) {
  98. uint32_t num_blocks = num_blocks_per_bucket[i];
  99. if (num_blocks > 1) {
  100. total_block_array_entries += (num_blocks + 1);
  101. }
  102. }
  103. // Populate the final prefix block index
  104. uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
  105. uint32_t* buckets = new uint32_t[num_buckets];
  106. uint32_t offset = 0;
  107. for (uint32_t i = 0; i < num_buckets; i++) {
  108. uint32_t num_blocks = num_blocks_per_bucket[i];
  109. if (num_blocks == 0) {
  110. assert(prefixes_per_bucket[i] == nullptr);
  111. buckets[i] = kNoneBlock;
  112. } else if (num_blocks == 1) {
  113. assert(prefixes_per_bucket[i] != nullptr);
  114. assert(prefixes_per_bucket[i]->next == nullptr);
  115. buckets[i] = prefixes_per_bucket[i]->start_block;
  116. } else {
  117. assert(total_block_array_entries > 0);
  118. assert(prefixes_per_bucket[i] != nullptr);
  119. buckets[i] = EncodeIndex(offset);
  120. block_array_buffer[offset] = num_blocks;
  121. uint32_t* last_block = &block_array_buffer[offset + num_blocks];
  122. auto current = prefixes_per_bucket[i];
  123. // populate block ids from largest to smallest
  124. while (current != nullptr) {
  125. for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
  126. *last_block = current->end_block - iter;
  127. last_block--;
  128. }
  129. current = current->next;
  130. }
  131. assert(last_block == &block_array_buffer[offset]);
  132. offset += (num_blocks + 1);
  133. }
  134. }
  135. assert(offset == total_block_array_entries);
  136. return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
  137. buckets, total_block_array_entries,
  138. block_array_buffer);
  139. }
  140. private:
  141. const SliceTransform* internal_prefix_extractor_;
  142. std::vector<PrefixRecord*> prefixes_;
  143. Arena arena_;
  144. };
  145. Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
  146. const Slice& prefixes, const Slice& prefix_meta,
  147. BlockPrefixIndex** prefix_index) {
  148. uint64_t pos = 0;
  149. auto meta_pos = prefix_meta;
  150. Status s;
  151. Builder builder(internal_prefix_extractor);
  152. while (!meta_pos.empty()) {
  153. uint32_t prefix_size = 0;
  154. uint32_t entry_index = 0;
  155. uint32_t num_blocks = 0;
  156. if (!GetVarint32(&meta_pos, &prefix_size) ||
  157. !GetVarint32(&meta_pos, &entry_index) ||
  158. !GetVarint32(&meta_pos, &num_blocks)) {
  159. s = Status::Corruption(
  160. "Corrupted prefix meta block: unable to read from it.");
  161. break;
  162. }
  163. if (pos + prefix_size > prefixes.size()) {
  164. s = Status::Corruption(
  165. "Corrupted prefix meta block: size inconsistency.");
  166. break;
  167. }
  168. Slice prefix(prefixes.data() + pos, prefix_size);
  169. builder.Add(prefix, entry_index, num_blocks);
  170. pos += prefix_size;
  171. }
  172. if (s.ok() && pos != prefixes.size()) {
  173. s = Status::Corruption("Corrupted prefix meta block");
  174. }
  175. if (s.ok()) {
  176. *prefix_index = builder.Finish();
  177. }
  178. return s;
  179. }
  180. uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
  181. Slice prefix = internal_prefix_extractor_->Transform(key);
  182. uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
  183. uint32_t block_id = buckets_[bucket];
  184. if (IsNone(block_id)) {
  185. return 0;
  186. } else if (IsBlockId(block_id)) {
  187. *blocks = &buckets_[bucket];
  188. return 1;
  189. } else {
  190. uint32_t index = DecodeIndex(block_id);
  191. assert(index < num_block_array_buffer_entries_);
  192. *blocks = &block_array_buffer_[index + 1];
  193. uint32_t num_blocks = block_array_buffer_[index];
  194. assert(num_blocks > 1);
  195. assert(index + num_blocks < num_block_array_buffer_entries_);
  196. return num_blocks;
  197. }
  198. }
  199. } // namespace ROCKSDB_NAMESPACE