| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232 |
- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
- // This source code is licensed under both the GPLv2 (found in the
- // COPYING file in the root directory) and Apache 2.0 License
- // (found in the LICENSE.Apache file in the root directory).
- #include "table/block_based/block_prefix_index.h"
- #include <vector>
- #include "memory/arena.h"
- #include "rocksdb/comparator.h"
- #include "rocksdb/slice.h"
- #include "rocksdb/slice_transform.h"
- #include "util/coding.h"
- #include "util/hash.h"
- namespace ROCKSDB_NAMESPACE {
- inline uint32_t Hash(const Slice& s) {
- return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
- }
- inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
- return Hash(prefix) % num_buckets;
- }
- // The prefix block index is simply a bucket array, with each entry pointing to
- // the blocks that span the prefixes hashed to this bucket.
- //
- // To reduce memory footprint, if there is only one block per bucket, the entry
- // stores the block id directly. If there are more than one blocks per bucket,
- // because of hash collision or a single prefix spanning multiple blocks,
- // the entry points to an array of block ids. The block array is an array of
- // uint32_t's. The first uint32_t indicates the total number of blocks, followed
- // by the block ids.
- //
- // To differentiate the two cases, the high order bit of the entry indicates
- // whether it is a 'pointer' into a separate block array.
- // 0x7FFFFFFF is reserved for empty bucket.
- const uint32_t kNoneBlock = 0x7FFFFFFF;
- const uint32_t kBlockArrayMask = 0x80000000;
- inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }
- inline bool IsBlockId(uint32_t block_id) {
- return (block_id & kBlockArrayMask) == 0;
- }
- inline uint32_t DecodeIndex(uint32_t block_id) {
- uint32_t index = block_id ^ kBlockArrayMask;
- assert(index < kBlockArrayMask);
- return index;
- }
- inline uint32_t EncodeIndex(uint32_t index) {
- assert(index < kBlockArrayMask);
- return index | kBlockArrayMask;
- }
- // temporary storage for prefix information during index building
- struct PrefixRecord {
- Slice prefix;
- uint32_t start_block;
- uint32_t end_block;
- uint32_t num_blocks;
- PrefixRecord* next;
- };
- class BlockPrefixIndex::Builder {
- public:
- explicit Builder(const SliceTransform* internal_prefix_extractor)
- : internal_prefix_extractor_(internal_prefix_extractor) {}
- void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
- PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
- arena_.AllocateAligned(sizeof(PrefixRecord)));
- record->prefix = key_prefix;
- record->start_block = start_block;
- record->end_block = start_block + num_blocks - 1;
- record->num_blocks = num_blocks;
- prefixes_.push_back(record);
- }
- BlockPrefixIndex* Finish() {
- // For now, use roughly 1:1 prefix to bucket ratio.
- uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;
- // Collect prefix records that hash to the same bucket, into a single
- // linklist.
- std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
- std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
- for (PrefixRecord* current : prefixes_) {
- uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
- // merge the prefix block span if the first block of this prefix is
- // connected to the last block of the previous prefix.
- PrefixRecord* prev = prefixes_per_bucket[bucket];
- if (prev) {
- assert(current->start_block >= prev->end_block);
- auto distance = current->start_block - prev->end_block;
- if (distance <= 1) {
- prev->end_block = current->end_block;
- prev->num_blocks = prev->end_block - prev->start_block + 1;
- num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
- continue;
- }
- }
- current->next = prev;
- prefixes_per_bucket[bucket] = current;
- num_blocks_per_bucket[bucket] += current->num_blocks;
- }
- // Calculate the block array buffer size
- uint32_t total_block_array_entries = 0;
- for (uint32_t i = 0; i < num_buckets; i++) {
- uint32_t num_blocks = num_blocks_per_bucket[i];
- if (num_blocks > 1) {
- total_block_array_entries += (num_blocks + 1);
- }
- }
- // Populate the final prefix block index
- uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
- uint32_t* buckets = new uint32_t[num_buckets];
- uint32_t offset = 0;
- for (uint32_t i = 0; i < num_buckets; i++) {
- uint32_t num_blocks = num_blocks_per_bucket[i];
- if (num_blocks == 0) {
- assert(prefixes_per_bucket[i] == nullptr);
- buckets[i] = kNoneBlock;
- } else if (num_blocks == 1) {
- assert(prefixes_per_bucket[i] != nullptr);
- assert(prefixes_per_bucket[i]->next == nullptr);
- buckets[i] = prefixes_per_bucket[i]->start_block;
- } else {
- assert(total_block_array_entries > 0);
- assert(prefixes_per_bucket[i] != nullptr);
- buckets[i] = EncodeIndex(offset);
- block_array_buffer[offset] = num_blocks;
- uint32_t* last_block = &block_array_buffer[offset + num_blocks];
- auto current = prefixes_per_bucket[i];
- // populate block ids from largest to smallest
- while (current != nullptr) {
- for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
- *last_block = current->end_block - iter;
- last_block--;
- }
- current = current->next;
- }
- assert(last_block == &block_array_buffer[offset]);
- offset += (num_blocks + 1);
- }
- }
- assert(offset == total_block_array_entries);
- return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
- buckets, total_block_array_entries,
- block_array_buffer);
- }
- private:
- const SliceTransform* internal_prefix_extractor_;
- std::vector<PrefixRecord*> prefixes_;
- Arena arena_;
- };
- Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
- const Slice& prefixes, const Slice& prefix_meta,
- BlockPrefixIndex** prefix_index) {
- uint64_t pos = 0;
- auto meta_pos = prefix_meta;
- Status s;
- Builder builder(internal_prefix_extractor);
- while (!meta_pos.empty()) {
- uint32_t prefix_size = 0;
- uint32_t entry_index = 0;
- uint32_t num_blocks = 0;
- if (!GetVarint32(&meta_pos, &prefix_size) ||
- !GetVarint32(&meta_pos, &entry_index) ||
- !GetVarint32(&meta_pos, &num_blocks)) {
- s = Status::Corruption(
- "Corrupted prefix meta block: unable to read from it.");
- break;
- }
- if (pos + prefix_size > prefixes.size()) {
- s = Status::Corruption(
- "Corrupted prefix meta block: size inconsistency.");
- break;
- }
- Slice prefix(prefixes.data() + pos, prefix_size);
- builder.Add(prefix, entry_index, num_blocks);
- pos += prefix_size;
- }
- if (s.ok() && pos != prefixes.size()) {
- s = Status::Corruption("Corrupted prefix meta block");
- }
- if (s.ok()) {
- *prefix_index = builder.Finish();
- }
- return s;
- }
- uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
- Slice prefix = internal_prefix_extractor_->Transform(key);
- uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
- uint32_t block_id = buckets_[bucket];
- if (IsNone(block_id)) {
- return 0;
- } else if (IsBlockId(block_id)) {
- *blocks = &buckets_[bucket];
- return 1;
- } else {
- uint32_t index = DecodeIndex(block_id);
- assert(index < num_block_array_buffer_entries_);
- *blocks = &block_array_buffer_[index + 1];
- uint32_t num_blocks = block_array_buffer_[index];
- assert(num_blocks > 1);
- assert(index + num_blocks < num_block_array_buffer_entries_);
- return num_blocks;
- }
- }
- } // namespace ROCKSDB_NAMESPACE
|