data_block_hash_index.h 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  6. #include <cstdint>
  7. #include <string>
  8. #include <vector>
  9. #include "rocksdb/slice.h"
  10. namespace ROCKSDB_NAMESPACE {
  11. // This is an experimental feature aiming to reduce the CPU utilization of
  12. // point-lookup within a data-block. It is only used in data blocks, and not
  13. // in meta-data blocks or per-table index blocks.
  14. //
  15. // It only used to support BlockBasedTable::Get().
  16. //
  17. // A serialized hash index is appended to the data-block. The new block data
  18. // format is as follows:
  19. //
  20. // DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
  21. //
  22. // RI: Restart Interval (the same as the default data-block format)
  23. // RI_IDX: Restart Interval index (the same as the default data-block format)
  24. // HASH_IDX: The new data-block hash index feature.
  25. // FOOTER: A 32bit block footer, which is the NUM_RESTARTS with the MSB as
  26. // the flag indicating if this hash index is in use. Note that
  27. // given a data block < 32KB, the MSB is never used. So we can
  28. // borrow the MSB as the hash index flag. Therefore, this format is
  29. // compatible with the legacy data-blocks with num_restarts < 32768,
  30. // as the MSB is 0.
  31. //
  32. // The format of the data-block hash index is as follows:
  33. //
  34. // HASH_IDX: [B B B ... B NUM_BUCK]
  35. //
  36. // B: bucket, an array of restart index. Each buckets is uint8_t.
  37. // NUM_BUCK: Number of buckets, which is the length of the bucket array.
  38. //
  39. // We reserve two special flag:
  40. // kNoEntry=255,
  41. // kCollision=254.
  42. //
  43. // Therefore, the max number of restarts this hash index can supoport is 253.
  44. //
  45. // Buckets are initialized to be kNoEntry.
  46. //
  47. // When storing a key in the hash index, the key is first hashed to a bucket.
  48. // If there the bucket is empty (kNoEntry), the restart index is stored in
  49. // the bucket. If there is already a restart index there, we will update the
  50. // existing restart index to a collision marker (kCollision). If the
  51. // the bucket is already marked as collision, we do not store the restart
  52. // index either.
  53. //
  54. // During query process, a key is first hashed to a bucket. Then we examine if
  55. // the buckets store nothing (kNoEntry) or the bucket had a collision
  56. // (kCollision). If either of those happens, we get the restart index of
  57. // the key and will directly go to the restart interval to search the key.
  58. //
  59. // Note that we only support blocks with #restart_interval < 254. If a block
  60. // has more restart interval than that, hash index will not be create for it.
  61. const uint8_t kNoEntry = 255;
  62. const uint8_t kCollision = 254;
  63. const uint8_t kMaxRestartSupportedByHashIndex = 253;
  64. // Because we use uint16_t address, we only support block no more than 64KB
  65. const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
  66. const double kDefaultUtilRatio = 0.75;
  67. class DataBlockHashIndexBuilder {
  68. public:
  69. DataBlockHashIndexBuilder()
  70. : bucket_per_key_(-1 /*uninitialized marker*/),
  71. estimated_num_buckets_(0),
  72. valid_(false) {}
  73. void Initialize(double util_ratio) {
  74. if (util_ratio <= 0) {
  75. util_ratio = kDefaultUtilRatio; // sanity check
  76. }
  77. bucket_per_key_ = 1 / util_ratio;
  78. valid_ = true;
  79. }
  80. inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
  81. void Add(const Slice& key, const size_t restart_index);
  82. void Finish(std::string& buffer);
  83. void Reset();
  84. inline size_t EstimateSize() const {
  85. uint16_t estimated_num_buckets =
  86. static_cast<uint16_t>(estimated_num_buckets_);
  87. // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
  88. estimated_num_buckets |= 1;
  89. return sizeof(uint16_t) +
  90. static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
  91. }
  92. private:
  93. double bucket_per_key_; // is the multiplicative inverse of util_ratio_
  94. double estimated_num_buckets_;
  95. // Now the only usage for `valid_` is to mark false when the inserted
  96. // restart_index is larger than supported. In this case HashIndex is not
  97. // appended to the block content.
  98. bool valid_;
  99. std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
  100. friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
  101. };
  102. class DataBlockHashIndex {
  103. public:
  104. DataBlockHashIndex() : num_buckets_(0) {}
  105. void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
  106. uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
  107. inline bool Valid() { return num_buckets_ != 0; }
  108. private:
  109. // To make the serialized hash index compact and to save the space overhead,
  110. // here all the data fields persisted in the block are in uint16 format.
  111. // We find that a uint16 is large enough to index every offset of a 64KiB
  112. // block.
  113. // So in other words, DataBlockHashIndex does not support block size equal
  114. // or greater then 64KiB.
  115. uint16_t num_buckets_;
  116. };
  117. } // namespace ROCKSDB_NAMESPACE