data_block_hash_index.h

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

#include "rocksdb/slice.h"

namespace ROCKSDB_NAMESPACE {

// This is an experimental feature aiming to reduce the CPU utilization of
// point-lookup within a data-block. It is only used in data blocks, and not
// in meta-data blocks or per-table index blocks.
//
// It is only used to support BlockBasedTable::Get().
//
// A serialized hash index is appended to the data-block. The new block data
// format is as follows:
//
// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
//
// RI:       Restart Interval (the same as the default data-block format)
// RI_IDX:   Restart Interval index (the same as the default data-block format)
// HASH_IDX: The new data-block hash index feature.
// FOOTER:   A 32-bit block footer, which is NUM_RESTARTS with the MSB used as
//           the flag indicating whether this hash index is in use. Note that
//           given a data block < 32KB, the MSB is never needed, so we can
//           borrow the MSB as the hash index flag. Therefore, this format is
//           compatible with legacy data-blocks with num_restarts < 32768,
//           as their MSB is 0.
//
// The format of the data-block hash index is as follows:
//
// HASH_IDX: [B B B ... B NUM_BUCK]
//
// B:        bucket. The buckets form an array of restart indices; each bucket
//           is a uint8_t.
// NUM_BUCK: Number of buckets, which is the length of the bucket array.
//
// We reserve two special flags:
//      kNoEntry   = 255,
//      kCollision = 254.
//
// Therefore, the max number of restarts this hash index can support is 253.
//
// Buckets are initialized to kNoEntry.
//
// When storing a key in the hash index, the key is first hashed to a bucket.
// If the bucket is empty (kNoEntry), the restart index is stored in the
// bucket. If there is already a restart index there, we update the existing
// entry to a collision marker (kCollision). If the bucket is already marked
// as a collision, we do not store the restart index either.
//
// During a lookup, the key is first hashed to a bucket. If the bucket is
// marked as a collision (kCollision), the hash index is inconclusive and we
// fall back to the regular binary search over the restart index. If the
// bucket is empty (kNoEntry), no key was ever hashed to it, so the key is not
// in the block. Otherwise, the bucket stores the restart index of the key,
// and we go directly to that restart interval to search for the key.
//
// Note that we only support blocks with fewer than 254 restart intervals. If
// a block has more restart intervals than that, the hash index will not be
// created for it.
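
// Illustrative sketch only (not part of this header's API): one way the
// FOOTER described above can be packed and unpacked, with NUM_RESTARTS in the
// low bits and the MSB borrowed as the "hash index in use" flag. The helper
// names below are hypothetical.
inline uint32_t PackHashIndexFooterSketch(uint32_t num_restarts,
                                          bool has_hash_index) {
  // Set the MSB only when the block carries a hash index.
  return has_hash_index ? (num_restarts | (1u << 31)) : num_restarts;
}

inline void UnpackHashIndexFooterSketch(uint32_t footer, uint32_t* num_restarts,
                                        bool* has_hash_index) {
  *has_hash_index = (footer & (1u << 31)) != 0;
  *num_restarts = footer & ~(1u << 31);  // clear the flag bit
}
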
const uint8_t kNoEntry = 255;
const uint8_t kCollision = 254;
const uint8_t kMaxRestartSupportedByHashIndex = 253;

// Because we use uint16_t addresses, we only support blocks of no more
// than 64KB.
const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;

const double kDefaultUtilRatio = 0.75;
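
// Illustrative sketch only: the bucket update rule described in the comment
// block above, applied when a key's restart index is inserted into a bucket.
// The function name is hypothetical; the corresponding logic in the actual
// implementation is presumably applied when the serialized index is built.
inline uint8_t UpdateBucketSketch(uint8_t bucket, uint8_t restart_index) {
  if (bucket == kNoEntry) {
    return restart_index;  // empty bucket: store the restart index
  }
  if (bucket == kCollision) {
    return kCollision;  // already marked; nothing more to record
  }
  if (bucket != restart_index) {
    return kCollision;  // a different restart index already lives here
  }
  return bucket;  // same restart index: keep it
}
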
class DataBlockHashIndexBuilder {
 public:
  DataBlockHashIndexBuilder()
      : bucket_per_key_(-1 /*uninitialized marker*/),
        estimated_num_buckets_(0),
        valid_(false) {}

  void Initialize(double util_ratio) {
    if (util_ratio <= 0) {
      util_ratio = kDefaultUtilRatio;  // sanity check
    }
    bucket_per_key_ = 1 / util_ratio;
    valid_ = true;
  }

  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
  void Add(const Slice& key, const size_t restart_index);
  void Finish(std::string& buffer);
  void Reset();

  inline size_t EstimateSize() const {
    uint16_t estimated_num_buckets =
        static_cast<uint16_t>(estimated_num_buckets_);

    // Matching the num_buckets computation in DataBlockHashIndexBuilder::Finish.
    estimated_num_buckets |= 1;

    return sizeof(uint16_t) +
           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
  }

 private:
  double bucket_per_key_;  // the multiplicative inverse of util_ratio_
  double estimated_num_buckets_;

  // Currently the only usage of `valid_` is to mark false when an inserted
  // restart_index is larger than supported. In that case the hash index is
  // not appended to the block content.
  bool valid_;

  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
};
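
// Illustrative usage sketch only: how a block builder might drive the class
// above, based purely on the declarations in this header. The function and
// variable names are hypothetical; the real call sites are in RocksDB's
// block-building code.
inline void BuildHashIndexSketch(
    const std::vector<std::pair<Slice, size_t>>& keys_with_restart_index,
    std::string* block_contents) {
  DataBlockHashIndexBuilder builder;
  builder.Initialize(kDefaultUtilRatio);  // bucket_per_key_ = 1 / util_ratio
  for (const auto& entry : keys_with_restart_index) {
    // entry.first is the user key, entry.second the restart interval it
    // belongs to; Add() records the pair for the index.
    builder.Add(entry.first, entry.second);
  }
  if (builder.Valid() &&
      block_contents->size() + builder.EstimateSize() <
          kMaxBlockSizeSupportedByHashIndex) {
    // Serialize the bucket array and NUM_BUCK onto the end of the block.
    builder.Finish(*block_contents);
  }
}
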
class DataBlockHashIndex {
 public:
  DataBlockHashIndex() : num_buckets_(0) {}

  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);

  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;

  inline bool Valid() { return num_buckets_ != 0; }

 private:
  // To keep the serialized hash index compact and save space overhead, all
  // the data fields persisted in the block are in uint16 format. A uint16 is
  // large enough to index every offset of a 64KiB block. In other words,
  // DataBlockHashIndex does not support blocks of size equal to or greater
  // than 64KiB.
  uint16_t num_buckets_;
};
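
// Illustrative usage sketch only: how a reader might consult the hash index
// for a point lookup, following the query description at the top of this
// file. The function and variable names are hypothetical; the real logic
// lives in RocksDB's data-block iterator. Returns false when the caller
// should fall back to the regular binary search.
inline bool HashSeekSketch(const DataBlockHashIndex& index, const char* data,
                           uint32_t map_offset, const Slice& user_key,
                           uint8_t* restart_index, bool* key_may_exist) {
  uint8_t entry = index.Lookup(data, map_offset, user_key);
  if (entry == kCollision) {
    // Hash lookup is inconclusive; fall back to binary search.
    return false;
  }
  if (entry == kNoEntry) {
    // No key was ever hashed to this bucket, so the key is not in the index.
    *key_may_exist = false;
    return true;
  }
  // The bucket pinpoints the restart interval to scan for the key.
  *restart_index = entry;
  *key_may_exist = true;
  return true;
}
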
}  // namespace ROCKSDB_NAMESPACE