data_block_hash_index.cc

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include <string>
#include <vector>

#include "rocksdb/slice.h"
#include "table/block_based/data_block_hash_index.h"
#include "util/coding.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {
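
// Record one key for the index under construction, together with the restart
// interval (restart_index) the key belongs to. Each bucket entry is a single
// byte, so a restart index that does not fit is rejected by marking the
// builder invalid; Finish() must not be called on an invalid builder.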
void DataBlockHashIndexBuilder::Add(const Slice& key,
                                    const size_t restart_index) {
  assert(Valid());
  if (restart_index > kMaxRestartSupportedByHashIndex) {
    valid_ = false;
    return;
  }

  uint32_t hash_value = GetSliceHash(key);
  hash_and_restart_pairs_.emplace_back(hash_value,
                                       static_cast<uint8_t>(restart_index));
  estimated_num_buckets_ += bucket_per_key_;
}
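
// Serialize the index into `buffer`: an array of num_buckets one-byte bucket
// entries (each holding a restart index, kNoEntry, or kCollision), followed
// by the bucket count NUM_BUCK encoded as a fixed 16-bit value.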
void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
  assert(Valid());
  uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);

  if (num_buckets == 0) {
    num_buckets = 1;  // sanity check
  }

  // The built-in hash cannot distribute strings evenly across buckets when
  // num_buckets is a power of two, resulting in a high hash collision rate.
  // Make num_buckets odd to avoid this issue.
  num_buckets |= 1;

  std::vector<uint8_t> buckets(num_buckets, kNoEntry);
  // write the restart_index array
  for (auto& entry : hash_and_restart_pairs_) {
    uint32_t hash_value = entry.first;
    uint8_t restart_index = entry.second;
    uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets);
    if (buckets[buck_idx] == kNoEntry) {
      buckets[buck_idx] = restart_index;
    } else if (buckets[buck_idx] != restart_index) {
      // A bucket cannot store two different restart indices; mark a collision.
      buckets[buck_idx] = kCollision;
    }
  }

  for (uint8_t restart_index : buckets) {
    buffer.append(
        const_cast<const char*>(reinterpret_cast<char*>(&restart_index)),
        sizeof(restart_index));
  }

  // write NUM_BUCK
  PutFixed16(&buffer, num_buckets);

  assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex);
}
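
// Clear all builder state so the builder can be reused for the next block.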
void DataBlockHashIndexBuilder::Reset() {
  estimated_num_buckets_ = 0;
  valid_ = true;
  hash_and_restart_pairs_.clear();
}
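
// Read the index footer from a block: NUM_BUCK occupies the last two bytes of
// data[0, size), and the bucket array of num_buckets_ bytes immediately
// precedes it. *map_offset is set to the offset of that bucket array.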
void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
                                    uint16_t* map_offset) {
  assert(size >= sizeof(uint16_t));  // NUM_BUCKETS
  num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t));
  assert(num_buckets_ > 0);
  assert(size > num_buckets_ * sizeof(uint8_t));
  *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) -
                                      num_buckets_ * sizeof(uint8_t));
}
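
// Hash `key` into one of the buckets and return the stored byte: a candidate
// restart index, or kNoEntry / kCollision when the bucket is empty or shared
// by keys from different restart intervals.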
uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset,
                                   const Slice& key) const {
  uint32_t hash_value = GetSliceHash(key);
  uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_);
  const char* bucket_table = data + map_offset;
  return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t)));
}

}  // namespace ROCKSDB_NAMESPACE
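
// A minimal usage sketch of the two classes above (illustrative only; assumes
// the builder has been initialized with a utilization ratio as declared in
// table/block_based/data_block_hash_index.h, where in practice these classes
// are driven by the block builder and reader):
//
//   DataBlockHashIndexBuilder builder;
//   builder.Initialize(0.75 /* util_ratio */);
//   builder.Add(Slice("foo"), 0 /* restart_index */);
//   builder.Add(Slice("bar"), 1 /* restart_index */);
//
//   std::string index_contents;
//   builder.Finish(index_contents);  // bucket array + NUM_BUCK
//
//   DataBlockHashIndex index;
//   uint16_t map_offset = 0;
//   index.Initialize(index_contents.data(),
//                    static_cast<uint16_t>(index_contents.size()),
//                    &map_offset);
//   uint8_t entry = index.Lookup(index_contents.data(), map_offset,
//                                Slice("foo"));
//   // `entry` is the restart index to probe, or kNoEntry / kCollision.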