plain_table_index.cc 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #ifndef ROCKSDB_LITE
  6. #include <cinttypes>
  7. #include "table/plain/plain_table_index.h"
  8. #include "util/coding.h"
  9. #include "util/hash.h"
  10. namespace ROCKSDB_NAMESPACE {
  11. namespace {
  12. inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  13. assert(num_buckets > 0);
  14. return hash % num_buckets;
  15. }
  16. }
  17. Status PlainTableIndex::InitFromRawData(Slice data) {
  18. if (!GetVarint32(&data, &index_size_)) {
  19. return Status::Corruption("Couldn't read the index size!");
  20. }
  21. assert(index_size_ > 0);
  22. if (!GetVarint32(&data, &num_prefixes_)) {
  23. return Status::Corruption("Couldn't read the index size!");
  24. }
  25. sub_index_size_ =
  26. static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;
  27. char* index_data_begin = const_cast<char*>(data.data());
  28. index_ = reinterpret_cast<uint32_t*>(index_data_begin);
  29. sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
  30. return Status::OK();
  31. }
  32. PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
  33. uint32_t prefix_hash, uint32_t* bucket_value) const {
  34. int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
  35. GetUnaligned(index_ + bucket, bucket_value);
  36. if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
  37. *bucket_value ^= kSubIndexMask;
  38. return kSubindex;
  39. }
  40. if (*bucket_value >= kMaxFileSize) {
  41. return kNoPrefixForBucket;
  42. } else {
  43. // point directly to the file
  44. return kDirectToFile;
  45. }
  46. }
  47. void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
  48. uint32_t offset) {
  49. if (num_records_in_current_group_ == kNumRecordsPerGroup) {
  50. current_group_ = AllocateNewGroup();
  51. num_records_in_current_group_ = 0;
  52. }
  53. auto& new_record = current_group_[num_records_in_current_group_++];
  54. new_record.hash = hash;
  55. new_record.offset = offset;
  56. new_record.next = nullptr;
  57. }
  58. void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
  59. uint32_t key_offset) {
  60. if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
  61. ++num_prefixes_;
  62. if (!is_first_record_) {
  63. keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  64. }
  65. num_keys_per_prefix_ = 0;
  66. prev_key_prefix_ = key_prefix_slice.ToString();
  67. prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
  68. due_index_ = true;
  69. }
  70. if (due_index_) {
  71. // Add an index key for every kIndexIntervalForSamePrefixKeys keys
  72. record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
  73. due_index_ = false;
  74. }
  75. num_keys_per_prefix_++;
  76. if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
  77. due_index_ = true;
  78. }
  79. is_first_record_ = false;
  80. }
  81. Slice PlainTableIndexBuilder::Finish() {
  82. AllocateIndex();
  83. std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
  84. std::vector<uint32_t> entries_per_bucket(index_size_, 0);
  85. BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);
  86. keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  87. ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
  88. keys_per_prefix_hist_.ToString().c_str());
  89. // From the temp data structure, populate indexes.
  90. return FillIndexes(hash_to_offsets, entries_per_bucket);
  91. }
  92. void PlainTableIndexBuilder::AllocateIndex() {
  93. if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
  94. // Fall back to pure binary search if the user fails to specify a prefix
  95. // extractor.
  96. index_size_ = 1;
  97. } else {
  98. double hash_table_size_multipier = 1.0 / hash_table_ratio_;
  99. index_size_ =
  100. static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
  101. assert(index_size_ > 0);
  102. }
  103. }
  104. void PlainTableIndexBuilder::BucketizeIndexes(
  105. std::vector<IndexRecord*>* hash_to_offsets,
  106. std::vector<uint32_t>* entries_per_bucket) {
  107. bool first = true;
  108. uint32_t prev_hash = 0;
  109. size_t num_records = record_list_.GetNumRecords();
  110. for (size_t i = 0; i < num_records; i++) {
  111. IndexRecord* index_record = record_list_.At(i);
  112. uint32_t cur_hash = index_record->hash;
  113. if (first || prev_hash != cur_hash) {
  114. prev_hash = cur_hash;
  115. first = false;
  116. }
  117. uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
  118. IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
  119. index_record->next = prev_bucket_head;
  120. (*hash_to_offsets)[bucket] = index_record;
  121. (*entries_per_bucket)[bucket]++;
  122. }
  123. sub_index_size_ = 0;
  124. for (auto entry_count : *entries_per_bucket) {
  125. if (entry_count <= 1) {
  126. continue;
  127. }
  128. // Only buckets with more than 1 entry will have subindex.
  129. sub_index_size_ += VarintLength(entry_count);
  130. // total bytes needed to store these entries' in-file offsets.
  131. sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
  132. }
  133. }
  134. Slice PlainTableIndexBuilder::FillIndexes(
  135. const std::vector<IndexRecord*>& hash_to_offsets,
  136. const std::vector<uint32_t>& entries_per_bucket) {
  137. ROCKS_LOG_DEBUG(ioptions_.info_log,
  138. "Reserving %" PRIu32 " bytes for plain table's sub_index",
  139. sub_index_size_);
  140. auto total_allocate_size = GetTotalSize();
  141. char* allocated = arena_->AllocateAligned(
  142. total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);
  143. auto temp_ptr = EncodeVarint32(allocated, index_size_);
  144. uint32_t* index =
  145. reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
  146. char* sub_index = reinterpret_cast<char*>(index + index_size_);
  147. uint32_t sub_index_offset = 0;
  148. for (uint32_t i = 0; i < index_size_; i++) {
  149. uint32_t num_keys_for_bucket = entries_per_bucket[i];
  150. switch (num_keys_for_bucket) {
  151. case 0:
  152. // No key for bucket
  153. PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
  154. break;
  155. case 1:
  156. // point directly to the file offset
  157. PutUnaligned(index + i, hash_to_offsets[i]->offset);
  158. break;
  159. default:
  160. // point to second level indexes.
  161. PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
  162. char* prev_ptr = &sub_index[sub_index_offset];
  163. char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
  164. sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
  165. char* sub_index_pos = &sub_index[sub_index_offset];
  166. IndexRecord* record = hash_to_offsets[i];
  167. int j;
  168. for (j = num_keys_for_bucket - 1; j >= 0 && record;
  169. j--, record = record->next) {
  170. EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
  171. }
  172. assert(j == -1 && record == nullptr);
  173. sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
  174. assert(sub_index_offset <= sub_index_size_);
  175. break;
  176. }
  177. }
  178. assert(sub_index_offset == sub_index_size_);
  179. ROCKS_LOG_DEBUG(ioptions_.info_log,
  180. "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
  181. index_size_, sub_index_size_);
  182. return Slice(allocated, GetTotalSize());
  183. }
  184. const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
  185. "PlainTableIndexBlock";
  186. }; // namespace ROCKSDB_NAMESPACE
  187. #endif // ROCKSDB_LITE