block_builder.cc 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. //
  10. // BlockBuilder generates blocks where keys are prefix-compressed:
  11. //
  12. // When we store a key, we drop the prefix shared with the previous
  13. // string. This helps reduce the space requirement significantly.
  14. // Furthermore, once every K keys, we do not apply the prefix
  15. // compression and store the entire key. We call this a "restart
  16. // point". The tail end of the block stores the offsets of all of the
  17. // restart points, and can be used to do a binary search when looking
  18. // for a particular key. Values are stored as-is (without compression)
  19. // immediately following the corresponding key.
  20. //
  21. // An entry for a particular key-value pair has the form:
  22. // shared_bytes: varint32
  23. // unshared_bytes: varint32
  24. // value_length: varint32
  25. // key_delta: char[unshared_bytes]
  26. // value: char[value_length]
  27. // shared_bytes == 0 for restart points.
  28. //
  29. // The trailer of the block has the form:
  30. // restarts: uint32[num_restarts]
  31. // num_restarts: uint32
  32. // restarts[i] contains the offset within the block of the ith restart point.
  33. #include "table/block_based/block_builder.h"
  34. #include <assert.h>
  35. #include <algorithm>
  36. #include "db/dbformat.h"
  37. #include "rocksdb/comparator.h"
  38. #include "table/block_based/data_block_footer.h"
  39. #include "util/coding.h"
  40. namespace ROCKSDB_NAMESPACE {
  41. BlockBuilder::BlockBuilder(
  42. int block_restart_interval, bool use_delta_encoding,
  43. bool use_value_delta_encoding,
  44. BlockBasedTableOptions::DataBlockIndexType index_type,
  45. double data_block_hash_table_util_ratio)
  46. : block_restart_interval_(block_restart_interval),
  47. use_delta_encoding_(use_delta_encoding),
  48. use_value_delta_encoding_(use_value_delta_encoding),
  49. restarts_(),
  50. counter_(0),
  51. finished_(false) {
  52. switch (index_type) {
  53. case BlockBasedTableOptions::kDataBlockBinarySearch:
  54. break;
  55. case BlockBasedTableOptions::kDataBlockBinaryAndHash:
  56. data_block_hash_index_builder_.Initialize(
  57. data_block_hash_table_util_ratio);
  58. break;
  59. default:
  60. assert(0);
  61. }
  62. assert(block_restart_interval_ >= 1);
  63. restarts_.push_back(0); // First restart point is at offset 0
  64. estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
  65. }
  66. void BlockBuilder::Reset() {
  67. buffer_.clear();
  68. restarts_.clear();
  69. restarts_.push_back(0); // First restart point is at offset 0
  70. estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
  71. counter_ = 0;
  72. finished_ = false;
  73. last_key_.clear();
  74. if (data_block_hash_index_builder_.Valid()) {
  75. data_block_hash_index_builder_.Reset();
  76. }
  77. }
  78. size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
  79. const Slice& value) const {
  80. size_t estimate = CurrentSizeEstimate();
  81. // Note: this is an imprecise estimate as it accounts for the whole key size
  82. // instead of non-shared key size.
  83. estimate += key.size();
  84. // In value delta encoding we estimate the value delta size as half the full
  85. // value size since only the size field of block handle is encoded.
  86. estimate +=
  87. !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
  88. ? value.size()
  89. : value.size() / 2;
  90. if (counter_ >= block_restart_interval_) {
  91. estimate += sizeof(uint32_t); // a new restart entry.
  92. }
  93. estimate += sizeof(int32_t); // varint for shared prefix length.
  94. // Note: this is an imprecise estimate as we will have to encoded size, one
  95. // for shared key and one for non-shared key.
  96. estimate += VarintLength(key.size()); // varint for key length.
  97. if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
  98. estimate += VarintLength(value.size()); // varint for value length.
  99. }
  100. return estimate;
  101. }
  102. Slice BlockBuilder::Finish() {
  103. // Append restart array
  104. for (size_t i = 0; i < restarts_.size(); i++) {
  105. PutFixed32(&buffer_, restarts_[i]);
  106. }
  107. uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
  108. BlockBasedTableOptions::DataBlockIndexType index_type =
  109. BlockBasedTableOptions::kDataBlockBinarySearch;
  110. if (data_block_hash_index_builder_.Valid() &&
  111. CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
  112. data_block_hash_index_builder_.Finish(buffer_);
  113. index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
  114. }
  115. // footer is a packed format of data_block_index_type and num_restarts
  116. uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
  117. PutFixed32(&buffer_, block_footer);
  118. finished_ = true;
  119. return Slice(buffer_);
  120. }
  121. void BlockBuilder::Add(const Slice& key, const Slice& value,
  122. const Slice* const delta_value) {
  123. assert(!finished_);
  124. assert(counter_ <= block_restart_interval_);
  125. assert(!use_value_delta_encoding_ || delta_value);
  126. size_t shared = 0; // number of bytes shared with prev key
  127. if (counter_ >= block_restart_interval_) {
  128. // Restart compression
  129. restarts_.push_back(static_cast<uint32_t>(buffer_.size()));
  130. estimate_ += sizeof(uint32_t);
  131. counter_ = 0;
  132. if (use_delta_encoding_) {
  133. // Update state
  134. last_key_.assign(key.data(), key.size());
  135. }
  136. } else if (use_delta_encoding_) {
  137. Slice last_key_piece(last_key_);
  138. // See how much sharing to do with previous string
  139. shared = key.difference_offset(last_key_piece);
  140. // Update state
  141. // We used to just copy the changed data here, but it appears to be
  142. // faster to just copy the whole thing.
  143. last_key_.assign(key.data(), key.size());
  144. }
  145. const size_t non_shared = key.size() - shared;
  146. const size_t curr_size = buffer_.size();
  147. if (use_value_delta_encoding_) {
  148. // Add "<shared><non_shared>" to buffer_
  149. PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
  150. static_cast<uint32_t>(non_shared));
  151. } else {
  152. // Add "<shared><non_shared><value_size>" to buffer_
  153. PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
  154. static_cast<uint32_t>(non_shared),
  155. static_cast<uint32_t>(value.size()));
  156. }
  157. // Add string delta to buffer_ followed by value
  158. buffer_.append(key.data() + shared, non_shared);
  159. // Use value delta encoding only when the key has shared bytes. This would
  160. // simplify the decoding, where it can figure which decoding to use simply by
  161. // looking at the shared bytes size.
  162. if (shared != 0 && use_value_delta_encoding_) {
  163. buffer_.append(delta_value->data(), delta_value->size());
  164. } else {
  165. buffer_.append(value.data(), value.size());
  166. }
  167. if (data_block_hash_index_builder_.Valid()) {
  168. data_block_hash_index_builder_.Add(ExtractUserKey(key),
  169. restarts_.size() - 1);
  170. }
  171. counter_++;
  172. estimate_ += buffer_.size() - curr_size;
  173. }
  174. } // namespace ROCKSDB_NAMESPACE