block_builder.cc 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. //
  10. // BlockBuilder generates blocks where keys are prefix-compressed:
  11. //
  12. // When we store a key, we drop the prefix shared with the previous
  13. // string. This helps reduce the space requirement significantly.
  14. // Furthermore, once every K keys, we do not apply the prefix
  15. // compression and store the entire key. We call this a "restart
  16. // point". The tail end of the block stores the offsets of all of the
  17. // restart points, and can be used to do a binary search when looking
  18. // for a particular key. Values are stored as-is (without compression)
  19. // immediately following the corresponding key.
  20. //
  21. // An entry for a particular key-value pair has the form:
  22. // shared_bytes: varint32
  23. // unshared_bytes: varint32
  24. // value_length: varint32 (NOTE1)
  25. // key_delta: char[unshared_bytes]
  26. // value: char[value_length]
  27. // shared_bytes == 0 (explicitly stored) for restart points.
  28. //
  29. // The trailer of the block has the form:
  30. // restarts: uint32[num_restarts]
  31. // num_restarts: uint32
  32. // restarts[i] contains the offset within the block of the ith restart point.
  33. //
  34. // NOTE1: omitted for format_version >= 4 index blocks, because the value is
  35. // composed of one (shared_bytes > 0) or two (shared_bytes == 0) varints, whose
  36. // length is self-describing.
  37. #include "table/block_based/block_builder.h"
  38. #include <algorithm>
  39. #include <cassert>
  40. #include "db/dbformat.h"
  41. #include "rocksdb/comparator.h"
  42. #include "table/block_based/data_block_footer.h"
  43. #include "util/coding.h"
  44. namespace ROCKSDB_NAMESPACE {
  45. BlockBuilder::BlockBuilder(
  46. int block_restart_interval, bool use_delta_encoding,
  47. bool use_value_delta_encoding,
  48. BlockBasedTableOptions::DataBlockIndexType index_type,
  49. double data_block_hash_table_util_ratio, size_t ts_sz,
  50. bool persist_user_defined_timestamps, bool is_user_key)
  51. : block_restart_interval_(block_restart_interval),
  52. use_delta_encoding_(use_delta_encoding),
  53. use_value_delta_encoding_(use_value_delta_encoding),
  54. strip_ts_sz_(persist_user_defined_timestamps ? 0 : ts_sz),
  55. is_user_key_(is_user_key),
  56. restarts_(1, 0), // First restart point is at offset 0
  57. counter_(0),
  58. finished_(false) {
  59. switch (index_type) {
  60. case BlockBasedTableOptions::kDataBlockBinarySearch:
  61. break;
  62. case BlockBasedTableOptions::kDataBlockBinaryAndHash:
  63. data_block_hash_index_builder_.Initialize(
  64. data_block_hash_table_util_ratio);
  65. break;
  66. default:
  67. assert(0);
  68. }
  69. assert(block_restart_interval_ >= 1);
  70. estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
  71. }
  72. void BlockBuilder::Reset() {
  73. buffer_.clear();
  74. restarts_.resize(1); // First restart point is at offset 0
  75. assert(restarts_[0] == 0);
  76. estimate_ = sizeof(uint32_t) + sizeof(uint32_t);
  77. counter_ = 0;
  78. finished_ = false;
  79. last_key_.clear();
  80. if (data_block_hash_index_builder_.Valid()) {
  81. data_block_hash_index_builder_.Reset();
  82. }
  83. #ifndef NDEBUG
  84. add_with_last_key_called_ = false;
  85. #endif
  86. }
  87. void BlockBuilder::SwapAndReset(std::string& buffer) {
  88. std::swap(buffer_, buffer);
  89. Reset();
  90. }
  91. size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key,
  92. const Slice& value) const {
  93. size_t estimate = CurrentSizeEstimate();
  94. // Note: this is an imprecise estimate as it accounts for the whole key size
  95. // instead of non-shared key size.
  96. estimate += key.size();
  97. if (strip_ts_sz_ > 0) {
  98. estimate -= strip_ts_sz_;
  99. }
  100. // In value delta encoding we estimate the value delta size as half the full
  101. // value size since only the size field of block handle is encoded.
  102. estimate +=
  103. !use_value_delta_encoding_ || (counter_ >= block_restart_interval_)
  104. ? value.size()
  105. : value.size() / 2;
  106. if (counter_ >= block_restart_interval_) {
  107. estimate += sizeof(uint32_t); // a new restart entry.
  108. }
  109. estimate += sizeof(int32_t); // varint for shared prefix length.
  110. // Note: this is an imprecise estimate as we will have to encoded size, one
  111. // for shared key and one for non-shared key.
  112. estimate += VarintLength(key.size()); // varint for key length.
  113. if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) {
  114. estimate += VarintLength(value.size()); // varint for value length.
  115. }
  116. return estimate;
  117. }
  118. Slice BlockBuilder::Finish() {
  119. // Append restart array
  120. for (size_t i = 0; i < restarts_.size(); i++) {
  121. PutFixed32(&buffer_, restarts_[i]);
  122. }
  123. uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
  124. BlockBasedTableOptions::DataBlockIndexType index_type =
  125. BlockBasedTableOptions::kDataBlockBinarySearch;
  126. if (data_block_hash_index_builder_.Valid() &&
  127. CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
  128. data_block_hash_index_builder_.Finish(buffer_);
  129. index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
  130. }
  131. // footer is a packed format of data_block_index_type and num_restarts
  132. uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
  133. PutFixed32(&buffer_, block_footer);
  134. finished_ = true;
  135. return Slice(buffer_);
  136. }
  137. void BlockBuilder::Add(const Slice& key, const Slice& value,
  138. const Slice* const delta_value,
  139. bool skip_delta_encoding) {
  140. // Ensure no unsafe mixing of Add and AddWithLastKey
  141. assert(!add_with_last_key_called_);
  142. AddWithLastKeyImpl(key, value, last_key_, delta_value, skip_delta_encoding,
  143. buffer_.size());
  144. if (use_delta_encoding_) {
  145. // Update state
  146. // We used to just copy the changed data, but it appears to be
  147. // faster to just copy the whole thing.
  148. last_key_.assign(key.data(), key.size());
  149. }
  150. }
  151. void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
  152. const Slice& last_key_param,
  153. const Slice* const delta_value,
  154. bool skip_delta_encoding) {
  155. // Ensure no unsafe mixing of Add and AddWithLastKey
  156. assert(last_key_.empty());
  157. #ifndef NDEBUG
  158. add_with_last_key_called_ = false;
  159. #endif
  160. // Here we make sure to use an empty `last_key` on first call after creation
  161. // or Reset. This is more convenient for the caller and we can be more
  162. // clever inside BlockBuilder. On this hot code path, we want to avoid
  163. // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a
  164. // fast arithmetic operation instead, with an assertion to be sure our logic
  165. // is sound.
  166. size_t buffer_size = buffer_.size();
  167. size_t last_key_size = last_key_param.size();
  168. assert(buffer_size == 0 || buffer_size >= last_key_size - strip_ts_sz_);
  169. Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0));
  170. AddWithLastKeyImpl(key, value, last_key, delta_value, skip_delta_encoding,
  171. buffer_size);
  172. }
  173. inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
  174. const Slice& value,
  175. const Slice& last_key,
  176. const Slice* const delta_value,
  177. bool skip_delta_encoding,
  178. size_t buffer_size) {
  179. assert(!finished_);
  180. assert(counter_ <= block_restart_interval_);
  181. std::string key_buf;
  182. std::string last_key_buf;
  183. const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key);
  184. // For delta key encoding, the first key in each restart interval doesn't have
  185. // a last key to share bytes with.
  186. const Slice last_key_persisted =
  187. last_key.size() == 0
  188. ? last_key
  189. : MaybeStripTimestampFromKey(&last_key_buf, last_key);
  190. size_t shared = 0; // number of bytes shared with prev key
  191. if (counter_ >= block_restart_interval_) {
  192. // Restart compression
  193. restarts_.push_back(static_cast<uint32_t>(buffer_size));
  194. estimate_ += sizeof(uint32_t);
  195. counter_ = 0;
  196. } else if (use_delta_encoding_ && !skip_delta_encoding) {
  197. // See how much sharing to do with previous string
  198. shared = key_to_persist.difference_offset(last_key_persisted);
  199. }
  200. const size_t non_shared = key_to_persist.size() - shared;
  201. if (use_value_delta_encoding_) {
  202. // Add "<shared><non_shared>" to buffer_
  203. PutVarint32Varint32(&buffer_, static_cast<uint32_t>(shared),
  204. static_cast<uint32_t>(non_shared));
  205. } else {
  206. // Add "<shared><non_shared><value_size>" to buffer_
  207. PutVarint32Varint32Varint32(&buffer_, static_cast<uint32_t>(shared),
  208. static_cast<uint32_t>(non_shared),
  209. static_cast<uint32_t>(value.size()));
  210. }
  211. // Add string delta to buffer_ followed by value
  212. buffer_.append(key_to_persist.data() + shared, non_shared);
  213. // Use value delta encoding only when the key has shared bytes. This would
  214. // simplify the decoding, where it can figure which decoding to use simply by
  215. // looking at the shared bytes size.
  216. if (shared != 0 && use_value_delta_encoding_) {
  217. assert(delta_value != nullptr);
  218. buffer_.append(delta_value->data(), delta_value->size());
  219. } else {
  220. buffer_.append(value.data(), value.size());
  221. }
  222. // TODO(yuzhangyu): make user defined timestamp work with block hash index.
  223. if (data_block_hash_index_builder_.Valid()) {
  224. // Only data blocks should be using `kDataBlockBinaryAndHash` index type.
  225. // And data blocks should always be built with internal keys instead of
  226. // user keys.
  227. assert(!is_user_key_);
  228. data_block_hash_index_builder_.Add(ExtractUserKey(key),
  229. restarts_.size() - 1);
  230. }
  231. counter_++;
  232. estimate_ += buffer_.size() - buffer_size;
  233. }
  234. const Slice BlockBuilder::MaybeStripTimestampFromKey(std::string* key_buf,
  235. const Slice& key) {
  236. Slice stripped_key = key;
  237. if (strip_ts_sz_ > 0) {
  238. if (is_user_key_) {
  239. stripped_key.remove_suffix(strip_ts_sz_);
  240. } else {
  241. StripTimestampFromInternalKey(key_buf, key, strip_ts_sz_);
  242. stripped_key = *key_buf;
  243. }
  244. }
  245. return stripped_key;
  246. }
  247. } // namespace ROCKSDB_NAMESPACE