- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
- // This source code is licensed under both the GPLv2 (found in the
- // COPYING file in the root directory) and Apache 2.0 License
- // (found in the LICENSE.Apache file in the root directory).
- //
- // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file. See the AUTHORS file for names of contributors.
- //
- // Decodes the blocks generated by block_builder.cc.
- #include "table/block_based/block.h"
- #include <algorithm>
- #include <string>
- #include <unordered_map>
- #include <vector>
- #include "monitoring/perf_context_imp.h"
- #include "port/port.h"
- #include "port/stack_trace.h"
- #include "rocksdb/comparator.h"
- #include "table/block_based/block_prefix_index.h"
- #include "table/block_based/data_block_footer.h"
- #include "table/format.h"
- #include "util/coding.h"
- namespace ROCKSDB_NAMESPACE {
- // Helper routine: decode the next block entry starting at "p",
- // storing the number of shared key bytes, non_shared key bytes,
- // and the length of the value in "*shared", "*non_shared", and
- // "*value_length", respectively. Will not dereference past "limit".
- //
- // If any errors are detected, returns nullptr. Otherwise, returns a
- // pointer to the key delta (just past the three decoded values).
- struct DecodeEntry {
- inline const char* operator()(const char* p, const char* limit,
- uint32_t* shared, uint32_t* non_shared,
- uint32_t* value_length) {
- // We need 2 bytes for shared and non_shared size. We also need one more
- // byte either for value size or the actual value in case of value delta
- // encoding.
- assert(limit - p >= 3);
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
- *value_length = reinterpret_cast<const unsigned char*>(p)[2];
- if ((*shared | *non_shared | *value_length) < 128) {
- // Fast path: all three values are encoded in one byte each
- p += 3;
- } else {
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
- return nullptr;
- }
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
- return nullptr;
- }
- if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
- return nullptr;
- }
- }
- // Using an assert in place of "return null" since we should not pay the
- // cost of checking for corruption on every single key decoding
- assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
- return p;
- }
- };
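- // Illustrative sketch (not part of the build): decoding a hand-built entry
- // with DecodeEntry. The buffer contents below are assumptions chosen for
- // illustration; the layout is [shared | non_shared | value_length | key
- // delta | value].
- //
- //   const char buf[] = {2, 2, 1, 'e', 'x', 'v'}; // shared=2, non_shared=2,
- //                                                // value_length=1
- //   uint32_t shared, non_shared, value_length;
- //   const char* delta = DecodeEntry()(buf, buf + sizeof(buf), &shared,
- //                                     &non_shared, &value_length);
- //   // All three sizes are < 128, so the fast path consumes exactly 3 bytes:
- //   // `delta` points at the non-shared key bytes "ex", and the 1-byte value
- //   // "v" follows them.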
- // Helper routine: similar to DecodeEntry but does not have assertions.
- // Instead, returns nullptr so that caller can detect and report failure.
- struct CheckAndDecodeEntry {
- inline const char* operator()(const char* p, const char* limit,
- uint32_t* shared, uint32_t* non_shared,
- uint32_t* value_length) {
- // We need 2 bytes for shared and non_shared size. We also need one more
- // byte either for value size or the actual value in case of value delta
- // encoding.
- if (limit - p < 3) {
- return nullptr;
- }
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
- *value_length = reinterpret_cast<const unsigned char*>(p)[2];
- if ((*shared | *non_shared | *value_length) < 128) {
- // Fast path: all three values are encoded in one byte each
- p += 3;
- } else {
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
- return nullptr;
- }
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
- return nullptr;
- }
- if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
- return nullptr;
- }
- }
- if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
- return nullptr;
- }
- return p;
- }
- };
- struct DecodeKey {
- inline const char* operator()(const char* p, const char* limit,
- uint32_t* shared, uint32_t* non_shared) {
- uint32_t value_length;
- return DecodeEntry()(p, limit, shared, non_shared, &value_length);
- }
- };
- // In format_version 4, which is used by index blocks, the value size is not
- // encoded before the entry, as the value is known to be the handle with the
- // known size.
- struct DecodeKeyV4 {
- inline const char* operator()(const char* p, const char* limit,
- uint32_t* shared, uint32_t* non_shared) {
- // We need 2 bytes for shared and non_shared size. We also need one more
- // byte either for value size or the actual value in case of value delta
- // encoding.
- if (limit - p < 3) {
- return nullptr;
- }
- *shared = reinterpret_cast<const unsigned char*>(p)[0];
- *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
- if ((*shared | *non_shared) < 128) {
- // Fast path: both values are encoded in one byte each
- p += 2;
- } else {
- if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
- return nullptr;
- }
- if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
- return nullptr;
- }
- }
- return p;
- }
- };
- struct DecodeEntryV4 {
- inline const char* operator()(const char* p, const char* limit,
- uint32_t* shared, uint32_t* non_shared,
- uint32_t* value_length) {
- assert(value_length);
- *value_length = 0;
- return DecodeKeyV4()(p, limit, shared, non_shared);
- }
- };
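- // A minimal sketch of the format_version 4 entry layout the two decoders
- // above expect (buffer contents are illustrative assumptions): [shared |
- // non_shared | key delta | value], with no value_length field because index
- // values are handles of known (or delta-encoded) size.
- //
- //   const char buf[] = {0, 2, 'k', '1'}; // shared=0, non_shared=2
- //   uint32_t shared, non_shared;
- //   const char* delta = DecodeKeyV4()(buf, buf + sizeof(buf), &shared,
- //                                     &non_shared);
- //   // The fast path consumes 2 bytes; `delta` points at the key bytes "k1",
- //   // and the value begins immediately after them.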
- void DataBlockIter::NextImpl() {
- #ifndef NDEBUG
- if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
- return;
- }
- #endif
- bool is_shared = false;
- ParseNextDataKey(&is_shared);
- ++cur_entry_idx_;
- }
- void MetaBlockIter::NextImpl() {
- bool is_shared = false;
- ParseNextKey<CheckAndDecodeEntry>(&is_shared);
- ++cur_entry_idx_;
- }
- void IndexBlockIter::NextImpl() {
- ParseNextIndexKey();
- ++cur_entry_idx_;
- }
- void IndexBlockIter::PrevImpl() {
- assert(Valid());
- // Scan backwards to a restart point before current_
- const uint32_t original = current_;
- while (GetRestartPoint(restart_index_) >= original) {
- if (restart_index_ == 0) {
- // No more entries
- current_ = restarts_;
- restart_index_ = num_restarts_;
- return;
- }
- restart_index_--;
- }
- SeekToRestartPoint(restart_index_);
- // Loop until end of current entry hits the start of original entry
- while (ParseNextIndexKey() && NextEntryOffset() < original) {
- }
- --cur_entry_idx_;
- }
- void MetaBlockIter::PrevImpl() {
- assert(Valid());
- // Scan backwards to a restart point before current_
- const uint32_t original = current_;
- while (GetRestartPoint(restart_index_) >= original) {
- if (restart_index_ == 0) {
- // No more entries
- current_ = restarts_;
- restart_index_ = num_restarts_;
- return;
- }
- restart_index_--;
- }
- SeekToRestartPoint(restart_index_);
- bool is_shared = false;
- // Loop until end of current entry hits the start of original entry
- while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
- NextEntryOffset() < original) {
- }
- --cur_entry_idx_;
- }
- // Similar to IndexBlockIter::PrevImpl but also caches the prev entries
- void DataBlockIter::PrevImpl() {
- assert(Valid());
- assert(prev_entries_idx_ == -1 ||
- static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
- --cur_entry_idx_;
- // Check if we can use cached prev_entries_
- if (prev_entries_idx_ > 0 &&
- prev_entries_[prev_entries_idx_].offset == current_) {
- // Read cached CachedPrevEntry
- prev_entries_idx_--;
- const CachedPrevEntry& current_prev_entry =
- prev_entries_[prev_entries_idx_];
- const char* key_ptr = nullptr;
- bool raw_key_cached;
- if (current_prev_entry.key_ptr != nullptr) {
- // The key is not delta encoded and stored in the data block
- key_ptr = current_prev_entry.key_ptr;
- raw_key_cached = false;
- } else {
- // The key is delta encoded and stored in prev_entries_keys_buff_
- key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
- raw_key_cached = true;
- }
- const Slice current_key(key_ptr, current_prev_entry.key_size);
- current_ = current_prev_entry.offset;
- // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
- // not necessity. It is convenient since this class treats keys as pinned
- // when `raw_key_` points to an outside buffer. So we cannot allow
- // `raw_key_` point into Prev cache as it is a transient outside buffer
- // (i.e., keys in it are not actually pinned).
- raw_key_.SetKey(current_key, raw_key_cached /* copy */);
- value_ = current_prev_entry.value;
- return;
- }
- // Clear prev entries cache
- prev_entries_idx_ = -1;
- prev_entries_.clear();
- prev_entries_keys_buff_.clear();
- // Scan backwards to a restart point before current_
- const uint32_t original = current_;
- while (GetRestartPoint(restart_index_) >= original) {
- if (restart_index_ == 0) {
- // No more entries
- current_ = restarts_;
- restart_index_ = num_restarts_;
- return;
- }
- restart_index_--;
- }
- SeekToRestartPoint(restart_index_);
- do {
- bool is_shared = false;
- if (!ParseNextDataKey(&is_shared)) {
- break;
- }
- Slice current_key = raw_key_.GetKey();
- if (raw_key_.IsKeyPinned()) {
- // The key is not delta encoded
- prev_entries_.emplace_back(current_, current_key.data(), 0,
- current_key.size(), value());
- } else {
- // The key is delta encoded, cache decoded key in buffer
- size_t new_key_offset = prev_entries_keys_buff_.size();
- prev_entries_keys_buff_.append(current_key.data(), current_key.size());
- prev_entries_.emplace_back(current_, nullptr, new_key_offset,
- current_key.size(), value());
- }
- // Loop until end of current entry hits the start of original entry
- } while (NextEntryOffset() < original);
- prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
- }
- void DataBlockIter::SeekImpl(const Slice& target) {
- Slice seek_key = target;
- PERF_TIMER_GUARD(block_seek_nanos);
- if (data_ == nullptr) { // Not init yet
- return;
- }
- uint32_t index = 0;
- bool skip_linear_scan = false;
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
- if (!ok) {
- return;
- }
- FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
- }
- void MetaBlockIter::SeekImpl(const Slice& target) {
- Slice seek_key = target;
- PERF_TIMER_GUARD(block_seek_nanos);
- if (data_ == nullptr) { // Not init yet
- return;
- }
- uint32_t index = 0;
- bool skip_linear_scan = false;
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
- if (!ok) {
- return;
- }
- FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
- }
- // Optimized Seek for point lookup for an internal key `target`
- // target = "seek_user_key @ type | seqno".
- //
- // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
- // kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
- // kTypeMerge, this function behaves identically to Seek().
- //
- // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
- // kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
- // kTypeMerge:
- //
- // If the return value is FALSE, iter location is undefined, and it means:
- // 1) there is no key in this block falling into the range:
- // ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
- // inclusive; AND
- // 2) the last key of this block has a greater user_key than seek_user_key
- //
- // If the return value is TRUE, iter location has two possibilities:
- // 1) If iter is valid, it is set to a location as if set by SeekImpl(target).
- // In this case, it points to the first key with a larger user_key or a
- // matching user_key with a seqno no greater than the seeking seqno.
- // 2) If the iter is invalid, it means that either all the user_keys are less
- // than the seek_user_key, or the block ends with a matching user_key but
- // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
- // but larger type).
- bool DataBlockIter::SeekForGetImpl(const Slice& target) {
- Slice target_user_key = ExtractUserKey(target);
- uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
- uint8_t entry =
- data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
- if (entry == kCollision) {
- // HashSeek not effective, falling back
- SeekImpl(target);
- return true;
- }
- if (entry == kNoEntry) {
- // Even if we cannot find the user_key in this block, the result may
- // exist in the next block. Consider this example:
- //
- // Block N: [aab@100, ... , app@120]
- // boundary key: axy@50 (we make minimal assumption about a boundary key)
- // Block N+1: [axy@10, ... ]
- //
- // If seek_key = axy@60, the search will start from Block N.
- // Even if the user_key is not found in the hash map, the caller still
- // has to continue searching the next block.
- //
- // In this case, we pretend the key is in the last restart interval.
- // The while-loop below will search the last restart interval for the
- // key. It will stop at the first key that is larger than the seek_key,
- // or to the end of the block if none is larger.
- entry = static_cast<uint8_t>(num_restarts_ - 1);
- }
- uint32_t restart_index = entry;
- // check if the key is in the restart_interval
- assert(restart_index < num_restarts_);
- SeekToRestartPoint(restart_index);
- current_ = GetRestartPoint(restart_index);
- cur_entry_idx_ =
- static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
- uint32_t limit = restarts_;
- if (restart_index + 1 < num_restarts_) {
- limit = GetRestartPoint(restart_index + 1);
- }
- while (current_ < limit) {
- ++cur_entry_idx_;
- bool shared;
- // Here we only linear seek the target key inside the restart interval.
- // If a key does not exist inside a restart interval, we avoid
- // further searching the block content across restart interval boundary.
- //
- // TODO(fwu): check the left and right boundary of the restart interval
- // to avoid linear seek a target key that is out of range.
- if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) {
- // we stop at the first potential matching user key.
- break;
- }
- // If the loop exits due to CompareCurrentKey(target) >= 0, then current key
- // exists, and its checksum verification will be done in UpdateKey() called
- // in SeekForGet().
- // TODO(cbi): If this loop exits with current_ == restart_, per key-value
- // checksum will not be verified in UpdateKey() since Valid()
- // will return false.
- }
- if (current_ == restarts_) {
- // The search reached the end of the block. There are three possibilities:
- // 1) there is only one user_key match in the block (otherwise collision).
- // the matching user_key resides in the last restart interval, and it
- // is the last key of the restart interval and of the block as well.
- // ParseNextKey() skipped it as its [ type | seqno ] is smaller.
- //
- // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
- // AND all existing user_keys in the restart interval are smaller than
- // seek_user_key.
- //
- // 3) The seek_key is a false positive and happens to be hashed to the
- // last restart interval, AND all existing user_keys in the restart
- // interval are smaller than seek_user_key.
- //
- // The result may exist in the next block in each case, so we return true.
- return true;
- }
- if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
- target_user_key) != 0) {
- // the key is not in this block and cannot be at the next block either.
- return false;
- }
- // Here we are conservative and only support a limited set of cases
- ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
- if (value_type != ValueType::kTypeValue &&
- value_type != ValueType::kTypeDeletion &&
- value_type != ValueType::kTypeMerge &&
- value_type != ValueType::kTypeSingleDeletion &&
- value_type != ValueType::kTypeBlobIndex &&
- value_type != ValueType::kTypeWideColumnEntity &&
- value_type != ValueType::kTypeValuePreferredSeqno) {
- SeekImpl(target);
- }
- // Result found, and the iter is correctly set.
- return true;
- }
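- // A hedged usage sketch of the contract above (caller-side handling is
- // simplified; the real caller is the point-lookup path in the table reader):
- //
- //   bool may_exist = iter->SeekForGet(target);
- //   if (!may_exist) {
- //     // Key is in neither this block nor any later block; stop the lookup.
- //   } else if (iter->Valid()) {
- //     // Positioned as if by Seek(target); verify the user key and type.
- //   } else {
- //     // Not found here, but the search must continue in the next block.
- //   }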
- void IndexBlockIter::SeekImpl(const Slice& target) {
- #ifndef NDEBUG
- if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) {
- return;
- }
- #endif
- TEST_SYNC_POINT("IndexBlockIter::Seek:0");
- PERF_TIMER_GUARD(block_seek_nanos);
- if (data_ == nullptr) { // Not init yet
- return;
- }
- Slice seek_key = target;
- if (raw_key_.IsUserKey()) {
- seek_key = ExtractUserKey(target);
- }
- status_ = Status::OK();
- uint32_t index = 0;
- bool skip_linear_scan = false;
- bool ok = false;
- if (prefix_index_) {
- bool prefix_may_exist = true;
- ok = PrefixSeek(target, &index, &prefix_may_exist);
- if (!prefix_may_exist) {
- // This lets the caller distinguish between a non-existing prefix and a
- // key that is larger than the last key, both of which set Valid() to
- // false.
- current_ = restarts_;
- status_ = Status::NotFound();
- }
- // restart interval must be one when hash search is enabled so the binary
- // search simply lands at the right place.
- skip_linear_scan = true;
- } else if (value_delta_encoded_) {
- ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
- } else {
- ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
- }
- if (!ok) {
- return;
- }
- FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
- }
- void DataBlockIter::SeekForPrevImpl(const Slice& target) {
- PERF_TIMER_GUARD(block_seek_nanos);
- Slice seek_key = target;
- if (data_ == nullptr) { // Not init yet
- return;
- }
- uint32_t index = 0;
- bool skip_linear_scan = false;
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
- if (!ok) {
- return;
- }
- FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
- if (!Valid()) {
- if (status_.ok()) {
- SeekToLastImpl();
- }
- } else {
- while (Valid() && CompareCurrentKey(seek_key) > 0) {
- PrevImpl();
- }
- }
- }
- void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
- PERF_TIMER_GUARD(block_seek_nanos);
- Slice seek_key = target;
- if (data_ == nullptr) { // Not init yet
- return;
- }
- uint32_t index = 0;
- bool skip_linear_scan = false;
- bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
- if (!ok) {
- return;
- }
- FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
- if (!Valid()) {
- if (status_.ok()) {
- SeekToLastImpl();
- }
- } else {
- while (Valid() && CompareCurrentKey(seek_key) > 0) {
- PrevImpl();
- }
- }
- }
- void DataBlockIter::SeekToFirstImpl() {
- if (data_ == nullptr) { // Not init yet
- return;
- }
- SeekToRestartPoint(0);
- bool is_shared = false;
- ParseNextDataKey(&is_shared);
- cur_entry_idx_ = 0;
- }
- void MetaBlockIter::SeekToFirstImpl() {
- if (data_ == nullptr) { // Not init yet
- return;
- }
- SeekToRestartPoint(0);
- bool is_shared = false;
- ParseNextKey<CheckAndDecodeEntry>(&is_shared);
- cur_entry_idx_ = 0;
- }
- void IndexBlockIter::SeekToFirstImpl() {
- #ifndef NDEBUG
- if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) {
- return;
- }
- #endif
- if (data_ == nullptr) { // Not init yet
- return;
- }
- status_ = Status::OK();
- SeekToRestartPoint(0);
- ParseNextIndexKey();
- cur_entry_idx_ = 0;
- }
- void DataBlockIter::SeekToLastImpl() {
- if (data_ == nullptr) { // Not init yet
- return;
- }
- SeekToRestartPoint(num_restarts_ - 1);
- bool is_shared = false;
- cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
- while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
- // Keep skipping
- ++cur_entry_idx_;
- }
- }
- void MetaBlockIter::SeekToLastImpl() {
- if (data_ == nullptr) { // Not init yet
- return;
- }
- SeekToRestartPoint(num_restarts_ - 1);
- bool is_shared = false;
- assert(num_restarts_ >= 1);
- cur_entry_idx_ =
- static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
- while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
- NextEntryOffset() < restarts_) {
- // Will probably never reach here since restart_interval is always 1
- ++cur_entry_idx_;
- }
- }
- void IndexBlockIter::SeekToLastImpl() {
- if (data_ == nullptr) { // Not init yet
- return;
- }
- status_ = Status::OK();
- SeekToRestartPoint(num_restarts_ - 1);
- cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
- while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
- ++cur_entry_idx_;
- }
- }
- template <class TValue>
- template <typename DecodeEntryFunc>
- bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
- current_ = NextEntryOffset();
- const char* p = data_ + current_;
- const char* limit = data_ + restarts_; // Restarts come right after data
- if (p >= limit) {
- // No more entries to return. Mark as invalid.
- current_ = restarts_;
- restart_index_ = num_restarts_;
- return false;
- }
- // Decode next entry
- uint32_t shared, non_shared, value_length;
- p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
- if (p == nullptr || raw_key_.Size() < shared) {
- CorruptionError();
- return false;
- } else {
- if (shared == 0) {
- *is_shared = false;
- // If this key doesn't share any bytes with prev key, and no min timestamp
- // needs to be padded to the key, then we don't need to decode it and
- // can use its address in the block directly (no copy).
- UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
- } else {
- // This key share `shared` bytes with prev key, we need to decode it
- *is_shared = true;
- // If the user-defined timestamp is stripped from the user key before keys
- // are delta encoded, the decoded key consisting of the shared and non
- // shared bytes does not have the user-defined timestamp yet. We need to
- // pad the min timestamp to it.
- if (pad_min_timestamp_) {
- raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
- } else {
- raw_key_.TrimAppend(shared, p, non_shared);
- }
- }
- value_ = Slice(p + non_shared, value_length);
- if (shared == 0) {
- while (restart_index_ + 1 < num_restarts_ &&
- GetRestartPoint(restart_index_ + 1) < current_) {
- ++restart_index_;
- }
- }
- // else we are in the middle of a restart interval and the restart_index_
- // thus has not changed
- return true;
- }
- }
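- // A small sketch of the prefix-compression reconstruction above (keys are
- // illustrative assumptions): if the previous key was "apple" and the entry
- // encodes shared=2, non_shared=2 with key delta "ex", then
- // raw_key_.TrimAppend(2, p, 2) rebuilds the key as "apex": the first 2
- // bytes are reused from the previous key and the 2 delta bytes are
- // appended.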
- bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
- if (ParseNextKey<DecodeEntry>(is_shared)) {
- #ifndef NDEBUG
- if (global_seqno_ != kDisableGlobalSequenceNumber) {
- // If we are reading a file with a global sequence number we should
- // expect that all encoded sequence numbers are zeros and any value
- // type is kTypeValue, kTypeMerge, kTypeDeletion,
- // kTypeDeletionWithTimestamp, kTypeRangeDeletion, or
- // kTypeWideColumnEntity.
- uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
- SequenceNumber seqno;
- ValueType value_type;
- UnPackSequenceAndType(packed, &seqno, &value_type);
- assert(value_type == ValueType::kTypeValue ||
- value_type == ValueType::kTypeMerge ||
- value_type == ValueType::kTypeDeletion ||
- value_type == ValueType::kTypeDeletionWithTimestamp ||
- value_type == ValueType::kTypeRangeDeletion ||
- value_type == ValueType::kTypeWideColumnEntity);
- assert(seqno == 0);
- }
- #endif // NDEBUG
- return true;
- } else {
- return false;
- }
- }
- bool IndexBlockIter::ParseNextIndexKey() {
- bool is_shared = false;
- bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
- : ParseNextKey<DecodeEntry>(&is_shared);
- if (ok) {
- if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
- pad_min_timestamp_) {
- DecodeCurrentValue(is_shared);
- }
- }
- return ok;
- }
- // The format:
- // restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
- // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
- // ...
- // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
- // where, k is key, v is value, and its encoding is in parentheses.
- // The format of each key is (shared_size, non_shared_size, shared, non_shared)
- // The format of each value, i.e., block handle, is (offset, size) whenever
- // is_shared is false, which includes the first entry in each restart point.
- // Otherwise, the format is delta-size = the size of the current block minus
- // the size of the last block.
- void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
- Slice v(value_.data(), data_ + restarts_ - value_.data());
- // Delta encoding is used if `shared` != 0.
- Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
- &v, have_first_key_,
- (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
- assert(decode_s.ok());
- value_ = Slice(value_.data(), v.data() - value_.data());
- if (global_seqno_state_ != nullptr) {
- // Overwrite sequence number the same way as in DataBlockIter.
- IterKey& first_internal_key = global_seqno_state_->first_internal_key;
- first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
- /* copy */ true);
- assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
- ValueType value_type = ExtractValueType(first_internal_key.GetKey());
- assert(value_type == ValueType::kTypeValue ||
- value_type == ValueType::kTypeMerge ||
- value_type == ValueType::kTypeDeletion ||
- value_type == ValueType::kTypeRangeDeletion ||
- value_type == ValueType::kTypeWideColumnEntity);
- first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
- value_type);
- decoded_value_.first_internal_key = first_internal_key.GetKey();
- }
- if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) {
- first_internal_key_with_ts_.clear();
- PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_,
- decoded_value_.first_internal_key, ts_sz_);
- decoded_value_.first_internal_key = first_internal_key_with_ts_;
- }
- }
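- // Worked example of the delta encoding described above (numbers are
- // illustrative): if entry i stores the full handle (offset=0, size=4096)
- // and block i+1 has size 4100, entry i+1 stores only delta-size =
- // 4100 - 4096 = 4. The reader reconstructs offset i+1 from offset i,
- // size i, and the fixed block trailer size, so only the size difference is
- // materialized per entry.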
- template <class TValue>
- void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
- uint32_t index,
- bool skip_linear_scan) {
- // SeekToRestartPoint() only does the lookup in the restart block. We need
- // to follow it up with NextImpl() to position the iterator at the restart
- // key.
- SeekToRestartPoint(index);
- cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
- NextImpl();
- if (!skip_linear_scan) {
- // Linear search (within restart block) for first key >= target
- uint32_t max_offset;
- if (index + 1 < num_restarts_) {
- // We are in a non-last restart interval. Since `BinarySeek()` guarantees
- // the next restart key is strictly greater than `target`, we can
- // terminate upon reaching it without any additional key comparison.
- max_offset = GetRestartPoint(index + 1);
- } else {
- // We are in the last restart interval. The while-loop will terminate by
- // `Valid()` returning false upon advancing past the block's last key.
- max_offset = std::numeric_limits<uint32_t>::max();
- }
- while (true) {
- NextImpl();
- if (!Valid()) {
- // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
- // since Valid() will return false.
- break;
- }
- if (current_ == max_offset) {
- assert(CompareCurrentKey(target) > 0);
- break;
- } else if (CompareCurrentKey(target) >= 0) {
- break;
- }
- }
- }
- }
- // Binary searches in restart array to find the starting restart point for the
- // linear scan, and stores it in `*index`. Assumes restart array does not
- // contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
- // is strictly greater than `target` or does not exist (this can be used to
- // elide a comparison when linear scan reaches all the way to the next restart
- // key). Furthermore, `*skip_linear_scan` is set to indicate whether the
- // `*index`th restart key is the final result so that key does not need to be
- // compared again later.
- template <class TValue>
- template <typename DecodeKeyFunc>
- bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
- bool* skip_linear_scan) {
- if (restarts_ == 0) {
- // SST files dedicated to range tombstones are written with index blocks
- // that have no keys while also having `num_restarts_ == 1`. This would
- // cause a problem for `BinarySeek()` as it'd try to access the first key
- // which does not exist. We identify such blocks by the offset at which
- // their restarts are stored, and return false to prevent any attempted
- // key accesses.
- return false;
- }
- *skip_linear_scan = false;
- // Loop invariants:
- // - Restart key at index `left` is less than or equal to the target key. The
- // sentinel index `-1` is considered to have a key that is less than all
- // keys.
- // - Any restart keys after index `right` are strictly greater than the target
- // key.
- int64_t left = -1, right = num_restarts_ - 1;
- while (left != right) {
- // The `mid` is computed by rounding up so it lands in (`left`, `right`].
- int64_t mid = left + (right - left + 1) / 2;
- uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
- uint32_t shared, non_shared;
- const char* key_ptr = DecodeKeyFunc()(
- data_ + region_offset, data_ + restarts_, &shared, &non_shared);
- if (key_ptr == nullptr || (shared != 0)) {
- CorruptionError();
- return false;
- }
- Slice mid_key(key_ptr, non_shared);
- UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
- int cmp = CompareCurrentKey(target);
- if (cmp < 0) {
- // Key at "mid" is smaller than "target". Therefore all
- // blocks before "mid" are uninteresting.
- left = mid;
- } else if (cmp > 0) {
- // Key at "mid" is >= "target". Therefore all blocks at or
- // after "mid" are uninteresting.
- right = mid - 1;
- } else {
- *skip_linear_scan = true;
- left = right = mid;
- }
- }
- if (left == -1) {
- // All keys in the block were strictly greater than `target`. So the very
- // first key in the block is the final seek result.
- *skip_linear_scan = true;
- *index = 0;
- } else {
- *index = static_cast<uint32_t>(left);
- }
- return true;
- }
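- // A compact trace of the search above on an assumed restart array with keys
- // ["b", "d", "f"] and target "e": `left` starts at -1 (the sentinel smaller
- // than all keys) and `right` at 2. First iteration: mid=1, key "d" < "e",
- // so left=1. Second iteration: mid=2, key "f" > "e", so right=1. The loop
- // exits with *index=1 and *skip_linear_scan=false, and the caller linearly
- // scans from restart key "d" knowing the next restart key "f" is strictly
- // greater.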
- // Compare target key and the block key of the block of `block_index`.
- // Return -1 if error.
- int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
- uint32_t region_offset = GetRestartPoint(block_index);
- uint32_t shared, non_shared;
- const char* key_ptr =
- value_delta_encoded_
- ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
- &non_shared)
- : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
- &non_shared);
- if (key_ptr == nullptr || (shared != 0)) {
- CorruptionError();
- return 1; // Return target is smaller
- }
- Slice block_key(key_ptr, non_shared);
- UpdateRawKeyAndMaybePadMinTimestamp(block_key);
- return CompareCurrentKey(target);
- }
- // Binary search in block_ids to find the first block
- // with a key >= target
- bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
- uint32_t* block_ids, uint32_t left,
- uint32_t right, uint32_t* index,
- bool* prefix_may_exist) {
- assert(left <= right);
- assert(index);
- assert(prefix_may_exist);
- *prefix_may_exist = true;
- uint32_t left_bound = left;
- while (left <= right) {
- uint32_t mid = (right + left) / 2;
- int cmp = CompareBlockKey(block_ids[mid], target);
- if (!status_.ok()) {
- return false;
- }
- if (cmp < 0) {
- // Key at "target" is larger than "mid". Therefore all
- // blocks before or at "mid" are uninteresting.
- left = mid + 1;
- } else {
- // Key at "target" is <= "mid". Therefore all blocks
- // after "mid" are uninteresting.
- // If there is only one block left, we found it.
- if (left == right) {
- break;
- }
- right = mid;
- }
- }
- if (left == right) {
- // In one of the two following cases:
- // (1) left is the first one of block_ids, or
- // (2) there is a gap of blocks between the blocks of `left` and `left-1`,
- // we can further distinguish between the key being in the block and the
- // key not existing, by comparing the target key with the key of the
- // previous block to the left of the block found.
- if (block_ids[left] > 0 &&
- (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
- CompareBlockKey(block_ids[left] - 1, target) > 0) {
- current_ = restarts_;
- *prefix_may_exist = false;
- return false;
- }
- *index = block_ids[left];
- return true;
- } else {
- assert(left > right);
- // If the next block key is larger than seek key, it is possible that
- // no key shares the prefix with `target`, or all keys with the same
- // prefix as `target` are smaller than `target`. In the latter case,
- // we are mandated to set the position the same as the total order.
- // In that case, either:
- // (1) `target` falls into the range of the next block. In this case,
- // we can place the iterator to the next block, or
- // (2) `target` is larger than all block keys. In this case we can
- // keep the iterator invalidated without setting `prefix_may_exist`
- // to false.
- // We might sometimes end up with setting the total order position
- // while there is no key sharing the prefix as `target`, but it
- // still follows the contract.
- uint32_t right_index = block_ids[right];
- assert(right_index + 1 <= num_restarts_);
- if (right_index + 1 < num_restarts_) {
- if (CompareBlockKey(right_index + 1, target) >= 0) {
- *index = right_index + 1;
- return true;
- } else {
- // We have to set the flag here because we are not positioning
- // the iterator to the total order position.
- *prefix_may_exist = false;
- }
- }
- // Mark iterator invalid
- current_ = restarts_;
- return false;
- }
- }
- bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
- bool* prefix_may_exist) {
- assert(index);
- assert(prefix_may_exist);
- assert(prefix_index_);
- *prefix_may_exist = true;
- Slice seek_key = target;
- if (raw_key_.IsUserKey()) {
- seek_key = ExtractUserKey(target);
- }
- uint32_t* block_ids = nullptr;
- uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
- if (num_blocks == 0) {
- current_ = restarts_;
- *prefix_may_exist = false;
- return false;
- } else {
- assert(block_ids);
- return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
- prefix_may_exist);
- }
- }
- uint32_t Block::NumRestarts() const {
- assert(size() >= 2 * sizeof(uint32_t));
- uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
- uint32_t num_restarts = block_footer;
- if (size() > kMaxBlockSizeSupportedByHashIndex) {
- // In BlockBuilder, we have ensured a block with HashIndex is less than
- // kMaxBlockSizeSupportedByHashIndex (64KiB).
- //
- // Therefore, if we encounter a block with a size > 64KiB, the block
- // cannot have a HashIndex, so the footer is directly interpreted as
- // num_restarts.
- //
- // This check is for backward compatibility: it ensures that a legacy block
- // with a very large num_restarts (i.e. >= 0x80000000) is interpreted
- // correctly as having no HashIndex, even though the MSB of num_restarts is
- // set.
- return num_restarts;
- }
- BlockBasedTableOptions::DataBlockIndexType index_type;
- UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
- return num_restarts;
- }
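- // A minimal sketch of the footer unpacking assumed above: the last 4 bytes
- // of the block pack (index_type, num_restarts), with the MSB flagging the
- // index type. The bit split shown is an assumption for illustration;
- // UnPackIndexTypeAndNumRestarts() is the authoritative decoder.
- //
- //   uint32_t footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
- //   bool has_hash_index = (footer >> 31) != 0;  // MSB set => hash index
- //   uint32_t restarts = footer & 0x7fffffffu;   // low 31 bits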
- BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
- assert(size() >= 2 * sizeof(uint32_t));
- if (size() > kMaxBlockSizeSupportedByHashIndex) {
- // The check is for the same reason as that in NumRestarts()
- return BlockBasedTableOptions::kDataBlockBinarySearch;
- }
- uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
- uint32_t num_restarts = block_footer;
- BlockBasedTableOptions::DataBlockIndexType index_type;
- UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
- return index_type;
- }
- Block::~Block() {
- // This sync point can be re-enabled if RocksDB can control the
- // initialization order of any/all static options created by the user.
- // TEST_SYNC_POINT("Block::~Block");
- delete[] kv_checksum_;
- }
- Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
- Statistics* statistics)
- : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
- TEST_SYNC_POINT("Block::Block:0");
- auto& size = contents_.data.size_;
- if (size < sizeof(uint32_t)) {
- size = 0; // Error marker
- } else {
- // Should only decode restart points for uncompressed blocks
- num_restarts_ = NumRestarts();
- switch (IndexType()) {
- case BlockBasedTableOptions::kDataBlockBinarySearch:
- restart_offset_ = static_cast<uint32_t>(size) -
- (1 + num_restarts_) * sizeof(uint32_t);
- if (restart_offset_ > size - sizeof(uint32_t)) {
- // The size is too small for NumRestarts() and therefore
- // restart_offset_ wrapped around.
- size = 0;
- }
- break;
- case BlockBasedTableOptions::kDataBlockBinaryAndHash:
- if (size < sizeof(uint32_t) /* block footer */ +
- sizeof(uint16_t) /* NUM_BUCK */) {
- size = 0;
- break;
- }
- uint16_t map_offset;
- data_block_hash_index_.Initialize(
- contents_.data.data(),
- /* chop off NUM_RESTARTS */
- static_cast<uint16_t>(size - sizeof(uint32_t)), &map_offset);
- restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
- if (restart_offset_ > map_offset) {
- // map_offset is too small for NumRestarts() and
- // therefore restart_offset_ wrapped around.
- size = 0;
- break;
- }
- break;
- default:
- size = 0; // Error marker
- }
- }
- if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
- read_amp_bitmap_.reset(new BlockReadAmpBitmap(
- restart_offset_, read_amp_bytes_per_bit, statistics));
- }
- }
- void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
- const Comparator* raw_ucmp) {
- protection_bytes_per_key_ = 0;
- if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
- // NewDataIterator() is called with protection_bytes_per_key_ = 0.
- // This is intended since checksum is not constructed yet.
- //
- // We do not know global_seqno yet, so checksum computation and
- // verification all assume global_seqno = 0.
- // TODO(yuzhangyu): handle the implication of padding timestamp for kv
- // protection.
- std::unique_ptr<DataBlockIter> iter{NewDataIterator(
- raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
- nullptr /* stats */, true /* block_contents_pinned */,
- true /* user_defined_timestamps_persisted */)};
- if (iter->status().ok()) {
- block_restart_interval_ = iter->GetRestartInterval();
- }
- uint32_t num_keys = 0;
- if (iter->status().ok()) {
- num_keys = iter->NumberOfKeys(block_restart_interval_);
- }
- if (iter->status().ok()) {
- checksum_size_ = num_keys * protection_bytes_per_key;
- kv_checksum_ = new char[(size_t)checksum_size_];
- size_t i = 0;
- iter->SeekToFirst();
- while (iter->Valid()) {
- GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
- iter->key(), iter->value());
- iter->Next();
- i += protection_bytes_per_key;
- }
- assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
- }
- if (!iter->status().ok()) {
- contents_.data.size_ = 0; // Error marker
- return;
- }
- protection_bytes_per_key_ = protection_bytes_per_key;
- }
- }
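- // Layout sketch for kv_checksum_ as built above: a flat array with one
- // fixed-width slot per key-value pair, in iteration order:
- //
- //   [ cksum(kv_0) | cksum(kv_1) | ... | cksum(kv_{n-1}) ]
- //
- // Each slot is protection_bytes_per_key bytes wide, so the checksum for
- // entry i lives at kv_checksum_ + i * protection_bytes_per_key.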
- void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
- const Comparator* raw_ucmp,
- bool value_is_full,
- bool index_has_first_key) {
- protection_bytes_per_key_ = 0;
- if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
- // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
- // do not impact how the index block is parsed. During checksum
- // construction/verification, we use the entire key buffer from
- // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
- // checksum, and the content of this buffer does not change for different
- // values of `global_seqno` or `key_includes_seq`.
- // TODO(yuzhangyu): handle the implication of padding timestamp for kv
- // protection.
- std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
- raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
- nullptr /* Statistics */, true /* total_order_seek */,
- index_has_first_key /* have_first_key */, false /* key_includes_seq */,
- value_is_full, true /* block_contents_pinned */,
- true /* user_defined_timestamps_persisted*/,
- nullptr /* prefix_index */)};
- if (iter->status().ok()) {
- block_restart_interval_ = iter->GetRestartInterval();
- }
- uint32_t num_keys = 0;
- if (iter->status().ok()) {
- num_keys = iter->NumberOfKeys(block_restart_interval_);
- }
- if (iter->status().ok()) {
- checksum_size_ = num_keys * protection_bytes_per_key;
- kv_checksum_ = new char[(size_t)checksum_size_];
- iter->SeekToFirst();
- size_t i = 0;
- while (iter->Valid()) {
- GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
- iter->key(), iter->raw_value());
- iter->Next();
- i += protection_bytes_per_key;
- }
- assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
- }
- if (!iter->status().ok()) {
- contents_.data.size_ = 0; // Error marker
- return;
- }
- protection_bytes_per_key_ = protection_bytes_per_key;
- }
- }
- void Block::InitializeMetaIndexBlockProtectionInfo(
- uint8_t protection_bytes_per_key) {
- protection_bytes_per_key_ = 0;
- if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
- std::unique_ptr<MetaBlockIter> iter{
- NewMetaIterator(true /* block_contents_pinned */)};
- if (iter->status().ok()) {
- block_restart_interval_ = iter->GetRestartInterval();
- }
- uint32_t num_keys = 0;
- if (iter->status().ok()) {
- num_keys = iter->NumberOfKeys(block_restart_interval_);
- }
- if (iter->status().ok()) {
- checksum_size_ = num_keys * protection_bytes_per_key;
- kv_checksum_ = new char[(size_t)checksum_size_];
- iter->SeekToFirst();
- size_t i = 0;
- while (iter->Valid()) {
- GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
- iter->key(), iter->value());
- iter->Next();
- i += protection_bytes_per_key;
- }
- assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
- }
- if (!iter->status().ok()) {
- contents_.data.size_ = 0; // Error marker
- return;
- }
- protection_bytes_per_key_ = protection_bytes_per_key;
- }
- }
- MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
- MetaBlockIter* iter = new MetaBlockIter();
- if (size() < 2 * sizeof(uint32_t)) {
- iter->Invalidate(Status::Corruption("bad block contents"));
- return iter;
- } else if (num_restarts_ == 0) {
- // Empty block.
- iter->Invalidate(Status::OK());
- } else {
- iter->Initialize(data(), restart_offset_, num_restarts_,
- block_contents_pinned, protection_bytes_per_key_,
- kv_checksum_, block_restart_interval_);
- }
- return iter;
- }
- DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
- SequenceNumber global_seqno,
- DataBlockIter* iter, Statistics* stats,
- bool block_contents_pinned,
- bool user_defined_timestamps_persisted) {
- DataBlockIter* ret_iter;
- if (iter != nullptr) {
- ret_iter = iter;
- } else {
- ret_iter = new DataBlockIter;
- }
- if (size() < 2 * sizeof(uint32_t)) {
- ret_iter->Invalidate(Status::Corruption("bad block contents"));
- return ret_iter;
- }
- if (num_restarts_ == 0) {
- // Empty block.
- ret_iter->Invalidate(Status::OK());
- return ret_iter;
- } else {
- ret_iter->Initialize(
- raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
- read_amp_bitmap_.get(), block_contents_pinned,
- user_defined_timestamps_persisted,
- data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
- protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
- if (read_amp_bitmap_) {
- if (read_amp_bitmap_->GetStatistics() != stats) {
- // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
- read_amp_bitmap_->SetStatistics(stats);
- }
- }
- }
- return ret_iter;
- }
- IndexBlockIter* Block::NewIndexIterator(
- const Comparator* raw_ucmp, SequenceNumber global_seqno,
- IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
- bool have_first_key, bool key_includes_seq, bool value_is_full,
- bool block_contents_pinned, bool user_defined_timestamps_persisted,
- BlockPrefixIndex* prefix_index) {
- IndexBlockIter* ret_iter;
- if (iter != nullptr) {
- ret_iter = iter;
- } else {
- ret_iter = new IndexBlockIter;
- }
- if (size() < 2 * sizeof(uint32_t)) {
- ret_iter->Invalidate(Status::Corruption("bad block contents"));
- return ret_iter;
- }
- if (num_restarts_ == 0) {
- // Empty block.
- ret_iter->Invalidate(Status::OK());
- return ret_iter;
- } else {
- BlockPrefixIndex* prefix_index_ptr =
- total_order_seek ? nullptr : prefix_index;
- ret_iter->Initialize(
- raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
- prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
- block_contents_pinned, user_defined_timestamps_persisted,
- protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
- }
- return ret_iter;
- }
- size_t Block::ApproximateMemoryUsage() const {
- size_t usage = usable_size();
- #ifdef ROCKSDB_MALLOC_USABLE_SIZE
- usage += malloc_usable_size((void*)this);
- #else
- usage += sizeof(*this);
- #endif // ROCKSDB_MALLOC_USABLE_SIZE
- if (read_amp_bitmap_) {
- usage += read_amp_bitmap_->ApproximateMemoryUsage();
- }
- usage += checksum_size_;
- return usage;
- }
- } // namespace ROCKSDB_NAMESPACE