// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
        return nullptr;
      }
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
        return nullptr;
      }
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }
    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};

// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
        return nullptr;
      }
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
        return nullptr;
      }
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }
    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};
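
// Decodes only the key lengths of an entry; the value length is decoded and
// then discarded. Used where only the key is needed, e.g. by BinarySeek().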
struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both lengths are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
        return nullptr;
      }
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
        return nullptr;
      }
    }
    return p;
  }
};
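
// Entry decoder for value-delta-encoded (format_version 4) index blocks: only
// the key lengths are stored, so the reported value length is always zero.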
struct DecodeEntryV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    assert(value_length);
    *value_length = 0;
    return DecodeKeyV4()(p, limit, shared, non_shared);
  }
};

void DataBlockIter::NextImpl() {
#ifndef NDEBUG
  if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
    return;
  }
#endif
  bool is_shared = false;
  ParseNextDataKey(&is_shared);
  ++cur_entry_idx_;
}

void MetaBlockIter::NextImpl() {
  bool is_shared = false;
  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
  ++cur_entry_idx_;
}

void IndexBlockIter::NextImpl() {
  ParseNextIndexKey();
  ++cur_entry_idx_;
}

void IndexBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
  --cur_entry_idx_;
}

void MetaBlockIter::PrevImpl() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  bool is_shared = false;
  // Loop until end of current entry hits the start of original entry
  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
         NextEntryOffset() < original) {
  }
  --cur_entry_idx_;
}

// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
void DataBlockIter::PrevImpl() {
  assert(Valid());
  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  --cur_entry_idx_;
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];
    const char* key_ptr = nullptr;
    bool raw_key_cached;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      raw_key_cached = false;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      raw_key_cached = true;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);
    current_ = current_prev_entry.offset;
    // TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
    // not necessity. It is convenient since this class treats keys as pinned
    // when `raw_key_` points to an outside buffer. So we cannot allow
    // `raw_key_` point into Prev cache as it is a transient outside buffer
    // (i.e., keys in it are not actually pinned).
    raw_key_.SetKey(current_key, raw_key_cached /* copy */);
    value_ = current_prev_entry.value;
    return;
  }
  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  do {
    bool is_shared = false;
    if (!ParseNextDataKey(&is_shared)) {
      break;
    }
    Slice current_key = raw_key_.GetKey();
    if (raw_key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());
      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

void DataBlockIter::SeekImpl(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

void MetaBlockIter::SeekImpl(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
// kTypeMerge, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
// kTypeMerge:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a user_key greater than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by SeekImpl(target).
//    In this case, it points to the first key with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
  if (entry == kCollision) {
    // HashSeek not effective, falling back
    SeekImpl(target);
    return true;
  }
  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:      [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumption about a boundary key)
    // Block N+1:    [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if none is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }
  uint32_t restart_index = entry;
  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);
  current_ = GetRestartPoint(restart_index);
  cur_entry_idx_ =
      static_cast<int32_t>(restart_index * block_restart_interval_) - 1;
  uint32_t limit = restarts_;
  if (restart_index + 1 < num_restarts_) {
    limit = GetRestartPoint(restart_index + 1);
  }
  while (current_ < limit) {
    ++cur_entry_idx_;
    bool shared;
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundary.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seek a target key that is out of range.
    if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
    // If the loop exits due to CompareCurrentKey(target) >= 0, then current key
    // exists, and its checksum verification will be done in UpdateKey() called
    // in SeekForGet().
    // TODO(cbi): If this loop exits with current_ == restart_, per key-value
    // checksum will not be verified in UpdateKey() since Valid()
    // will return false.
  }
  if (current_ == restarts_) {
    // Search reaches the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }
  if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
                                        target_user_key) != 0) {
    // the key is not in this block and cannot be at the next block either.
    return false;
  }
  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeMerge &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex &&
      value_type != ValueType::kTypeWideColumnEntity &&
      value_type != ValueType::kTypeValuePreferredSeqno) {
    SeekImpl(target);
  }
  // Result found, and the iter is correctly set.
  return true;
}

void IndexBlockIter::SeekImpl(const Slice& target) {
#ifndef NDEBUG
  if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) {
    return;
  }
#endif
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This is to let the caller distinguish between a non-existing prefix
      // and a key that is larger than the last key, both of which set
      // Valid() to false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
    // restart interval must be one when hash search is enabled so the binary
    // search simply lands at the right place.
    skip_linear_scan = true;
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  }
  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
}

void DataBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
  if (!Valid()) {
    if (status_.ok()) {
      SeekToLastImpl();
    }
  } else {
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}

void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool skip_linear_scan = false;
  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
  if (!ok) {
    return;
  }
  FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
  if (!Valid()) {
    if (status_.ok()) {
      SeekToLastImpl();
    }
  } else {
    while (Valid() && CompareCurrentKey(seek_key) > 0) {
      PrevImpl();
    }
  }
}

void DataBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  bool is_shared = false;
  ParseNextDataKey(&is_shared);
  cur_entry_idx_ = 0;
}

void MetaBlockIter::SeekToFirstImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  bool is_shared = false;
  ParseNextKey<CheckAndDecodeEntry>(&is_shared);
  cur_entry_idx_ = 0;
}

void IndexBlockIter::SeekToFirstImpl() {
#ifndef NDEBUG
  if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) {
    return;
  }
#endif
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
  cur_entry_idx_ = 0;
}

void DataBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  bool is_shared = false;
  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
  while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) {
    // Keep skipping
    ++cur_entry_idx_;
  }
}

void MetaBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  bool is_shared = false;
  assert(num_restarts_ >= 1);
  cur_entry_idx_ =
      static_cast<int32_t>((num_restarts_ - 1) * block_restart_interval_);
  while (ParseNextKey<CheckAndDecodeEntry>(&is_shared) &&
         NextEntryOffset() < restarts_) {
    // Will probably never reach here since restart_interval is always 1
    ++cur_entry_idx_;
  }
}

void IndexBlockIter::SeekToLastImpl() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_;
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    ++cur_entry_idx_;
  }
}
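
// Decodes the entry at NextEntryOffset() using DecodeEntryFunc, updating
// raw_key_ and value_ and advancing restart_index_ as needed. Returns false
// when there are no more entries (the iterator is invalidated) or when the
// entry cannot be decoded (a corruption status is set).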
template <class TValue>
template <typename DecodeEntryFunc>
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }
  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || raw_key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      *is_shared = false;
      // If this key doesn't share any bytes with prev key, and no min timestamp
      // needs to be padded to the key, then we don't need to decode it and
      // can use its address in the block directly (no copy).
      UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
    } else {
      // This key shares `shared` bytes with prev key, we need to decode it
      *is_shared = true;
      // If user-defined timestamp is stripped from user key before keys are
      // delta encoded, the decoded key consisting of the shared and non shared
      // bytes does not have a user-defined timestamp yet. We need to pad min
      // timestamp to it.
      if (pad_min_timestamp_) {
        raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
      } else {
        raw_key_.TrimAppend(shared, p, non_shared);
      }
    }
    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
  if (ParseNextKey<DecodeEntry>(is_shared)) {
#ifndef NDEBUG
    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion,
      // kTypeDeletionWithTimestamp, kTypeRangeDeletion, or
      // kTypeWideColumnEntity.
      uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
      SequenceNumber seqno;
      ValueType value_type;
      UnPackSequenceAndType(packed, &seqno, &value_type);
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeDeletionWithTimestamp ||
             value_type == ValueType::kTypeRangeDeletion ||
             value_type == ValueType::kTypeWideColumnEntity);
      assert(seqno == 0);
    }
#endif  // NDEBUG
    return true;
  } else {
    return false;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  bool is_shared = false;
  bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
                                   : ParseNextKey<DecodeEntry>(&is_shared);
  if (ok) {
    if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
        pad_min_timestamp_) {
      DecodeCurrentValue(is_shared);
    }
  }
  return ok;
}

// The format:
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where, k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e., block handle, is (offset, size) whenever
// is_shared is false, which includes the first entry in each restart point.
// Otherwise, the format is delta-size = the size of current block - the size
// of the last block.
void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());
  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.
    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);
    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion ||
           value_type == ValueType::kTypeWideColumnEntity);
    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
  if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) {
    first_internal_key_with_ts_.clear();
    PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_,
                                   decoded_value_.first_internal_key, ts_sz_);
    decoded_value_.first_internal_key = first_internal_key_with_ts_;
  }
}
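
// Positions the iterator at the first key >= `target` within the restart
// interval selected by BinarySeek(). When `skip_linear_scan` is true the
// restart key at `index` is already the answer, so no further comparisons
// are made.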
template <class TValue>
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
                                               uint32_t index,
                                               bool skip_linear_scan) {
  // SeekToRestartPoint() only does the lookup in the restart block. We need
  // to follow it up with NextImpl() to position the iterator at the restart
  // key.
  SeekToRestartPoint(index);
  cur_entry_idx_ = static_cast<int32_t>(index * block_restart_interval_) - 1;
  NextImpl();
  if (!skip_linear_scan) {
    // Linear search (within restart block) for first key >= target
    uint32_t max_offset;
    if (index + 1 < num_restarts_) {
      // We are in a non-last restart interval. Since `BinarySeek()` guarantees
      // the next restart key is strictly greater than `target`, we can
      // terminate upon reaching it without any additional key comparison.
      max_offset = GetRestartPoint(index + 1);
    } else {
      // We are in the last restart interval. The while-loop will terminate by
      // `Valid()` returning false upon advancing past the block's last key.
      max_offset = std::numeric_limits<uint32_t>::max();
    }
    while (true) {
      NextImpl();
      if (!Valid()) {
        // TODO(cbi): per key-value checksum will not be verified in UpdateKey()
        // since Valid() will return false.
        break;
      }
      if (current_ == max_offset) {
        assert(CompareCurrentKey(target) > 0);
        break;
      } else if (CompareCurrentKey(target) >= 0) {
        break;
      }
    }
  }
}

// Binary searches in restart array to find the starting restart point for the
// linear scan, and stores it in `*index`. Assumes restart array does not
// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
// is strictly greater than `target` or does not exist (this can be used to
// elide a comparison when linear scan reaches all the way to the next restart
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
// `*index`th restart key is the final result so that key does not need to be
// compared again later.
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
                                   bool* skip_linear_scan) {
  if (restarts_ == 0) {
    // SST files dedicated to range tombstones are written with index blocks
    // that have no keys while also having `num_restarts_ == 1`. This would
    // cause a problem for `BinarySeek()` as it'd try to access the first key
    // which does not exist. We identify such blocks by the offset at which
    // their restarts are stored, and return false to prevent any attempted
    // key accesses.
    return false;
  }
  *skip_linear_scan = false;
  // Loop invariants:
  // - Restart key at index `left` is less than or equal to the target key. The
  //   sentinel index `-1` is considered to have a key that is less than all
  //   keys.
  // - Any restart keys after index `right` are strictly greater than the
  //   target key.
  int64_t left = -1, right = num_restarts_ - 1;
  while (left != right) {
    // The `mid` is computed by rounding up so it lands in (`left`, `right`].
    int64_t mid = left + (right - left + 1) / 2;
    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
    int cmp = CompareCurrentKey(target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is >= "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      *skip_linear_scan = true;
      left = right = mid;
    }
  }
  if (left == -1) {
    // All keys in the block were strictly greater than `target`. So the very
    // first key in the block is the final seek result.
    *skip_linear_scan = true;
    *index = 0;
  } else {
    *index = static_cast<uint32_t>(left);
  }
  return true;
}

// Compare target key and the block key of the block of `block_index`.
// Return -1 if error.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  UpdateRawKeyAndMaybePadMinTimestamp(block_key);
  return CompareCurrentKey(target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;
  while (left <= right) {
    uint32_t mid = (right + left) / 2;
    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "target" is larger than "mid". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "target" is <= "mid". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) {
        break;
      }
      right = mid;
    }
  }
  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between block of `left` and `left-1`.
    // we can further distinguish the case of key in the block or key not
    // existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }
    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // If the next block key is larger than seek key, it is possible that
    // no key shares the prefix with `target`, or all keys with the same
    // prefix as `target` are smaller than prefix. In the latter case, we
    // are mandated to set the position the same as the total order.
    // In the latter case, either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator to the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalidated without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up with setting the total order position
    // while there is no key sharing the prefix as `target`, but it
    // still follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (raw_key_.IsUserKey()) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size() >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size() > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be interpreted directly as
    // num_restarts.
    //
    // This check is for backward compatibility: it ensures a legacy block
    // with a very large num_restarts, i.e. >= 0x80000000, is interpreted
    // correctly as having no HashIndex even though the MSB of num_restarts
    // is set.
    return num_restarts;
  }
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}

BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size() >= 2 * sizeof(uint32_t));
  if (size() > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
  delete[] kv_checksum_;
}
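
// Builds a Block over `contents`: the footer is decoded to locate the restart
// array and, for kDataBlockBinaryAndHash blocks, the data block hash index.
// Any inconsistency marks the block as corrupt by setting its size to 0.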
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
             Statistics* statistics)
    : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
  TEST_SYNC_POINT("Block::Block:0");
  auto& size = contents_.data.size_;
  if (size < sizeof(uint32_t)) {
    size = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size < sizeof(uint32_t) /* block footer */ +
                       sizeof(uint16_t) /* NUM_BUCK */) {
          size = 0;
          break;
        }
        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents_.data.data(),
            /* chop off NUM_RESTARTS */
            static_cast<uint16_t>(size - sizeof(uint32_t)), &map_offset);
        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size = 0;
          break;
        }
        break;
      default:
        size = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}
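
// Computes per key-value checksums for a data block: one checksum of
// `protection_bytes_per_key` bytes per entry, stored contiguously in
// kv_checksum_. Any iterator error sets the block size to 0 as an error
// marker.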
void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
                                              const Comparator* raw_ucmp) {
  protection_bytes_per_key_ = 0;
  if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
    // NewDataIterator() is called with protection_bytes_per_key_ = 0.
    // This is intended since checksum is not constructed yet.
    //
    // We do not know global_seqno yet, so checksum computation and
    // verification all assume global_seqno = 0.
    // TODO(yuzhangyu): handle the implication of padding timestamp for kv
    // protection.
    std::unique_ptr<DataBlockIter> iter{NewDataIterator(
        raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
        nullptr /* stats */, true /* block_contents_pinned */,
        true /* user_defined_timestamps_persisted */)};
    if (iter->status().ok()) {
      block_restart_interval_ = iter->GetRestartInterval();
    }
    uint32_t num_keys = 0;
    if (iter->status().ok()) {
      num_keys = iter->NumberOfKeys(block_restart_interval_);
    }
    if (iter->status().ok()) {
      checksum_size_ = num_keys * protection_bytes_per_key;
      kv_checksum_ = new char[(size_t)checksum_size_];
      size_t i = 0;
      iter->SeekToFirst();
      while (iter->Valid()) {
        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
                           iter->key(), iter->value());
        iter->Next();
        i += protection_bytes_per_key;
      }
      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
    }
    if (!iter->status().ok()) {
      contents_.data.size_ = 0;  // Error marker
      return;
    }
    protection_bytes_per_key_ = protection_bytes_per_key;
  }
}
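
// Index block counterpart of InitializeDataBlockProtectionInfo(): computes
// per key-value checksums over the index entries, using the raw (undecoded)
// value for each entry.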
void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
                                               const Comparator* raw_ucmp,
                                               bool value_is_full,
                                               bool index_has_first_key) {
  protection_bytes_per_key_ = 0;
  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
    // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They
    // do not impact how the index block is parsed. During checksum
    // construction/verification, we use the entire key buffer from
    // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value
    // checksum, and the content of this buffer does not change for different
    // values of `global_seqno` or `key_includes_seq`.
    // TODO(yuzhangyu): handle the implication of padding timestamp for kv
    // protection.
    std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
        raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
        nullptr /* Statistics */, true /* total_order_seek */,
        index_has_first_key /* have_first_key */, false /* key_includes_seq */,
        value_is_full, true /* block_contents_pinned */,
        true /* user_defined_timestamps_persisted*/,
        nullptr /* prefix_index */)};
    if (iter->status().ok()) {
      block_restart_interval_ = iter->GetRestartInterval();
    }
    uint32_t num_keys = 0;
    if (iter->status().ok()) {
      num_keys = iter->NumberOfKeys(block_restart_interval_);
    }
    if (iter->status().ok()) {
      checksum_size_ = num_keys * protection_bytes_per_key;
      kv_checksum_ = new char[(size_t)checksum_size_];
      iter->SeekToFirst();
      size_t i = 0;
      while (iter->Valid()) {
        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
                           iter->key(), iter->raw_value());
        iter->Next();
        i += protection_bytes_per_key;
      }
      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
    }
    if (!iter->status().ok()) {
      contents_.data.size_ = 0;  // Error marker
      return;
    }
    protection_bytes_per_key_ = protection_bytes_per_key;
  }
}

void Block::InitializeMetaIndexBlockProtectionInfo(
    uint8_t protection_bytes_per_key) {
  protection_bytes_per_key_ = 0;
  if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
    std::unique_ptr<MetaBlockIter> iter{
        NewMetaIterator(true /* block_contents_pinned */)};
    if (iter->status().ok()) {
      block_restart_interval_ = iter->GetRestartInterval();
    }
    uint32_t num_keys = 0;
    if (iter->status().ok()) {
      num_keys = iter->NumberOfKeys(block_restart_interval_);
    }
    if (iter->status().ok()) {
      checksum_size_ = num_keys * protection_bytes_per_key;
      kv_checksum_ = new char[(size_t)checksum_size_];
      iter->SeekToFirst();
      size_t i = 0;
      while (iter->Valid()) {
        GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
                           iter->key(), iter->value());
        iter->Next();
        i += protection_bytes_per_key;
      }
      assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
    }
    if (!iter->status().ok()) {
      contents_.data.size_ = 0;  // Error marker
      return;
    }
    protection_bytes_per_key_ = protection_bytes_per_key;
  }
}

MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
  MetaBlockIter* iter = new MetaBlockIter();
  if (size() < 2 * sizeof(uint32_t)) {
    iter->Invalidate(Status::Corruption("bad block contents"));
    return iter;
  } else if (num_restarts_ == 0) {
    // Empty block.
    iter->Invalidate(Status::OK());
  } else {
    iter->Initialize(data(), restart_offset_, num_restarts_,
                     block_contents_pinned, protection_bytes_per_key_,
                     kv_checksum_, block_restart_interval_);
  }
  return iter;
}
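
// Returns an iterator over this data block, reusing `iter` if it is non-null.
// Blocks that are too small to hold a footer yield an iterator invalidated
// with a Corruption status; empty blocks yield an iterator invalidated with
// an OK status.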
DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
                                      SequenceNumber global_seqno,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned,
                                      bool user_defined_timestamps_persisted) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size() < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
        read_amp_bitmap_.get(), block_contents_pinned,
        user_defined_timestamps_persisted,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }
  return ret_iter;
}

IndexBlockIter* Block::NewIndexIterator(
    const Comparator* raw_ucmp, SequenceNumber global_seqno,
    IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
    bool have_first_key, bool key_includes_seq, bool value_is_full,
    bool block_contents_pinned, bool user_defined_timestamps_persisted,
    BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size() < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(
        raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
        prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
        block_contents_pinned, user_defined_timestamps_persisted,
        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
  }
  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  usage += checksum_size_;
  return usage;
}

}  // namespace ROCKSDB_NAMESPACE