// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block_based/block.h"

#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_based/block_prefix_index.h"
#include "table/block_based/data_block_footer.h"
#include "table/format.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
struct DecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    assert(limit - p >= 3);
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }

    // Using an assert in place of "return null" since we should not pay the
    // cost of checking for corruption on every single key decoding
    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
    return p;
  }
};

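// Illustrative sketch of the one-byte fast path above, assuming `buf` is a
// hand-built 11-byte buffer holding
//   {0x02, 0x03, 0x05, 'k', 'e', 'y', 'v', 'a', 'l', 'u', 'e'}
// This decodes to shared = 2, non_shared = 3, value_length = 5, and the
// returned pointer addresses the 3 key-delta bytes ("key"), with the 5 value
// bytes ("value") immediately after them:
//
//   uint32_t shared, non_shared, value_length;
//   const char* delta = DecodeEntry()(buf, buf + sizeof(buf), &shared,
//                                     &non_shared, &value_length);
//   Slice key_delta(delta, non_shared);             // "key"
//   Slice value(delta + non_shared, value_length);  // "value"
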
// Helper routine: similar to DecodeEntry but does not have assertions.
// Instead, returns nullptr so that caller can detect and report failure.
struct CheckAndDecodeEntry {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared,
                                uint32_t* value_length) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) {
      return nullptr;
    }
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
    if ((*shared | *non_shared | *value_length) < 128) {
      // Fast path: all three values are encoded in one byte each
      p += 3;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
        return nullptr;
      }
    }
    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
      return nullptr;
    }
    return p;
  }
};

struct DecodeKey {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    uint32_t value_length;
    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
  }
};

// In format_version 4, which is used by index blocks, the value size is not
// encoded before the entry, as the value is known to be the handle with the
// known size.
struct DecodeKeyV4 {
  inline const char* operator()(const char* p, const char* limit,
                                uint32_t* shared, uint32_t* non_shared) {
    // We need 2 bytes for shared and non_shared size. We also need one more
    // byte either for value size or the actual value in case of value delta
    // encoding.
    if (limit - p < 3) return nullptr;
    *shared = reinterpret_cast<const unsigned char*>(p)[0];
    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
    if ((*shared | *non_shared) < 128) {
      // Fast path: both values are encoded in one byte each
      p += 2;
    } else {
      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    }
    return p;
  }
};

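// Illustrative example of the format_version 4 layout described above (the
// bytes are made up): an index entry whose first bytes are
//   {0x00, 0x04, 'k', 'e', 'y', '1', ...}
// decodes to shared = 0 and non_shared = 4; the block handle bytes follow the
// key with no explicit length prefix, since the handle size is known to the
// reader.
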
void DataBlockIter::Next() {
  assert(Valid());
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::NextOrReport() {
  assert(Valid());
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::Next() {
  assert(Valid());
  ParseNextIndexKey();
}

void IndexBlockIter::Prev() {
  assert(Valid());
  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }
  SeekToRestartPoint(restart_index_);
  // Loop until end of current entry hits the start of original entry
  while (ParseNextIndexKey() && NextEntryOffset() < original) {
  }
}

// Similar to IndexBlockIter::Prev but also caches the prev entries
void DataBlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextDataKey<DecodeEntry>()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);

  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

void DataBlockIter::Seek(const Slice& target) {
  Slice seek_key = target;
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);

  // Linear search (within restart block) for first key >= target
  while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
  }
}

// Optimized Seek for point lookup for an internal key `target`
// target = "seek_user_key @ type | seqno".
//
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex, this function behaves identically to Seek().
//
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
// or kTypeBlobIndex:
//
// If the return value is FALSE, iter location is undefined, and it means:
// 1) there is no key in this block falling into the range:
//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
//    inclusive; AND
// 2) the last key of this block has a greater user_key than seek_user_key
//
// If the return value is TRUE, iter location has two possibilities:
// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
//    this case, it points to the first key_ with a larger user_key or a
//    matching user_key with a seqno no greater than the seeking seqno.
// 2) If the iter is invalid, it means that either all the user_keys are less
//    than the seek_user_key, or the block ends with a matching user_key but
//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
//    but larger type).
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
  Slice target_user_key = ExtractUserKey(target);
  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
  uint8_t entry =
      data_block_hash_index_->Lookup(data_, map_offset, target_user_key);

  if (entry == kCollision) {
    // HashSeek not effective, falling back
    Seek(target);
    return true;
  }

  if (entry == kNoEntry) {
    // Even if we cannot find the user_key in this block, the result may
    // exist in the next block. Consider this example:
    //
    // Block N:      [aab@100, ... , app@120]
    // boundary key: axy@50 (we make minimal assumptions about a boundary key)
    // Block N+1:    [axy@10, ... ]
    //
    // If seek_key = axy@60, the search will start from Block N.
    // Even if the user_key is not found in the hash map, the caller still
    // has to continue searching the next block.
    //
    // In this case, we pretend the key is in the last restart interval.
    // The while-loop below will search the last restart interval for the
    // key. It will stop at the first key that is larger than the seek_key,
    // or at the end of the block if no key is larger.
    entry = static_cast<uint8_t>(num_restarts_ - 1);
  }

  uint32_t restart_index = entry;

  // check if the key is in the restart_interval
  assert(restart_index < num_restarts_);
  SeekToRestartPoint(restart_index);

  const char* limit = nullptr;
  if (restart_index_ + 1 < num_restarts_) {
    limit = data_ + GetRestartPoint(restart_index_ + 1);
  } else {
    limit = data_ + restarts_;
  }

  while (true) {
    // Here we only linear seek the target key inside the restart interval.
    // If a key does not exist inside a restart interval, we avoid
    // further searching the block content across restart interval boundaries.
    //
    // TODO(fwu): check the left and right boundary of the restart interval
    // to avoid linear seeking a target key that is out of range.
    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
      // we stop at the first potential matching user key.
      break;
    }
  }

  if (current_ == restarts_) {
    // Search reaches the end of the block. There are three possibilities:
    // 1) there is only one user_key match in the block (otherwise collision).
    //    the matching user_key resides in the last restart interval, and it
    //    is the last key of the restart interval and of the block as well.
    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
    //
    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
    //    AND all existing user_keys in the restart interval are smaller than
    //    seek_user_key.
    //
    // 3) The seek_key is a false positive and happens to be hashed to the
    //    last restart interval, AND all existing user_keys in the restart
    //    interval are smaller than seek_user_key.
    //
    // The result may exist in the next block in each case, so we return true.
    return true;
  }

  if (user_comparator_->Compare(key_.GetUserKey(), target_user_key) != 0) {
    // the key is not in this block and cannot be in the next block either.
    return false;
  }

  // Here we are conservative and only support a limited set of cases
  ValueType value_type = ExtractValueType(key_.GetKey());
  if (value_type != ValueType::kTypeValue &&
      value_type != ValueType::kTypeDeletion &&
      value_type != ValueType::kTypeSingleDeletion &&
      value_type != ValueType::kTypeBlobIndex) {
    Seek(target);
    return true;
  }

  // Result found, and the iter is correctly set.
  return true;
}

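// A hedged sketch of how a point-lookup caller might consume the result of
// SeekForGetImpl() (illustrative only; `iter`, `ucmp`, and `target` are
// assumed names, and the real Get() path lives in the table reader):
//
//   if (!iter->SeekForGetImpl(target)) {
//     // Not in this block, and cannot be in a later block: stop searching.
//   } else if (!iter->Valid() ||
//              ucmp->Compare(ExtractUserKey(iter->key()),
//                            ExtractUserKey(target)) != 0) {
//     // Not found here; the result may still exist in the next block.
//   } else {
//     // iter is positioned on a candidate entry; inspect key()/value().
//   }
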
void IndexBlockIter::Seek(const Slice& target) {
  TEST_SYNC_POINT("IndexBlockIter::Seek:0");
  Slice seek_key = target;
  if (!key_includes_seq_) {
    seek_key = ExtractUserKey(target);
  }
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    bool prefix_may_exist = true;
    ok = PrefixSeek(target, &index, &prefix_may_exist);
    if (!prefix_may_exist) {
      // This lets the caller distinguish between a non-existing prefix and
      // a key that is larger than the last key, both of which set Valid()
      // to false.
      current_ = restarts_;
      status_ = Status::NotFound();
    }
  } else if (value_delta_encoded_) {
    ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
                                 comparator_);
  } else {
    ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                               comparator_);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);

  // Linear search (within restart block) for first key >= target
  while (ParseNextIndexKey() && Compare(key_, seek_key) < 0) {
  }
}

void DataBlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  Slice seek_key = target;
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
                                  comparator_);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);

  // Linear search (within restart block) for first key >= seek_key
  while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_, seek_key) > 0) {
      Prev();
    }
  }
}

void DataBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<DecodeEntry>();
}

void DataBlockIter::SeekToFirstOrReport() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextDataKey<CheckAndDecodeEntry>();
}

void IndexBlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(0);
  ParseNextIndexKey();
}

void DataBlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void IndexBlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  status_ = Status::OK();
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

template <class TValue>
void BlockIter<TValue>::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

template <typename DecodeEntryFunc>
bool DataBlockIter::ParseNextDataKey(const char* limit) {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  if (!limit) {
    limit = data_ + restarts_;  // Restarts come right after data
  }

  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the prev key then we don't
      // need to decode it and can use its address in the block directly.
      key_.SetKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the prev key; we need to decode it
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion ||
             value_type == ValueType::kTypeRangeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    if (shared == 0) {
      while (restart_index_ + 1 < num_restarts_ &&
             GetRestartPoint(restart_index_ + 1) < current_) {
        ++restart_index_;
      }
    }
    // else we are in the middle of a restart interval and the restart_index_
    // thus has not changed
    return true;
  }
}

bool IndexBlockIter::ParseNextIndexKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  if (value_delta_encoded_) {
    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
    value_length = 0;
  } else {
    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
  }
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  }
  if (shared == 0) {
    // If this key doesn't share any bytes with the prev key then we don't
    // need to decode it and can use its address in the block directly.
    key_.SetKey(Slice(p, non_shared), false /* copy */);
    key_pinned_ = true;
  } else {
    // This key shares `shared` bytes with the prev key; we need to decode it
    key_.TrimAppend(shared, p, non_shared);
    key_pinned_ = false;
  }
  value_ = Slice(p + non_shared, value_length);
  if (shared == 0) {
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
  }
  // else we are in the middle of a restart interval and the restart_index_
  // thus has not changed
  if (value_delta_encoded_ || global_seqno_state_ != nullptr) {
    DecodeCurrentValue(shared);
  }
  return true;
}

// The format:
// restart_point   0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// restart_point   1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// ...
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
// where k is key, v is value, and its encoding is in parentheses.
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
// The format of each value, i.e. block handle, is (offset, size) whenever the
// shared_size is 0, which includes the first entry in each restart point.
// Otherwise the format is delta-size = block handle size - size of last block
// handle.
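//
// For illustration (the keys and numbers here are made up, and the values are
// shown decoded rather than byte-exact), one restart interval holding three
// index entries could look like:
//   "key1" -> (offset = 0, size = 4096)  // shared_size == 0: full handle
//   "key2" -> (delta-size = 16)          // size = 4096 + 16 = 4112; offset
//                                        // is implied by the prior handle
//   "key3" -> (delta-size = -8)          // size = 4112 - 8 = 4104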
void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
  Slice v(value_.data(), data_ + restarts_ - value_.data());
  // Delta encoding is used if `shared` != 0.
  Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
      &v, have_first_key_,
      (value_delta_encoded_ && shared) ? &decoded_value_.handle : nullptr);
  assert(decode_s.ok());
  value_ = Slice(value_.data(), v.data() - value_.data());

  if (global_seqno_state_ != nullptr) {
    // Overwrite sequence number the same way as in DataBlockIter.

    IterKey& first_internal_key = global_seqno_state_->first_internal_key;
    first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
                                      /* copy */ true);

    assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);

    ValueType value_type = ExtractValueType(first_internal_key.GetKey());
    assert(value_type == ValueType::kTypeValue ||
           value_type == ValueType::kTypeMerge ||
           value_type == ValueType::kTypeDeletion ||
           value_type == ValueType::kTypeRangeDeletion);

    first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
                                         value_type);
    decoded_value_.first_internal_key = first_internal_key.GetKey();
  }
}

// Binary search in restart array to find the first restart point that
// is either the last restart point with a key less than target,
// which means the key of the next restart point is larger than target, or
// the first restart point with a key = target
template <class TValue>
template <typename DecodeKeyFunc>
bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
                                   uint32_t right, uint32_t* index,
                                   const Comparator* comp) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared;
    const char* key_ptr = DecodeKeyFunc()(
        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = comp->Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }
  *index = left;
  return true;
}

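// Illustrative example (keys made up): with restart-point keys
// {"b", "d", "f", "h"} and target "e", the search above returns the restart
// point of "d" (the last restart key strictly less than the target); the
// caller's subsequent linear scan then starts there and stops at "f", the
// first key >= "e".
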
// Compare target key and the block key of the block of `block_index`.
// Returns 1 on error, in which case the target is treated as smaller.
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared;
  const char* key_ptr =
      value_delta_encoded_
          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
                          &non_shared)
          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
                        &non_shared);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
                                          uint32_t* block_ids, uint32_t left,
                                          uint32_t right, uint32_t* index,
                                          bool* prefix_may_exist) {
  assert(left <= right);
  assert(index);
  assert(prefix_may_exist);
  *prefix_may_exist = true;
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // The block key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // The block key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // In one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between block of `left` and `left-1`.
    // we can further distinguish the case of key in the block or key not
    // existing, by comparing the target key and the key of the previous
    // block to the left of the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      *prefix_may_exist = false;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);

    // If the next block key is larger than the seek key, it is possible that
    // no key shares the prefix with `target`, or all keys with the same
    // prefix as `target` are smaller than `target`. In the latter case,
    // we are mandated to set the position the same as the total order.
    // Then either:
    // (1) `target` falls into the range of the next block. In this case,
    //     we can place the iterator to the next block, or
    // (2) `target` is larger than all block keys. In this case we can
    //     keep the iterator invalid without setting `prefix_may_exist`
    //     to false.
    // We might sometimes end up setting the total order position while
    // there is no key sharing the prefix with `target`, but it still
    // follows the contract.
    uint32_t right_index = block_ids[right];
    assert(right_index + 1 <= num_restarts_);
    if (right_index + 1 < num_restarts_) {
      if (CompareBlockKey(right_index + 1, target) >= 0) {
        *index = right_index + 1;
        return true;
      } else {
        // We have to set the flag here because we are not positioning
        // the iterator to the total order position.
        *prefix_may_exist = false;
      }
    }

    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
                                bool* prefix_may_exist) {
  assert(index);
  assert(prefix_may_exist);
  assert(prefix_index_);
  *prefix_may_exist = true;
  Slice seek_key = target;
  if (!key_includes_seq_) {
    seek_key = ExtractUserKey(target);
  }
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    *prefix_may_exist = false;
    return false;
  } else {
    assert(block_ids);
    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
                                prefix_may_exist);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // In BlockBuilder, we have ensured a block with HashIndex is less than
    // kMaxBlockSizeSupportedByHashIndex (64KiB).
    //
    // Therefore, if we encounter a block with a size > 64KiB, the block
    // cannot have HashIndex. So the footer will be directly interpreted as
    // num_restarts.
    //
    // This check is for backward compatibility. It ensures that a legacy
    // block with a very large num_restarts, i.e. >= 0x80000000, can be
    // interpreted correctly as having no HashIndex even if the MSB of
    // num_restarts is set.
    return num_restarts;
  }
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return num_restarts;
}

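// Illustrative footer decoding for a block no larger than 64 KiB (the exact
// bit layout is an assumption here; see UnPackIndexTypeAndNumRestarts in
// data_block_footer.cc for the authoritative packing):
//   footer 0x00000005 -> kDataBlockBinarySearch,  num_restarts = 5
//   footer 0x80000005 -> kDataBlockBinaryAndHash, num_restarts = 5
// i.e. the MSB carries the data-block index type and the remaining bits the
// restart count.
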
BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
    // The check is for the same reason as that in NumRestarts()
    return BlockBasedTableOptions::kDataBlockBinarySearch;
  }
  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
  uint32_t num_restarts = block_footer;
  BlockBasedTableOptions::DataBlockIndexType index_type;
  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
  return index_type;
}

Block::~Block() {
  // This sync point can be re-enabled if RocksDB can control the
  // initialization order of any/all static options created by the user.
  // TEST_SYNC_POINT("Block::~Block");
}

Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      num_restarts_(0),
      global_seqno_(_global_seqno) {
  TEST_SYNC_POINT("Block::Block:0");
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    // Should only decode restart points for uncompressed blocks
    num_restarts_ = NumRestarts();
    switch (IndexType()) {
      case BlockBasedTableOptions::kDataBlockBinarySearch:
        restart_offset_ = static_cast<uint32_t>(size_) -
                          (1 + num_restarts_) * sizeof(uint32_t);
        if (restart_offset_ > size_ - sizeof(uint32_t)) {
          // The size is too small for NumRestarts() and therefore
          // restart_offset_ wrapped around.
          size_ = 0;
        }
        break;
      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
        if (size_ < sizeof(uint32_t) /* block footer */ +
                        sizeof(uint16_t) /* NUM_BUCK */) {
          size_ = 0;
          break;
        }

        uint16_t map_offset;
        data_block_hash_index_.Initialize(
            contents.data.data(),
            static_cast<uint16_t>(contents.data.size() -
                                  sizeof(uint32_t)), /* chop off NUM_RESTARTS */
            &map_offset);

        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

        if (restart_offset_ > map_offset) {
          // map_offset is too small for NumRestarts() and
          // therefore restart_offset_ wrapped around.
          size_ = 0;
          break;
        }
        break;
      default:
        size_ = 0;  // Error marker
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

DataBlockIter* Block::NewDataIterator(const Comparator* cmp,
                                      const Comparator* ucmp,
                                      DataBlockIter* iter, Statistics* stats,
                                      bool block_contents_pinned) {
  DataBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new DataBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    ret_iter->Initialize(
        cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
        read_amp_bitmap_.get(), block_contents_pinned,
        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return ret_iter;
}

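// A hedged usage sketch (illustrative only; `contents` is assumed to be a
// previously read BlockContents, and `icmp`/`ucmp` are assumed internal-key
// and user-key comparators):
//
//   Block block(std::move(contents), kDisableGlobalSequenceNumber,
//               /*read_amp_bytes_per_bit=*/0, /*statistics=*/nullptr);
//   std::unique_ptr<DataBlockIter> it(block.NewDataIterator(
//       icmp, ucmp, /*iter=*/nullptr, /*stats=*/nullptr,
//       /*block_contents_pinned=*/false));
//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
//     // it->key() is an internal key; it->value() is the stored value.
//   }
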
IndexBlockIter* Block::NewIndexIterator(
    const Comparator* cmp, const Comparator* ucmp, IndexBlockIter* iter,
    Statistics* /*stats*/, bool total_order_seek, bool have_first_key,
    bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
    BlockPrefixIndex* prefix_index) {
  IndexBlockIter* ret_iter;
  if (iter != nullptr) {
    ret_iter = iter;
  } else {
    ret_iter = new IndexBlockIter;
  }
  if (size_ < 2 * sizeof(uint32_t)) {
    ret_iter->Invalidate(Status::Corruption("bad block contents"));
    return ret_iter;
  }
  if (num_restarts_ == 0) {
    // Empty block.
    ret_iter->Invalidate(Status::OK());
    return ret_iter;
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index;
    ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
                         global_seqno_, prefix_index_ptr, have_first_key,
                         key_includes_seq, value_is_full,
                         block_contents_pinned);
  }

  return ret_iter;
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
  usage += malloc_usable_size((void*)this);
#else
  usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
  if (read_amp_bitmap_) {
    usage += read_amp_bitmap_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace ROCKSDB_NAMESPACE