plain_table_reader.cc 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  3. // Use of this source code is governed by a BSD-style license that can be
  4. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  5. #ifndef ROCKSDB_LITE
  6. #include "table/plain/plain_table_reader.h"
  7. #include <string>
  8. #include <vector>
  9. #include "db/dbformat.h"
  10. #include "rocksdb/cache.h"
  11. #include "rocksdb/comparator.h"
  12. #include "rocksdb/env.h"
  13. #include "rocksdb/filter_policy.h"
  14. #include "rocksdb/options.h"
  15. #include "rocksdb/statistics.h"
  16. #include "table/block_based/block.h"
  17. #include "table/block_based/filter_block.h"
  18. #include "table/format.h"
  19. #include "table/get_context.h"
  20. #include "table/internal_iterator.h"
  21. #include "table/meta_blocks.h"
  22. #include "table/plain/plain_table_bloom.h"
  23. #include "table/plain/plain_table_factory.h"
  24. #include "table/plain/plain_table_key_coding.h"
  25. #include "table/two_level_iterator.h"
  26. #include "memory/arena.h"
  27. #include "monitoring/histogram.h"
  28. #include "monitoring/perf_context_imp.h"
  29. #include "util/coding.h"
  30. #include "util/dynamic_bloom.h"
  31. #include "util/hash.h"
  32. #include "util/stop_watch.h"
  33. #include "util/string_util.h"
  34. namespace ROCKSDB_NAMESPACE {
  35. namespace {
  36. // Safely getting a uint32_t element from a char array, where, starting from
  37. // `base`, every 4 bytes are considered as an fixed 32 bit integer.
  38. inline uint32_t GetFixed32Element(const char* base, size_t offset) {
  39. return DecodeFixed32(base + offset * sizeof(uint32_t));
  40. }
  41. } // namespace
  42. // Iterator to iterate IndexedTable
  43. class PlainTableIterator : public InternalIterator {
  44. public:
  45. explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek);
  46. // No copying allowed
  47. PlainTableIterator(const PlainTableIterator&) = delete;
  48. void operator=(const Iterator&) = delete;
  49. ~PlainTableIterator() override;
  50. bool Valid() const override;
  51. void SeekToFirst() override;
  52. void SeekToLast() override;
  53. void Seek(const Slice& target) override;
  54. void SeekForPrev(const Slice& target) override;
  55. void Next() override;
  56. void Prev() override;
  57. Slice key() const override;
  58. Slice value() const override;
  59. Status status() const override;
  60. private:
  61. PlainTableReader* table_;
  62. PlainTableKeyDecoder decoder_;
  63. bool use_prefix_seek_;
  64. uint32_t offset_;
  65. uint32_t next_offset_;
  66. Slice key_;
  67. Slice value_;
  68. Status status_;
  69. };
  70. extern const uint64_t kPlainTableMagicNumber;
  71. PlainTableReader::PlainTableReader(
  72. const ImmutableCFOptions& ioptions,
  73. std::unique_ptr<RandomAccessFileReader>&& file,
  74. const EnvOptions& storage_options, const InternalKeyComparator& icomparator,
  75. EncodingType encoding_type, uint64_t file_size,
  76. const TableProperties* table_properties,
  77. const SliceTransform* prefix_extractor)
  78. : internal_comparator_(icomparator),
  79. encoding_type_(encoding_type),
  80. full_scan_mode_(false),
  81. user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
  82. prefix_extractor_(prefix_extractor),
  83. enable_bloom_(false),
  84. bloom_(6),
  85. file_info_(std::move(file), storage_options,
  86. static_cast<uint32_t>(table_properties->data_size)),
  87. ioptions_(ioptions),
  88. file_size_(file_size),
  89. table_properties_(nullptr) {}
  90. PlainTableReader::~PlainTableReader() {
  91. }
  92. Status PlainTableReader::Open(
  93. const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
  94. const InternalKeyComparator& internal_comparator,
  95. std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
  96. std::unique_ptr<TableReader>* table_reader, const int bloom_bits_per_key,
  97. double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
  98. bool full_scan_mode, const bool immortal_table,
  99. const SliceTransform* prefix_extractor) {
  100. if (file_size > PlainTableIndex::kMaxFileSize) {
  101. return Status::NotSupported("File is too large for PlainTableReader!");
  102. }
  103. TableProperties* props_ptr = nullptr;
  104. auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
  105. ioptions, &props_ptr,
  106. true /* compression_type_missing */);
  107. std::shared_ptr<TableProperties> props(props_ptr);
  108. if (!s.ok()) {
  109. return s;
  110. }
  111. assert(hash_table_ratio >= 0.0);
  112. auto& user_props = props->user_collected_properties;
  113. auto prefix_extractor_in_file = props->prefix_extractor_name;
  114. if (!full_scan_mode &&
  115. !prefix_extractor_in_file.empty() /* old version sst file*/
  116. && prefix_extractor_in_file != "nullptr") {
  117. if (!prefix_extractor) {
  118. return Status::InvalidArgument(
  119. "Prefix extractor is missing when opening a PlainTable built "
  120. "using a prefix extractor");
  121. } else if (prefix_extractor_in_file.compare(prefix_extractor->Name()) !=
  122. 0) {
  123. return Status::InvalidArgument(
  124. "Prefix extractor given doesn't match the one used to build "
  125. "PlainTable");
  126. }
  127. }
  128. EncodingType encoding_type = kPlain;
  129. auto encoding_type_prop =
  130. user_props.find(PlainTablePropertyNames::kEncodingType);
  131. if (encoding_type_prop != user_props.end()) {
  132. encoding_type = static_cast<EncodingType>(
  133. DecodeFixed32(encoding_type_prop->second.c_str()));
  134. }
  135. std::unique_ptr<PlainTableReader> new_reader(new PlainTableReader(
  136. ioptions, std::move(file), env_options, internal_comparator,
  137. encoding_type, file_size, props.get(), prefix_extractor));
  138. s = new_reader->MmapDataIfNeeded();
  139. if (!s.ok()) {
  140. return s;
  141. }
  142. if (!full_scan_mode) {
  143. s = new_reader->PopulateIndex(props.get(), bloom_bits_per_key,
  144. hash_table_ratio, index_sparseness,
  145. huge_page_tlb_size);
  146. if (!s.ok()) {
  147. return s;
  148. }
  149. } else {
  150. // Flag to indicate it is a full scan mode so that none of the indexes
  151. // can be used.
  152. new_reader->full_scan_mode_ = true;
  153. }
  154. // PopulateIndex can add to the props, so don't store them until now
  155. new_reader->table_properties_ = props;
  156. if (immortal_table && new_reader->file_info_.is_mmap_mode) {
  157. new_reader->dummy_cleanable_.reset(new Cleanable());
  158. }
  159. *table_reader = std::move(new_reader);
  160. return s;
  161. }
  162. void PlainTableReader::SetupForCompaction() {
  163. }
  164. InternalIterator* PlainTableReader::NewIterator(
  165. const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
  166. Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/,
  167. size_t /*compaction_readahead_size*/) {
  168. // Not necessarily used here, but make sure this has been initialized
  169. assert(table_properties_);
  170. // Auto prefix mode is not implemented in PlainTable.
  171. bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
  172. !options.auto_prefix_mode;
  173. if (arena == nullptr) {
  174. return new PlainTableIterator(this, use_prefix_seek);
  175. } else {
  176. auto mem = arena->AllocateAligned(sizeof(PlainTableIterator));
  177. return new (mem) PlainTableIterator(this, use_prefix_seek);
  178. }
  179. }
  180. Status PlainTableReader::PopulateIndexRecordList(
  181. PlainTableIndexBuilder* index_builder,
  182. std::vector<uint32_t>* prefix_hashes) {
  183. Slice prev_key_prefix_slice;
  184. std::string prev_key_prefix_buf;
  185. uint32_t pos = data_start_offset_;
  186. bool is_first_record = true;
  187. Slice key_prefix_slice;
  188. PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
  189. prefix_extractor_);
  190. while (pos < file_info_.data_end_offset) {
  191. uint32_t key_offset = pos;
  192. ParsedInternalKey key;
  193. Slice value_slice;
  194. bool seekable = false;
  195. Status s = Next(&decoder, &pos, &key, nullptr, &value_slice, &seekable);
  196. if (!s.ok()) {
  197. return s;
  198. }
  199. key_prefix_slice = GetPrefix(key);
  200. if (enable_bloom_) {
  201. bloom_.AddHash(GetSliceHash(key.user_key));
  202. } else {
  203. if (is_first_record || prev_key_prefix_slice != key_prefix_slice) {
  204. if (!is_first_record) {
  205. prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
  206. }
  207. if (file_info_.is_mmap_mode) {
  208. prev_key_prefix_slice = key_prefix_slice;
  209. } else {
  210. prev_key_prefix_buf = key_prefix_slice.ToString();
  211. prev_key_prefix_slice = prev_key_prefix_buf;
  212. }
  213. }
  214. }
  215. index_builder->AddKeyPrefix(GetPrefix(key), key_offset);
  216. if (!seekable && is_first_record) {
  217. return Status::Corruption("Key for a prefix is not seekable");
  218. }
  219. is_first_record = false;
  220. }
  221. prefix_hashes->push_back(GetSliceHash(key_prefix_slice));
  222. auto s = index_.InitFromRawData(index_builder->Finish());
  223. return s;
  224. }
  225. void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys,
  226. size_t huge_page_tlb_size) {
  227. uint32_t bloom_total_bits = num_keys * bloom_bits_per_key;
  228. if (bloom_total_bits > 0) {
  229. enable_bloom_ = true;
  230. bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality,
  231. huge_page_tlb_size, ioptions_.info_log);
  232. }
  233. }
  234. void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
  235. assert(bloom_.IsInitialized());
  236. for (const auto prefix_hash : prefix_hashes) {
  237. bloom_.AddHash(prefix_hash);
  238. }
  239. }
  240. Status PlainTableReader::MmapDataIfNeeded() {
  241. if (file_info_.is_mmap_mode) {
  242. // Get mmapped memory.
  243. return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr);
  244. }
  245. return Status::OK();
  246. }
  247. Status PlainTableReader::PopulateIndex(TableProperties* props,
  248. int bloom_bits_per_key,
  249. double hash_table_ratio,
  250. size_t index_sparseness,
  251. size_t huge_page_tlb_size) {
  252. assert(props != nullptr);
  253. BlockContents index_block_contents;
  254. Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
  255. file_size_, kPlainTableMagicNumber, ioptions_,
  256. PlainTableIndexBuilder::kPlainTableIndexBlock,
  257. BlockType::kIndex, &index_block_contents,
  258. true /* compression_type_missing */);
  259. bool index_in_file = s.ok();
  260. BlockContents bloom_block_contents;
  261. bool bloom_in_file = false;
  262. // We only need to read the bloom block if index block is in file.
  263. if (index_in_file) {
  264. s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */,
  265. file_size_, kPlainTableMagicNumber, ioptions_,
  266. BloomBlockBuilder::kBloomBlock, BlockType::kFilter,
  267. &bloom_block_contents,
  268. true /* compression_type_missing */);
  269. bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0;
  270. }
  271. Slice* bloom_block;
  272. if (bloom_in_file) {
  273. // If bloom_block_contents.allocation is not empty (which will be the case
  274. // for non-mmap mode), it holds the alloated memory for the bloom block.
  275. // It needs to be kept alive to keep `bloom_block` valid.
  276. bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
  277. bloom_block = &bloom_block_contents.data;
  278. } else {
  279. bloom_block = nullptr;
  280. }
  281. Slice* index_block;
  282. if (index_in_file) {
  283. // If index_block_contents.allocation is not empty (which will be the case
  284. // for non-mmap mode), it holds the alloated memory for the index block.
  285. // It needs to be kept alive to keep `index_block` valid.
  286. index_block_alloc_ = std::move(index_block_contents.allocation);
  287. index_block = &index_block_contents.data;
  288. } else {
  289. index_block = nullptr;
  290. }
  291. if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) {
  292. // moptions.prefix_extractor is requried for a hash-based look-up.
  293. return Status::NotSupported(
  294. "PlainTable requires a prefix extractor enable prefix hash mode.");
  295. }
  296. // First, read the whole file, for every kIndexIntervalForSamePrefixKeys rows
  297. // for a prefix (starting from the first one), generate a record of (hash,
  298. // offset) and append it to IndexRecordList, which is a data structure created
  299. // to store them.
  300. if (!index_in_file) {
  301. // Allocate bloom filter here for total order mode.
  302. if (IsTotalOrderMode()) {
  303. AllocateBloom(bloom_bits_per_key,
  304. static_cast<uint32_t>(props->num_entries),
  305. huge_page_tlb_size);
  306. }
  307. } else if (bloom_in_file) {
  308. enable_bloom_ = true;
  309. auto num_blocks_property = props->user_collected_properties.find(
  310. PlainTablePropertyNames::kNumBloomBlocks);
  311. uint32_t num_blocks = 0;
  312. if (num_blocks_property != props->user_collected_properties.end()) {
  313. Slice temp_slice(num_blocks_property->second);
  314. if (!GetVarint32(&temp_slice, &num_blocks)) {
  315. num_blocks = 0;
  316. }
  317. }
  318. // cast away const qualifier, because bloom_ won't be changed
  319. bloom_.SetRawData(const_cast<char*>(bloom_block->data()),
  320. static_cast<uint32_t>(bloom_block->size()) * 8,
  321. num_blocks);
  322. } else {
  323. // Index in file but no bloom in file. Disable bloom filter in this case.
  324. enable_bloom_ = false;
  325. bloom_bits_per_key = 0;
  326. }
  327. PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_,
  328. index_sparseness, hash_table_ratio,
  329. huge_page_tlb_size);
  330. std::vector<uint32_t> prefix_hashes;
  331. if (!index_in_file) {
  332. // Populates _bloom if enabled (total order mode)
  333. s = PopulateIndexRecordList(&index_builder, &prefix_hashes);
  334. if (!s.ok()) {
  335. return s;
  336. }
  337. } else {
  338. s = index_.InitFromRawData(*index_block);
  339. if (!s.ok()) {
  340. return s;
  341. }
  342. }
  343. if (!index_in_file) {
  344. if (!IsTotalOrderMode()) {
  345. // Calculated bloom filter size and allocate memory for
  346. // bloom filter based on the number of prefixes, then fill it.
  347. AllocateBloom(bloom_bits_per_key, index_.GetNumPrefixes(),
  348. huge_page_tlb_size);
  349. if (enable_bloom_) {
  350. FillBloom(prefix_hashes);
  351. }
  352. }
  353. }
  354. // Fill two table properties.
  355. if (!index_in_file) {
  356. props->user_collected_properties["plain_table_hash_table_size"] =
  357. ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen);
  358. props->user_collected_properties["plain_table_sub_index_size"] =
  359. ToString(index_.GetSubIndexSize());
  360. } else {
  361. props->user_collected_properties["plain_table_hash_table_size"] =
  362. ToString(0);
  363. props->user_collected_properties["plain_table_sub_index_size"] =
  364. ToString(0);
  365. }
  366. return Status::OK();
  367. }
  368. Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder,
  369. const Slice& target, const Slice& prefix,
  370. uint32_t prefix_hash, bool& prefix_matched,
  371. uint32_t* offset) const {
  372. prefix_matched = false;
  373. uint32_t prefix_index_offset;
  374. auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
  375. if (res == PlainTableIndex::kNoPrefixForBucket) {
  376. *offset = file_info_.data_end_offset;
  377. return Status::OK();
  378. } else if (res == PlainTableIndex::kDirectToFile) {
  379. *offset = prefix_index_offset;
  380. return Status::OK();
  381. }
  382. // point to sub-index, need to do a binary search
  383. uint32_t upper_bound;
  384. const char* base_ptr =
  385. index_.GetSubIndexBasePtrAndUpperBound(prefix_index_offset, &upper_bound);
  386. uint32_t low = 0;
  387. uint32_t high = upper_bound;
  388. ParsedInternalKey mid_key;
  389. ParsedInternalKey parsed_target;
  390. if (!ParseInternalKey(target, &parsed_target)) {
  391. return Status::Corruption(Slice());
  392. }
  393. // The key is between [low, high). Do a binary search between it.
  394. while (high - low > 1) {
  395. uint32_t mid = (high + low) / 2;
  396. uint32_t file_offset = GetFixed32Element(base_ptr, mid);
  397. uint32_t tmp;
  398. Status s = decoder->NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
  399. if (!s.ok()) {
  400. return s;
  401. }
  402. int cmp_result = internal_comparator_.Compare(mid_key, parsed_target);
  403. if (cmp_result < 0) {
  404. low = mid;
  405. } else {
  406. if (cmp_result == 0) {
  407. // Happen to have found the exact key or target is smaller than the
  408. // first key after base_offset.
  409. prefix_matched = true;
  410. *offset = file_offset;
  411. return Status::OK();
  412. } else {
  413. high = mid;
  414. }
  415. }
  416. }
  417. // Both of the key at the position low or low+1 could share the same
  418. // prefix as target. We need to rule out one of them to avoid to go
  419. // to the wrong prefix.
  420. ParsedInternalKey low_key;
  421. uint32_t tmp;
  422. uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
  423. Status s = decoder->NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
  424. if (!s.ok()) {
  425. return s;
  426. }
  427. if (GetPrefix(low_key) == prefix) {
  428. prefix_matched = true;
  429. *offset = low_key_offset;
  430. } else if (low + 1 < upper_bound) {
  431. // There is possible a next prefix, return it
  432. prefix_matched = false;
  433. *offset = GetFixed32Element(base_ptr, low + 1);
  434. } else {
  435. // target is larger than a key of the last prefix in this bucket
  436. // but with a different prefix. Key does not exist.
  437. *offset = file_info_.data_end_offset;
  438. }
  439. return Status::OK();
  440. }
  441. bool PlainTableReader::MatchBloom(uint32_t hash) const {
  442. if (!enable_bloom_) {
  443. return true;
  444. }
  445. if (bloom_.MayContainHash(hash)) {
  446. PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
  447. return true;
  448. } else {
  449. PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
  450. return false;
  451. }
  452. }
  453. Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
  454. ParsedInternalKey* parsed_key,
  455. Slice* internal_key, Slice* value,
  456. bool* seekable) const {
  457. if (*offset == file_info_.data_end_offset) {
  458. *offset = file_info_.data_end_offset;
  459. return Status::OK();
  460. }
  461. if (*offset > file_info_.data_end_offset) {
  462. return Status::Corruption("Offset is out of file size");
  463. }
  464. uint32_t bytes_read;
  465. Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
  466. &bytes_read, seekable);
  467. if (!s.ok()) {
  468. return s;
  469. }
  470. *offset = *offset + bytes_read;
  471. return Status::OK();
  472. }
  473. void PlainTableReader::Prepare(const Slice& target) {
  474. if (enable_bloom_) {
  475. uint32_t prefix_hash = GetSliceHash(GetPrefix(target));
  476. bloom_.Prefetch(prefix_hash);
  477. }
  478. }
  479. Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target,
  480. GetContext* get_context,
  481. const SliceTransform* /* prefix_extractor */,
  482. bool /*skip_filters*/) {
  483. // Check bloom filter first.
  484. Slice prefix_slice;
  485. uint32_t prefix_hash;
  486. if (IsTotalOrderMode()) {
  487. if (full_scan_mode_) {
  488. status_ =
  489. Status::InvalidArgument("Get() is not allowed in full scan mode.");
  490. }
  491. // Match whole user key for bloom filter check.
  492. if (!MatchBloom(GetSliceHash(GetUserKey(target)))) {
  493. return Status::OK();
  494. }
  495. // in total order mode, there is only one bucket 0, and we always use empty
  496. // prefix.
  497. prefix_slice = Slice();
  498. prefix_hash = 0;
  499. } else {
  500. prefix_slice = GetPrefix(target);
  501. prefix_hash = GetSliceHash(prefix_slice);
  502. if (!MatchBloom(prefix_hash)) {
  503. return Status::OK();
  504. }
  505. }
  506. uint32_t offset;
  507. bool prefix_match;
  508. PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
  509. prefix_extractor_);
  510. Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash,
  511. prefix_match, &offset);
  512. if (!s.ok()) {
  513. return s;
  514. }
  515. ParsedInternalKey found_key;
  516. ParsedInternalKey parsed_target;
  517. if (!ParseInternalKey(target, &parsed_target)) {
  518. return Status::Corruption(Slice());
  519. }
  520. Slice found_value;
  521. while (offset < file_info_.data_end_offset) {
  522. s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
  523. if (!s.ok()) {
  524. return s;
  525. }
  526. if (!prefix_match) {
  527. // Need to verify prefix for the first key found if it is not yet
  528. // checked.
  529. if (GetPrefix(found_key) != prefix_slice) {
  530. return Status::OK();
  531. }
  532. prefix_match = true;
  533. }
  534. // TODO(ljin): since we know the key comparison result here,
  535. // can we enable the fast path?
  536. if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
  537. bool dont_care __attribute__((__unused__));
  538. if (!get_context->SaveValue(found_key, found_value, &dont_care,
  539. dummy_cleanable_.get())) {
  540. break;
  541. }
  542. }
  543. }
  544. return Status::OK();
  545. }
  546. uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/,
  547. TableReaderCaller /*caller*/) {
  548. return 0;
  549. }
  550. uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/,
  551. const Slice& /*end*/,
  552. TableReaderCaller /*caller*/) {
  553. return 0;
  554. }
  555. PlainTableIterator::PlainTableIterator(PlainTableReader* table,
  556. bool use_prefix_seek)
  557. : table_(table),
  558. decoder_(&table_->file_info_, table_->encoding_type_,
  559. table_->user_key_len_, table_->prefix_extractor_),
  560. use_prefix_seek_(use_prefix_seek) {
  561. next_offset_ = offset_ = table_->file_info_.data_end_offset;
  562. }
  563. PlainTableIterator::~PlainTableIterator() {
  564. }
  565. bool PlainTableIterator::Valid() const {
  566. return offset_ < table_->file_info_.data_end_offset &&
  567. offset_ >= table_->data_start_offset_;
  568. }
  569. void PlainTableIterator::SeekToFirst() {
  570. status_ = Status::OK();
  571. next_offset_ = table_->data_start_offset_;
  572. if (next_offset_ >= table_->file_info_.data_end_offset) {
  573. next_offset_ = offset_ = table_->file_info_.data_end_offset;
  574. } else {
  575. Next();
  576. }
  577. }
  578. void PlainTableIterator::SeekToLast() {
  579. assert(false);
  580. status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable");
  581. next_offset_ = offset_ = table_->file_info_.data_end_offset;
  582. }
  583. void PlainTableIterator::Seek(const Slice& target) {
  584. if (use_prefix_seek_ != !table_->IsTotalOrderMode()) {
  585. // This check is done here instead of NewIterator() to permit creating an
  586. // iterator with total_order_seek = true even if we won't be able to Seek()
  587. // it. This is needed for compaction: it creates iterator with
  588. // total_order_seek = true but usually never does Seek() on it,
  589. // only SeekToFirst().
  590. status_ =
  591. Status::InvalidArgument(
  592. "total_order_seek not implemented for PlainTable.");
  593. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  594. return;
  595. }
  596. // If the user doesn't set prefix seek option and we are not able to do a
  597. // total Seek(). assert failure.
  598. if (table_->IsTotalOrderMode()) {
  599. if (table_->full_scan_mode_) {
  600. status_ =
  601. Status::InvalidArgument("Seek() is not allowed in full scan mode.");
  602. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  603. return;
  604. } else if (table_->GetIndexSize() > 1) {
  605. assert(false);
  606. status_ = Status::NotSupported(
  607. "PlainTable cannot issue non-prefix seek unless in total order "
  608. "mode.");
  609. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  610. return;
  611. }
  612. }
  613. Slice prefix_slice = table_->GetPrefix(target);
  614. uint32_t prefix_hash = 0;
  615. // Bloom filter is ignored in total-order mode.
  616. if (!table_->IsTotalOrderMode()) {
  617. prefix_hash = GetSliceHash(prefix_slice);
  618. if (!table_->MatchBloom(prefix_hash)) {
  619. status_ = Status::OK();
  620. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  621. return;
  622. }
  623. }
  624. bool prefix_match;
  625. status_ = table_->GetOffset(&decoder_, target, prefix_slice, prefix_hash,
  626. prefix_match, &next_offset_);
  627. if (!status_.ok()) {
  628. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  629. return;
  630. }
  631. if (next_offset_ < table_->file_info_.data_end_offset) {
  632. for (Next(); status_.ok() && Valid(); Next()) {
  633. if (!prefix_match) {
  634. // Need to verify the first key's prefix
  635. if (table_->GetPrefix(key()) != prefix_slice) {
  636. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  637. break;
  638. }
  639. prefix_match = true;
  640. }
  641. if (table_->internal_comparator_.Compare(key(), target) >= 0) {
  642. break;
  643. }
  644. }
  645. } else {
  646. offset_ = table_->file_info_.data_end_offset;
  647. }
  648. }
  649. void PlainTableIterator::SeekForPrev(const Slice& /*target*/) {
  650. assert(false);
  651. status_ =
  652. Status::NotSupported("SeekForPrev() is not supported in PlainTable");
  653. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  654. }
  655. void PlainTableIterator::Next() {
  656. offset_ = next_offset_;
  657. if (offset_ < table_->file_info_.data_end_offset) {
  658. Slice tmp_slice;
  659. ParsedInternalKey parsed_key;
  660. status_ =
  661. table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
  662. if (!status_.ok()) {
  663. offset_ = next_offset_ = table_->file_info_.data_end_offset;
  664. }
  665. }
  666. }
  667. void PlainTableIterator::Prev() {
  668. assert(false);
  669. }
  670. Slice PlainTableIterator::key() const {
  671. assert(Valid());
  672. return key_;
  673. }
  674. Slice PlainTableIterator::value() const {
  675. assert(Valid());
  676. return value_;
  677. }
  678. Status PlainTableIterator::status() const {
  679. return status_;
  680. }
  681. } // namespace ROCKSDB_NAMESPACE
  682. #endif // ROCKSDB_LITE