block_based_table_reader.h

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <stdint.h>

#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "file/random_access_file_reader.h"
#include "options/cf_options.h"
#include "rocksdb/options.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_type.h"
#include "table/block_based/cachable_entry.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/format.h"
#include "table/get_context.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/coding.h"
#include "util/user_comparator_wrapper.h"

namespace ROCKSDB_NAMESPACE {

class Cache;
class FilterBlockReader;
class BlockBasedFilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class FSRandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;

// Reader class for BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed-size blocks and
// each block in turn stores entries. When storing data, we can compress and/or
// encode data efficiently within a block, which often results in a much
// smaller data size compared with the raw data size. As for record retrieval,
// we first locate the block where the target record may reside, then read the
// block into memory, and finally search for the record within the block. Of
// course, to avoid frequent reads of the same block, we introduced the block
// cache to keep the loaded blocks in memory.
class BlockBasedTable : public TableReader {
 public:
  static const std::string kFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;

  // The longest prefix of the cache key used to identify blocks.
  // For Posix files the unique ID is three varints.
  static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;

  // All the below fields control iterator readahead
  static const size_t kInitAutoReadaheadSize = 8 * 1024;
  // Found that 256 KB readahead size provides the best performance, based on
  // experiments, for auto readahead. Experiment data is in PR #3282.
  static const size_t kMaxAutoReadaheadSize;
  static const int kMinNumFileReadsToStartAutoReadahead = 2;
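
  // Illustrative sketch (an assumption, not part of the original header):
  // automatic readahead for iterators is presumed to start at
  // kInitAutoReadaheadSize once kMinNumFileReadsToStartAutoReadahead
  // sequential file reads have been observed, and to double on each further
  // read until it is capped at kMaxAutoReadaheadSize (256 KB per the comment
  // above):
  //
  //   size_t readahead = BlockBasedTable::kInitAutoReadaheadSize;  // 8 KB
  //   const size_t kCap = 256 * 1024;  // assumed value of kMaxAutoReadaheadSize
  //   auto next_readahead = [&]() {
  //     readahead = std::min(kCap, readahead * 2);  // 16 KB, 32 KB, ..., 256 KB
  //     return readahead;
  //   };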

  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table. The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup
  // @param skip_filters Disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so filter will be skipped if both
  //    are set.
  static Status Open(const ImmutableCFOptions& ioptions,
                     const EnvOptions& env_options,
                     const BlockBasedTableOptions& table_options,
                     const InternalKeyComparator& internal_key_comparator,
                     std::unique_ptr<RandomAccessFileReader>&& file,
                     uint64_t file_size,
                     std::unique_ptr<TableReader>* table_reader,
                     const SliceTransform* prefix_extractor = nullptr,
                     bool prefetch_index_and_filter_in_cache = true,
                     bool skip_filters = false, int level = -1,
                     const bool immortal_table = false,
                     const SequenceNumber largest_seqno = 0,
                     TailPrefetchStats* tail_prefetch_stats = nullptr,
                     BlockCacheTracer* const block_cache_tracer = nullptr);
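
  // Illustrative caller sketch (an assumption, not part of the original
  // header; `ioptions`, `env_options`, `table_options`, `icomp`, `file`, and
  // `file_size` are presumed to already exist in the caller):
  //
  //   std::unique_ptr<TableReader> table_reader;
  //   Status s = BlockBasedTable::Open(ioptions, env_options, table_options,
  //                                    icomp, std::move(file), file_size,
  //                                    &table_reader);
  //   if (!s.ok()) {
  //     return s;  // table_reader is nullptr on failure
  //   }
  //   // table_reader now owns the opened table and must outlive its
  //   // iterators.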

  bool PrefixMayMatch(const Slice& internal_key,
                      const ReadOptions& read_options,
                      const SliceTransform* options_prefix_extractor,
                      const bool need_upper_bound_check,
                      BlockCacheLookupContext* lookup_context) const;

  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (caller must
  // call one of the Seek methods on the iterator before using it).
  // @param skip_filters Disables loading/accessing the filter block
  // compaction_readahead_size: its value will only be used if caller =
  // kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0) override;
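
  // Illustrative usage sketch (an assumption, not part of the original
  // header; `table`, `read_options`, and `prefix_extractor` are presumed to
  // exist):
  //
  //   std::unique_ptr<InternalIterator> iter(table->NewIterator(
  //       read_options, prefix_extractor, /*arena=*/nullptr,
  //       /*skip_filters=*/false, TableReaderCaller::kUserIterator));
  //   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  //     // iter->key() is an internal key; iter->value() is the user value.
  //   }
  //   // If an arena is supplied instead, the iterator is constructed in
  //   // place and must not be freed with delete.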

  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;

  // @param skip_filters Disables loading/accessing the filter block
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  void MultiGet(const ReadOptions& readOptions,
                const MultiGetContext::Range* mget_range,
                const SliceTransform* prefix_extractor,
                bool skip_filters = false) override;

  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (kbegin, kend). The call will return error status in the event of
  // IO or iteration error.
  Status Prefetch(const Slice* begin, const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const Slice& key,
                               TableReaderCaller caller) override;

  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const Slice& start, const Slice& end,
                           TableReaderCaller caller) override;
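
  // Illustrative sketch (an assumption, not part of the original header):
  // the size of a key range can also be expressed as the difference of the
  // two per-key offsets, so for internal keys with start_ikey <= end_ikey one
  // would expect approximately:
  //
  //   uint64_t lo = table->ApproximateOffsetOf(start_ikey, caller);
  //   uint64_t hi = table->ApproximateOffsetOf(end_ikey, caller);
  //   uint64_t approx_bytes = hi - lo;
  //   // approx_bytes ~= table->ApproximateSize(start_ikey, end_ikey, caller)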

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  // Set up the table for Compaction. Might change some parameters with
  // posix_fadvise
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  size_t ApproximateMemoryUsage() const override;

  // convert SST file to a human readable form
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;

  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, then a new object
    // is created on the heap, and the callee will have the ownership.
    // If a non-null iter is passed in, it will be used, and the returned value
    // is either the same as iter or a new on-heap object that
    // wraps the passed iter. In the latter case the return value points
    // to a different object than iter, and the callee has the ownership of
    // the returned object.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;

    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;

    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index).
    virtual void CacheDependencies(bool /* pin */) {}
  };
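
  // Illustrative sketch of the iterator-reuse contract above (an assumption,
  // not part of the original header; `index_reader`, `read_options`, and
  // `lookup_context` are presumed to exist):
  //
  //   IndexBlockIter iter_storage;
  //   InternalIteratorBase<IndexValue>* iter = index_reader->NewIterator(
  //       read_options, /*disable_prefix_seek=*/false, &iter_storage,
  //       /*get_context=*/nullptr, lookup_context);
  //   // ... use iter ...
  //   if (iter != &iter_storage) {
  //     delete iter;  // a wrapping object was returned on the heap
  //   }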

  class IndexReaderCommon;

  static Slice GetCacheKey(const char* cache_key_prefix,
                           size_t cache_key_prefix_size,
                           const BlockHandle& handle, char* cache_key);
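
  // Illustrative sketch (an assumption, not part of the original header): the
  // block cache key is presumed to be the per-file cache key prefix followed
  // by a varint encoding of the block's offset, written into a caller-owned
  // buffer of at least kMaxCacheKeyPrefixSize + kMaxVarint64Length bytes;
  // `rep` and `handle` are presumed to exist:
  //
  //   char buf[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length];
  //   Slice key = BlockBasedTable::GetCacheKey(rep->cache_key_prefix,
  //                                            rep->cache_key_prefix_size,
  //                                            handle, buf);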

  // Retrieve all key value pairs from data blocks in the table.
  // The keys retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(
      const ReadOptions& ro, const BlockHandle& block_handle,
      TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
      BlockCacheLookupContext* lookup_context, Status s,
      FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;

 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep,
                           BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

 private:
  friend class MockedBlockBasedTable;
  static std::atomic<uint64_t> next_cache_key_id_;
  BlockCacheTracer* const block_cache_tracer_;

  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;
  void UpdateCacheInsertionMetrics(BlockType block_type,
                                   GetContext* get_context,
                                   size_t usage) const;
  Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
                                   BlockType block_type,
                                   GetContext* get_context) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);

  // If block cache enabled (compressed or uncompressed), looks for the block
  // identified by handle in (1) uncompressed cache, (2) compressed cache, and
  // then (3) file. If found, inserts into the cache(s) that were searched
  // unsuccessfully (e.g., if found in file, will add to both uncompressed and
  // compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    in uncompressed block cache, also sets cache_handle to reference that
  //    block.
  template <typename TBlocklike>
  Status MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
      CachableEntry<TBlocklike>* block_entry, BlockType block_type,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
                       const ReadOptions& ro, const BlockHandle& handle,
                       const UncompressionDict& uncompression_dict,
                       CachableEntry<TBlocklike>* block_entry,
                       BlockType block_type, GetContext* get_context,
                       BlockCacheLookupContext* lookup_context,
                       bool for_compaction, bool use_cache) const;

  void RetrieveMultipleBlocks(
      const ReadOptions& options, const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
      autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
          results,
      char* scratch, const UncompressionDict& uncompression_dict) const;

  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However note that in some cases the returned iterator may be different
  // from input_iter. In such case the returned iterator should be freed.
  //
  // Note: ErrorIterator with Status::Incomplete shall be returned if all the
  // following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. index is not present in block cache.
  //  3. We disallowed any io to be performed, that is,
  //     read_options.read_tier == kBlockCacheTier.
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;

  // Read a block from the block caches (if set): block_cache and
  // block_cache_compressed.
  // On success, Status::OK will be returned and @block will be populated with
  // a pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status GetDataBlockFromCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
      const UncompressionDict& uncompression_dict, BlockType block_type,
      GetContext* get_context) const;

  // Put a raw block (maybe compressed) into the corresponding block caches.
  // This method will perform decompression against raw_block if needed and
  // then populate the block caches.
  // On success, Status::OK will be returned; also @block will be populated
  // with the uncompressed block and its cache handle.
  //
  // Allocated memory managed by raw_block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  Status PutDataBlockToCache(
      const Slice& block_cache_key, const Slice& compressed_block_cache_key,
      Cache* block_cache, Cache* block_cache_compressed,
      CachableEntry<TBlocklike>* cached_block,
      BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
      const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
      MemoryAllocator* memory_allocator, BlockType block_type,
      GetContext* get_context) const;

  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if filter policy says that key is not present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, the user can pass a preloaded meta_index_iter for index types
  // that need to access extra meta blocks for index construction. This
  // parameter helps avoid re-reading the meta index block if the caller
  // already created one.
  Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);

  bool FullFilterKeyMayMatch(const ReadOptions& read_options,
                             FilterBlockReader* filter, const Slice& user_key,
                             const bool no_io,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context) const;

  void FullFilterKeysMayMatch(const ReadOptions& read_options,
                              FilterBlockReader* filter, MultiGetRange* range,
                              const bool no_io,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context) const;

  static Status PrefetchTail(
      RandomAccessFileReader* file, uint64_t file_size,
      TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
      const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
  Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
                                          const Slice& handle_value,
                                          TableProperties** table_properties);
  Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);
  Status PrefetchIndexAndFilterBlocks(
      FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
      BlockBasedTable* new_table, bool prefetch_all,
      const BlockBasedTableOptions& table_options, const int level,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(
      const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);

  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
      bool pin, BlockCacheLookupContext* lookup_context);

  static void SetupCacheKeyPrefix(Rep* rep);

  // Generate a cache key prefix from the file
  static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
                                  char* buffer, size_t* size);
  static void GenerateCachePrefix(Cache* cc, FSWritableFile* file,
                                  char* buffer, size_t* size);

  // Given an iterator return its offset in file.
  uint64_t ApproximateOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(WritableFile* out_file);
  Status DumpDataBlocks(WritableFile* out_file);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    WritableFile* out_file);

  // If the cumulative size of the data blocks read by a MultiGet batch is
  // below this threshold, a stack buffer is used for the read scratch space.
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};

// Maintains the state of a two-level iteration on a partitioned index
// structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Don't own table_
  const BlockBasedTable* table_;
  std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
};

// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      int _level, const bool _immortal_table)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        hash_index_allow_collision(false),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        level(_level),
        immortal_table(_immortal_table) {}

  const ImmutableCFOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  char cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t cache_key_prefix_size = 0;
  char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t persistent_cache_key_prefix_size = 0;
  char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
  size_t compressed_cache_key_prefix_size = 0;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kBlockFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  BlockBasedTableOptions::IndexType index_type;
  bool hash_index_allow_collision;
  bool whole_key_filtering;
  bool prefix_filtering;
  // TODO(kailiu) It is very ugly to use internal key in table, since table
  // module should not be relying on db module. However to make things easier
  // and compatible with existing code, we introduce a wrapper that allows
  // block to extract prefix without knowing if a key is internal or not.
  // null if no prefix_extractor is passed in when opening the table reader.
  std::unique_ptr<SliceTransform> internal_prefix_transform;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;

  // If global_seqno is used, all keys in this file will have the same
  // seqno with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // The level at which the table is opened; could potentially change when a
  // trivial move is involved.
  int level;

  // If false, blocks in this file are definitely all uncompressed. Knowing
  // this before reading individual blocks enables certain optimizations.
  bool blocks_maybe_compressed = true;

  // If true, data blocks in this file are definitely ZSTD compressed. If false
  // they might not be. When false we skip creating a ZSTD digested
  // uncompression dictionary. Even if we get a false negative, things should
  // still work, just not as quickly.
  bool blocks_definitely_zstd_compressed = false;

  // These describe how the index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  const bool immortal_table;

  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilter ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }

  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const {
    return level >= 0 ? level : UINT32_MAX;
  }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }

  void CreateFilePrefetchBuffer(
      size_t readahead_size, size_t max_readahead_size,
      std::unique_ptr<FilePrefetchBuffer>* fpb) const {
    fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size,
                                      max_readahead_size,
                                      !ioptions.allow_mmap_reads /* enable */));
  }
};

// Iterates over the contents of BlockBasedTable.
template <class TBlockIter, typename TValue = Slice>
class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
  // compaction_readahead_size: its value will only be used if for_compaction =
  // true
 public:
  BlockBasedTableIterator(const BlockBasedTable* table,
                          const ReadOptions& read_options,
                          const InternalKeyComparator& icomp,
                          InternalIteratorBase<IndexValue>* index_iter,
                          bool check_filter, bool need_upper_bound_check,
                          const SliceTransform* prefix_extractor,
                          BlockType block_type, TableReaderCaller caller,
                          size_t compaction_readahead_size = 0)
      : table_(table),
        read_options_(read_options),
        icomp_(icomp),
        user_comparator_(icomp.user_comparator()),
        index_iter_(index_iter),
        pinned_iters_mgr_(nullptr),
        block_iter_points_to_real_block_(false),
        check_filter_(check_filter),
        need_upper_bound_check_(need_upper_bound_check),
        prefix_extractor_(prefix_extractor),
        block_type_(block_type),
        lookup_context_(caller),
        compaction_readahead_size_(compaction_readahead_size) {}

  ~BlockBasedTableIterator() { delete index_iter_; }

  void Seek(const Slice& target) override;
  void SeekForPrev(const Slice& target) override;
  void SeekToFirst() override;
  void SeekToLast() override;
  void Next() final override;
  bool NextAndGetResult(IterateResult* result) override;
  void Prev() override;

  bool Valid() const override {
    return !is_out_of_bound_ &&
           (is_at_first_key_from_index_ ||
            (block_iter_points_to_real_block_ && block_iter_.Valid()));
  }
  Slice key() const override {
    assert(Valid());
    if (is_at_first_key_from_index_) {
      return index_iter_->value().first_internal_key;
    } else {
      return block_iter_.key();
    }
  }
  Slice user_key() const override {
    assert(Valid());
    if (is_at_first_key_from_index_) {
      return ExtractUserKey(index_iter_->value().first_internal_key);
    } else {
      return block_iter_.user_key();
    }
  }
  TValue value() const override {
    assert(Valid());

    // Load current block if not loaded.
    if (is_at_first_key_from_index_ &&
        !const_cast<BlockBasedTableIterator*>(this)
             ->MaterializeCurrentBlock()) {
      // Oops, index is not consistent with block contents, but we have
      // no good way to report error at this point. Let's return empty value.
      return TValue();
    }

    return block_iter_.value();
  }
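
  // Illustrative note (an assumption, not part of the original header): the
  // lazy materialization above means key() can be answered from the index
  // entry alone, while value() may have to read the data block, e.g.:
  //
  //   iter.Seek(target);        // may stop at the index's first key only
  //   if (iter.Valid()) {
  //     Slice k = iter.key();   // no data block load needed
  //     Slice v = iter.value(); // loads the block if it is not loaded yet
  //   }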

  Status status() const override {
    // Prefix index sets status to NotFound when the prefix does not exist.
    if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
      return index_iter_->status();
    } else if (block_iter_points_to_real_block_) {
      return block_iter_.status();
    } else {
      return Status::OK();
    }
  }

  // Whether the iterator was invalidated for being out of bound.
  bool IsOutOfBound() override { return is_out_of_bound_; }

  inline bool MayBeOutOfUpperBound() override {
    assert(Valid());
    return !data_block_within_upper_bound_;
  }

  void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
    pinned_iters_mgr_ = pinned_iters_mgr;
  }
  bool IsKeyPinned() const override {
    // Our key comes either from block_iter_'s current key
    // or index_iter_'s current *value*.
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
            (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
  }
  bool IsValuePinned() const override {
    // Load current block if not loaded.
    if (is_at_first_key_from_index_) {
      const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
    }
    // BlockIter::IsValuePinned() is always true. No need to check.
    return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
           block_iter_points_to_real_block_;
  }

  void ResetDataIter() {
    if (block_iter_points_to_real_block_) {
      if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
        block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
      }
      block_iter_.Invalidate(Status::OK());
      block_iter_points_to_real_block_ = false;
    }
  }

  void SavePrevIndexValue() {
    if (block_iter_points_to_real_block_) {
      // Reseek. If we end up on the same data block, we shouldn't re-fetch it.
      prev_block_offset_ = index_iter_->value().handle.offset();
    }
  }

 private:
  enum class IterDirection {
    kForward,
    kBackward,
  };

  const BlockBasedTable* table_;
  const ReadOptions read_options_;
  const InternalKeyComparator& icomp_;
  UserComparatorWrapper user_comparator_;
  InternalIteratorBase<IndexValue>* index_iter_;
  PinnedIteratorsManager* pinned_iters_mgr_;
  TBlockIter block_iter_;

  // True if block_iter_ is initialized and points to the same block
  // as the index iterator.
  bool block_iter_points_to_real_block_;
  // See InternalIteratorBase::IsOutOfBound().
  bool is_out_of_bound_ = false;
  // Whether the current data block is fully within the iterate upper bound.
  bool data_block_within_upper_bound_ = false;
  // True if we're standing at the first key of a block, and we haven't loaded
  // that block yet. A call to value() will trigger loading the block.
  bool is_at_first_key_from_index_ = false;
  bool check_filter_;
  // TODO(Zhongyi): pick a better name
  bool need_upper_bound_check_;
  const SliceTransform* prefix_extractor_;
  BlockType block_type_;
  uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
  BlockCacheLookupContext lookup_context_;
  // Readahead size used in compaction; its value is used only if
  // lookup_context_.caller == kCompaction.
  size_t compaction_readahead_size_;

  size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
  size_t readahead_limit_ = 0;
  int64_t num_file_reads_ = 0;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;

  // If `target` is null, seek to first.
  void SeekImpl(const Slice* target);
  void InitDataBlock();
  bool MaterializeCurrentBlock();
  void FindKeyForward();
  void FindBlockForward();
  void FindKeyBackward();
  void CheckOutOfBound();

  // Check if the data block is fully within iterate_upper_bound.
  //
  // Note that MyRocks may update the iterate bounds between seeks. To work
  // around this, we need to check and update data_block_within_upper_bound_
  // accordingly.
  void CheckDataBlockWithinUpperBound();

  bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
    if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
      // Upper bound check isn't sufficient for the backward direction to
      // guarantee the same result as total order, so disable prefix
      // check.
      return true;
    }
    if (check_filter_ &&
        !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
                                need_upper_bound_check_, &lookup_context_)) {
      // TODO remember the iterator is invalidated because of prefix
      // match. This can prevent the upper-level file iterator from falsely
      // believing the position is the end of the SST file and moving to
      // the first key of the next file.
      ResetDataIter();
      return false;
    }
    return true;
  }
};

}  // namespace ROCKSDB_NAMESPACE