// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once

#include <cstdint>
#include <memory>

#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "cache/cache_reservation_manager.h"
#include "db/range_tombstone_fragmenter.h"
#include "db/seqno_to_time_mapping.h"
#include "file/filename.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table_properties.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_factory.h"
#include "table/block_based/block_cache.h"
#include "table/block_based/block_type.h"
#include "table/block_based/cachable_entry.h"
#include "table/block_based/filter_block.h"
#include "table/block_based/uncompression_dict_reader.h"
#include "table/format.h"
#include "table/persistent_cache_options.h"
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/atomic.h"
#include "util/cast_util.h"
#include "util/coro_utils.h"
#include "util/hash_containers.h"
namespace ROCKSDB_NAMESPACE {

class Cache;
class FilterBlockReader;
class FullFilterBlockReader;
class Footer;
class InternalKeyComparator;
class Iterator;
class FSRandomAccessFile;
class TableCache;
class TableReader;
class WritableFile;
struct BlockBasedTableOptions;
struct EnvOptions;
struct ReadOptions;
class GetContext;

using KVPairBlock = std::vector<std::pair<std::string, std::string>>;
// Reader class for the BlockBasedTable format.
// For the format of BlockBasedTable refer to
// https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
// This is the default table type. Data is chunked into fixed-size blocks and
// each block in turn stores entries. When storing data, we can compress and/or
// encode data efficiently within a block, which often results in a much
// smaller data size compared with the raw data size. As for record retrieval,
// we first locate the block where the target record may reside, then read the
// block into memory, and finally search for the record within the block. To
// avoid frequent reads of the same block, we introduced the block cache to
// keep loaded blocks in memory.
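//
// Simplified file layout (an illustrative sketch of the format documented at
// the wiki link above, not a normative spec):
//   [data block 1] ... [data block N]
//   [meta blocks: filter, index, compression dictionary, range deletion, ...]
//   [metaindex block]
//   [footer]  (fixed size; locates the metaindex and index blocks)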
class BlockBasedTable : public TableReader {
 public:
  static const std::string kObsoleteFilterBlockPrefix;
  static const std::string kFullFilterBlockPrefix;
  static const std::string kPartitionedFilterBlockPrefix;

  // 1-byte compression type + 32-bit checksum
  static constexpr size_t kBlockTrailerSize = 5;
  // Attempt to open the table that is stored in bytes [0..file_size)
  // of "file", and read the metadata entries necessary to allow
  // retrieving data from the table.
  //
  // If successful, returns ok and sets "*table_reader" to the newly opened
  // table. The client should delete "*table_reader" when no longer needed.
  // If there was an error while initializing the table, sets "*table_reader"
  // to nullptr and returns a non-ok status.
  //
  // @param file must remain live while this Table is in use.
  // @param prefetch_index_and_filter_in_cache can be used to disable
  //    prefetching of index and filter blocks into block cache at startup
  // @param skip_filters disables loading/accessing the filter block. Overrides
  //    prefetch_index_and_filter_in_cache, so the filter will be skipped if
  //    both are set.
  // @param force_direct_prefetch if true, always prefetch into RocksDB's
  //    buffer, rather than calling RandomAccessFile::Prefetch().
  static Status Open(
      const ReadOptions& ro, const ImmutableOptions& ioptions,
      const EnvOptions& env_options,
      const BlockBasedTableOptions& table_options,
      const InternalKeyComparator& internal_key_comparator,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      uint8_t block_protection_bytes_per_key,
      std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
      std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
          nullptr,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      UnownedPtr<CompressionManager> compression_manager = nullptr,
      bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false,
      int level = -1, const bool immortal_table = false,
      const SequenceNumber largest_seqno = 0,
      bool force_direct_prefetch = false,
      TailPrefetchStats* tail_prefetch_stats = nullptr,
      BlockCacheTracer* const block_cache_tracer = nullptr,
      size_t max_file_size_for_l0_meta_pin = 0,
      const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0,
      UniqueId64x2 expected_unique_id = {},
      const bool user_defined_timestamps_persisted = true);
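  //
  // A minimal usage sketch (illustrative only; assumes the caller has already
  // prepared `read_options`, `ioptions`, `env_options`, `table_options`, the
  // comparator `icomp`, and an open `file_reader` of size `file_size`):
  //   std::unique_ptr<TableReader> reader;
  //   Status s = BlockBasedTable::Open(
  //       read_options, ioptions, env_options, table_options, icomp,
  //       std::move(file_reader), file_size,
  //       /*block_protection_bytes_per_key=*/0, &reader, /*tail_size=*/0);
  //   // On success, `reader` owns the table and can serve reads.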
  bool PrefixRangeMayMatch(const Slice& internal_key,
                           const ReadOptions& read_options,
                           const SliceTransform* options_prefix_extractor,
                           const bool need_upper_bound_check,
                           BlockCacheLookupContext* lookup_context,
                           bool* filter_checked) const;
  // Returns a new iterator over the table contents.
  // The result of NewIterator() is initially invalid (the caller must
  // call one of the Seek methods on the iterator before using it).
  // @param read_options must outlive the returned iterator.
  // @param skip_filters disables loading/accessing the filter block.
  // @param compaction_readahead_size only used if caller == kCompaction.
  InternalIterator* NewIterator(const ReadOptions&,
                                const SliceTransform* prefix_extractor,
                                Arena* arena, bool skip_filters,
                                TableReaderCaller caller,
                                size_t compaction_readahead_size = 0,
                                bool allow_unprepared_value = false) override;
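  //
  // Sketch of the iteration contract above (illustrative; `reader` is assumed
  // to be an open BlockBasedTable and `ro` a ReadOptions that outlives `it`):
  //   std::unique_ptr<InternalIterator> it(reader->NewIterator(
  //       ro, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
  //       /*skip_filters=*/false, TableReaderCaller::kUserIterator));
  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
  //     // use it->key() / it->value()
  //   }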
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      const ReadOptions& read_options) override;
  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
      SequenceNumber read_seqno, const Slice* timestamp) override;

  // @param skip_filters disables loading/accessing the filter block
  Status Get(const ReadOptions& readOptions, const Slice& key,
             GetContext* get_context, const SliceTransform* prefix_extractor,
             bool skip_filters = false) override;

  Status MultiGetFilter(const ReadOptions& read_options,
                        const SliceTransform* prefix_extractor,
                        MultiGetRange* mget_range) override;

  DECLARE_SYNC_AND_ASYNC_OVERRIDE(void, MultiGet,
                                  const ReadOptions& readOptions,
                                  const MultiGetContext::Range* mget_range,
                                  const SliceTransform* prefix_extractor,
                                  bool skip_filters = false);
  // Pre-fetch the disk blocks that correspond to the key range specified by
  // (kbegin, kend). The call will return an error status in the event of an
  // IO or iteration error.
  Status Prefetch(const ReadOptions& read_options, const Slice* begin,
                  const Slice* end) override;

  // Given a key, return an approximate byte offset in the file where
  // the data for that key begins (or would begin if the key were
  // present in the file). The returned value is in terms of file
  // bytes, and so includes effects like compression of the underlying data.
  // E.g., the approximate offset of the last key in the table will
  // be close to the file length.
  uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
                               const Slice& key,
                               TableReaderCaller caller) override;

  // Given start and end keys, return the approximate data size in the file
  // between the keys. The returned value is in terms of file bytes, and so
  // includes effects like compression of the underlying data.
  // The start key must not be greater than the end key.
  uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start,
                           const Slice& end, TableReaderCaller caller) override;

  Status ApproximateKeyAnchors(const ReadOptions& read_options,
                               std::vector<Anchor>& anchors) override;
  bool EraseFromCache(const BlockHandle& handle) const;

  bool TEST_BlockInCache(const BlockHandle& handle) const;

  // Returns true if the block for the specified key is in cache.
  // REQUIRES: key is in this table && block cache enabled
  bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);

  void TEST_GetDataBlockHandle(const ReadOptions& options, const Slice& key,
                               BlockHandle& handle);

  // Set up the table for compaction. Might change some parameters with
  // posix_fadvise.
  void SetupForCompaction() override;

  std::shared_ptr<const TableProperties> GetTableProperties() const override;

  const SeqnoToTimeMapping& GetSeqnoToTimeMapping() const;

  size_t ApproximateMemoryUsage() const override;

  // Convert the SST file to a human-readable form.
  Status DumpTable(WritableFile* out_file) override;

  Status VerifyChecksum(const ReadOptions& readOptions,
                        TableReaderCaller caller) override;

  void MarkObsolete(uint32_t uncache_aggressiveness) override;

  ~BlockBasedTable();

  bool TEST_FilterBlockInCache() const;
  bool TEST_IndexBlockInCache() const;
  // IndexReader is the interface that provides the functionality for index
  // access.
  class IndexReader {
   public:
    virtual ~IndexReader() = default;

    // Create an iterator for index access. If iter is null, then a new object
    // is created on the heap, and the caller will have ownership of it.
    // If a non-null iter is passed in, it may be used, and the returned value
    // is either the same as iter or a new on-heap object.
    // In the latter case the return value points to a different object than
    // iter, and the caller has ownership of the returned object.
    //
    // Under all circumstances, the caller MUST use the returned iterator
    // for further operations. If the returned iterator != iter, then the
    // caller MUST ensure that iter stays in scope until the returned
    // iterator is destroyed.
    virtual InternalIteratorBase<IndexValue>* NewIterator(
        const ReadOptions& read_options, bool disable_prefix_seek,
        IndexBlockIter* iter, GetContext* get_context,
        BlockCacheLookupContext* lookup_context) = 0;
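    //
    // Sketch of the ownership contract above (illustrative only):
    //   IndexBlockIter on_stack;
    //   auto* it = index_reader->NewIterator(
    //       ro, /*disable_prefix_seek=*/false, &on_stack,
    //       /*get_context=*/nullptr, /*lookup_context=*/nullptr);
    //   // ... use `it`, keeping `on_stack` alive ...
    //   if (it != &on_stack) {
    //     delete it;  // only a distinct heap object is owned by the caller
    //   }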
    // Report an approximation of how much memory has been used other than
    // memory that was allocated in block cache.
    virtual size_t ApproximateMemoryUsage() const = 0;

    // Cache the dependencies of the index reader (e.g. the partitions
    // of a partitioned index).
    virtual Status CacheDependencies(
        const ReadOptions& /*ro*/, bool /* pin */,
        FilePrefetchBuffer* /* tail_prefetch_buffer */) {
      return Status::OK();
    }

    virtual void EraseFromCacheBeforeDestruction(
        uint32_t /*uncache_aggressiveness*/) {}
  };
  class IndexReaderCommon;

  static void SetupBaseCacheKey(const TableProperties* properties,
                                const std::string& cur_db_session_id,
                                uint64_t cur_file_number,
                                OffsetableCacheKey* out_base_cache_key,
                                bool* out_is_stable = nullptr);

  static CacheKey GetCacheKey(const OffsetableCacheKey& base_cache_key,
                              const BlockHandle& handle);

  static void UpdateCacheInsertionMetrics(BlockType block_type,
                                          GetContext* get_context, size_t usage,
                                          bool redundant,
                                          Statistics* const statistics);

  Statistics* GetStatistics() const;
  bool IsLastLevel() const;

  // Get the size to read from storage for a BlockHandle. size_t because we
  // are about to load into memory.
  static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) {
    return static_cast<size_t>(handle.size() + kBlockTrailerSize);
  }
  // It is the caller's responsibility to make sure that this is called with
  // block-based table serialized block contents, which contain the compression
  // byte in the trailer after `block_size`.
  static inline CompressionType GetBlockCompressionType(const char* block_data,
                                                        size_t block_size) {
    return static_cast<CompressionType>(block_data[block_size]);
  }
  static inline CompressionType GetBlockCompressionType(
      const BlockContents& contents) {
    assert(contents.has_trailer);
    return GetBlockCompressionType(contents.data.data(), contents.data.size());
  }
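  // For reference, the trailer layout implied by kBlockTrailerSize (5 bytes:
  // 1-byte compression type + 32-bit checksum) for a block of `n` payload
  // bytes is:
  //   [payload: n bytes][compression type: 1 byte][checksum: 4 bytes]
  // so GetBlockCompressionType(block_data, n) simply reads block_data[n].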
  // Retrieve all key-value pairs from data blocks in the table.
  // The keys retrieved are internal keys.
  Status GetKVPairsFromDataBlocks(const ReadOptions& read_options,
                                  std::vector<KVPairBlock>* kv_pair_blocks);

  // Look up the block cache for the specified block.
  // out_parsed_block is set to nullptr if the block is not found in the cache.
  template <typename TBlocklike>
  Status LookupAndPinBlocksInCache(
      const ReadOptions& ro, const BlockHandle& handle,
      CachableEntry<TBlocklike>* out_parsed_block) const;

  // Create the block given in `block_contents` and insert it into the block
  // cache. `out_parsed_block` points to the inserted block if successful.
  template <typename TBlocklike>
  Status CreateAndPinBlockInCache(
      const ReadOptions& ro, const BlockHandle& handle,
      UnownedPtr<Decompressor> decomp, BlockContents* block_contents,
      CachableEntry<TBlocklike>* out_parsed_block) const;

  struct Rep;

  Rep* get_rep() { return rep_; }
  const Rep* get_rep() const { return rep_; }

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(
      const ReadOptions& ro, const BlockHandle& block_handle,
      TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
      BlockCacheLookupContext* lookup_context,
      FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read,
      Status& s, bool use_block_cache_for_lookup) const;

  // input_iter: if it is not null, update this one and return it as Iterator
  template <typename TBlockIter>
  TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
                                   CachableEntry<Block>& block,
                                   TBlockIter* input_iter, Status s) const;

  class PartitionedIndexIteratorState;

  template <typename TBlocklike>
  friend class FilterBlockReaderCommon;

  friend class PartitionIndexReader;

  friend class UncompressionDictReader;
 protected:
  Rep* rep_;
  explicit BlockBasedTable(Rep* rep,
                           BlockCacheTracer* const block_cache_tracer)
      : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
  // No copying allowed
  explicit BlockBasedTable(const TableReader&) = delete;
  void operator=(const TableReader&) = delete;

 private:
  friend class MockedBlockBasedTable;
  friend class BlockBasedTableReaderTestVerifyChecksum_ChecksumMismatch_Test;
  BlockCacheTracer* const block_cache_tracer_;
  void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
                             size_t usage) const;
  void UpdateCacheMissMetrics(BlockType block_type,
                              GetContext* get_context) const;

  // Either Block::NewDataIterator() or Block::NewIndexIterator().
  template <typename TBlockIter>
  static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
                                       BlockType block_type,
                                       TBlockIter* input_iter,
                                       bool block_contents_pinned);

  // If block cache is enabled (compressed or uncompressed), looks for the
  // block identified by handle in (1) uncompressed cache, (2) compressed
  // cache, and then (3) file. If found, inserts into the cache(s) that were
  // searched unsuccessfully (e.g., if found in file, will add to both
  // uncompressed and compressed caches if they're enabled).
  //
  // @param block_entry value is set to the uncompressed block if found. If
  //    found in the uncompressed block cache, also sets cache_handle to
  //    reference that block.
  template <typename TBlocklike>
  WithBlocklikeCheck<Status, TBlocklike> MaybeReadBlockAndLoadToCache(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
      bool for_compaction, CachableEntry<TBlocklike>* block_entry,
      GetContext* get_context, BlockCacheLookupContext* lookup_context,
      BlockContents* contents, bool async_read,
      bool use_block_cache_for_lookup) const;

  // Similar to the above, with one crucial difference: it will retrieve the
  // block from the file even if there are no caches configured (assuming the
  // read options allow I/O).
  template <typename TBlocklike>
  WithBlocklikeCheck<Status, TBlocklike> RetrieveBlock(
      FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
      CachableEntry<TBlocklike>* block_entry, GetContext* get_context,
      BlockCacheLookupContext* lookup_context, bool for_compaction,
      bool use_cache, bool async_read, bool use_block_cache_for_lookup) const;

  template <typename TBlocklike>
  WithBlocklikeCheck<void, TBlocklike> SaveLookupContextOrTraceRecord(
      const Slice& block_key, bool is_cache_hit, const ReadOptions& ro,
      const TBlocklike* parsed_block_value,
      BlockCacheLookupContext* lookup_context) const;

  void FinishTraceRecord(const BlockCacheLookupContext& lookup_context,
                         const Slice& block_key, const Slice& referenced_key,
                         bool does_referenced_key_exist,
                         uint64_t referenced_data_size) const;

  DECLARE_SYNC_AND_ASYNC_CONST(
      void, RetrieveMultipleBlocks, const ReadOptions& options,
      const MultiGetRange* batch,
      const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
      Status* statuses, CachableEntry<Block_kData>* results, char* scratch,
      UnownedPtr<Decompressor> decomp, bool use_fs_scratch);
  // Get the iterator from the index reader.
  //
  // If input_iter is not set, return a new Iterator.
  // If input_iter is set, try to update it and return it as Iterator.
  // However, note that in some cases the returned iterator may be different
  // from input_iter. In such a case the returned iterator should be freed.
  //
  // Note: an ErrorIterator with Status::Incomplete shall be returned if all of
  // the following conditions are met:
  //  1. We enabled table_options.cache_index_and_filter_blocks.
  //  2. The index is not present in the block cache.
  //  3. We disallowed any IO to be performed, that is,
  //     read_options.read_tier == kBlockCacheTier.
  InternalIteratorBase<IndexValue>* NewIndexIterator(
      const ReadOptions& read_options, bool need_upper_bound_check,
      IndexBlockIter* input_iter, GetContext* get_context,
      BlockCacheLookupContext* lookup_context) const;
  template <typename TBlocklike>
  Cache::Priority GetCachePriority() const;

  // Read the block from the block cache (if set): block_cache.
  // On success, Status::OK will be returned and @block will be populated with
  // a pointer to the block as well as its block handle.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  WithBlocklikeCheck<Status, TBlocklike> GetDataBlockFromCache(
      const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
      CachableEntry<TBlocklike>* block, GetContext* get_context,
      UnownedPtr<Decompressor> decomp) const;

  // Put a possibly compressed block into the corresponding block caches.
  // This method will perform decompression against block_contents if needed
  // and then populate the block caches.
  // On success, Status::OK will be returned; also @block will be populated
  // with the uncompressed block and its cache handle.
  //
  // Allocated memory managed by block_contents will be transferred to
  // PutDataBlockToCache(). After the call, the object will be invalid.
  // @param uncompression_dict Data for presetting the compression library's
  //    dictionary.
  template <typename TBlocklike>
  WithBlocklikeCheck<Status, TBlocklike> PutDataBlockToCache(
      const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
      CachableEntry<TBlocklike>* cached_block,
      BlockContents&& uncompressed_block_contents,
      BlockContents&& compressed_block_contents,
      CompressionType block_comp_type, UnownedPtr<Decompressor> decomp,
      MemoryAllocator* memory_allocator, GetContext* get_context) const;
  // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
  // after a call to Seek(key), until handle_result returns false.
  // May not make such a call if the filter policy says that the key is not
  // present.
  friend class TableCache;
  friend class BlockBasedTableBuilder;

  // Create an index reader based on the index type stored in the table.
  // Optionally, the user can pass in a preloaded meta_index_iter for indexes
  // that need to access extra meta blocks for construction. This parameter
  // helps avoid re-reading the meta index block if the caller already created
  // one.
  Status CreateIndexReader(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* preloaded_meta_index_iter,
                           bool use_cache, bool prefetch, bool pin,
                           BlockCacheLookupContext* lookup_context,
                           std::unique_ptr<IndexReader>* index_reader);
  bool FullFilterKeyMayMatch(FilterBlockReader* filter, const Slice& user_key,
                             const SliceTransform* prefix_extractor,
                             GetContext* get_context,
                             BlockCacheLookupContext* lookup_context,
                             const ReadOptions& read_options) const;

  void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range,
                              const SliceTransform* prefix_extractor,
                              BlockCacheLookupContext* lookup_context,
                              const ReadOptions& read_options) const;

  // If force_direct_prefetch is true, always prefetch into RocksDB's buffer,
  // rather than calling RandomAccessFile::Prefetch().
  static Status PrefetchTail(
      const ReadOptions& ro, const ImmutableOptions& ioptions,
      RandomAccessFileReader* file, uint64_t file_size,
      bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats,
      const bool prefetch_all, const bool preload_all,
      std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer, Statistics* stats,
      uint64_t tail_size, Logger* const logger);
  Status ReadMetaIndexBlock(const ReadOptions& ro,
                            FilePrefetchBuffer* prefetch_buffer,
                            std::unique_ptr<Block>* metaindex_block,
                            std::unique_ptr<InternalIterator>* iter);
  Status ReadPropertiesBlock(const ReadOptions& ro,
                             FilePrefetchBuffer* prefetch_buffer,
                             InternalIterator* meta_iter,
                             const SequenceNumber largest_seqno);
  Status ReadRangeDelBlock(const ReadOptions& ro,
                           FilePrefetchBuffer* prefetch_buffer,
                           InternalIterator* meta_iter,
                           const InternalKeyComparator& internal_comparator,
                           BlockCacheLookupContext* lookup_context);

  // If index and filter blocks do not need to be pinned, `prefetch_all`
  // determines whether they will be read and added to the cache.
  Status PrefetchIndexAndFilterBlocks(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      InternalIterator* meta_iter, BlockBasedTable* new_table,
      bool prefetch_all, const BlockBasedTableOptions& table_options,
      const int level, size_t file_size, size_t max_file_size_for_l0_meta_pin,
      BlockCacheLookupContext* lookup_context);

  static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);

  Status VerifyChecksumInMetaBlocks(const ReadOptions& read_options,
                                    InternalIteratorBase<Slice>* index_iter);
  Status VerifyChecksumInBlocks(const ReadOptions& read_options,
                                InternalIteratorBase<IndexValue>* index_iter);
  // Create the filter from the filter block.
  std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
      const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer,
      bool use_cache, bool prefetch, bool pin,
      BlockCacheLookupContext* lookup_context);

  // Size of all data blocks, possibly approximate.
  uint64_t GetApproximateDataSize();

  // Given an index iterator, return its offset in the data block section of
  // the file.
  uint64_t ApproximateDataOffsetOf(
      const InternalIteratorBase<IndexValue>& index_iter,
      uint64_t data_size) const;

  // Helper functions for DumpTable()
  Status DumpIndexBlock(std::ostream& out_stream);
  Status DumpDataBlocks(std::ostream& out_stream);
  void DumpKeyValue(const Slice& key, const Slice& value,
                    std::ostream& out_stream);

  // Returns false if the prefix_extractor exists and is compatible with the
  // one used in building the table file, otherwise true.
  bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const;

  bool TimestampMayMatch(const ReadOptions& read_options) const;

  bool BlockTypeMaybeCompressed(BlockType type) const {
    return type != BlockType::kFilter &&
           type != BlockType::kCompressionDictionary &&
           type != BlockType::kUserDefinedIndex;
  }

  // A cumulative data block file read in MultiGet lower than this size will
  // use a stack buffer.
  static constexpr size_t kMultiGetReadStackBufSize = 8192;

  friend class PartitionedFilterBlockReader;
  friend class PartitionedFilterBlockTest;
  friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
};
// Maintains the state of a two-level iteration on a partitioned index
// structure.
class BlockBasedTable::PartitionedIndexIteratorState
    : public TwoLevelIteratorState {
 public:
  PartitionedIndexIteratorState(
      const BlockBasedTable* table,
      UnorderedMap<uint64_t, CachableEntry<Block>>* block_map);
  InternalIteratorBase<IndexValue>* NewSecondaryIterator(
      const BlockHandle& index_value) override;

 private:
  // Doesn't own table_
  const BlockBasedTable* table_;
  UnorderedMap<uint64_t, CachableEntry<Block>>* block_map_;
};
// Stores all the properties associated with a BlockBasedTable.
// These are immutable.
struct BlockBasedTable::Rep {
  Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options,
      const BlockBasedTableOptions& _table_opt,
      const InternalKeyComparator& _internal_comparator, bool skip_filters,
      uint64_t _file_size, int _level, const bool _immortal_table,
      const bool _user_defined_timestamps_persisted = true)
      : ioptions(_ioptions),
        env_options(_env_options),
        table_options(_table_opt),
        filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
        internal_comparator(_internal_comparator),
        filter_type(FilterType::kNoFilter),
        index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
        whole_key_filtering(_table_opt.whole_key_filtering),
        prefix_filtering(true),
        global_seqno(kDisableGlobalSequenceNumber),
        file_size(_file_size),
        level(_level),
        immortal_table(_immortal_table),
        user_defined_timestamps_persisted(_user_defined_timestamps_persisted),
        fs_prefetch_support(CheckFSFeatureSupport(
            _ioptions.fs.get(), FSSupportedOps::kFSPrefetch)) {}

  ~Rep() { status.PermitUncheckedError(); }
  const ImmutableOptions& ioptions;
  const EnvOptions& env_options;
  const BlockBasedTableOptions table_options;
  const FilterPolicy* const filter_policy;
  const InternalKeyComparator& internal_comparator;
  Status status;
  std::unique_ptr<RandomAccessFileReader> file;
  OffsetableCacheKey base_cache_key;
  PersistentCacheOptions persistent_cache_options;

  // Footer contains the fixed table information
  Footer footer;

  std::unique_ptr<IndexReader> index_reader;
  std::unique_ptr<FilterBlockReader> filter;
  std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;

  enum class FilterType {
    kNoFilter,
    kFullFilter,
    kPartitionedFilter,
  };
  FilterType filter_type;
  BlockHandle filter_handle;
  BlockHandle compression_dict_handle;

  std::shared_ptr<const TableProperties> table_properties;
  SeqnoToTimeMapping seqno_to_time_mapping;
  BlockHandle index_handle;
  BlockBasedTableOptions::IndexType index_type;
  bool whole_key_filtering;
  bool prefix_filtering;
  std::shared_ptr<const SliceTransform> table_prefix_extractor;

  std::shared_ptr<FragmentedRangeTombstoneList> fragmented_range_dels;

  // Context for block cache CreateCallback
  BlockCreateContext create_context;

  // If global_seqno is used, all keys in this file will have the same
  // seqno, with value `global_seqno`.
  //
  // A value of kDisableGlobalSequenceNumber means that this feature is
  // disabled and every key has its own seqno.
  SequenceNumber global_seqno;

  // Size of the table file on disk
  uint64_t file_size;

  // The level at which the table was opened; could potentially change when a
  // trivial move is involved.
  int level;

  // The timestamp range of the table.
  // Points into memory owned by TableProperties. This would need to change if
  // TableProperties become subject to cache eviction.
  Slice min_timestamp;
  Slice max_timestamp;

  // If blocks might be compressed, refers to a decompressor that can
  // decompress them (nullptr -> no blocks compressed). However, if (data)
  // blocks are dictionary compressed, a dictionary-aware decompressor is
  // needed, which might live in the block cache.
  std::shared_ptr<Decompressor> decompressor;

  // These describe how the index is encoded.
  bool index_has_first_key = false;
  bool index_key_includes_seq = true;
  bool index_value_is_full = true;

  // Whether block checksums in metadata blocks were verified on open.
  // This is only to mostly maintain current dubious behavior of VerifyChecksum
  // with respect to index blocks, but only when the checksum was previously
  // verified.
  bool verify_checksum_set_on_open = false;

  const bool immortal_table;

  // Whether the user key contains user-defined timestamps. If this is false
  // and the running user comparator has a non-zero timestamp size, a min
  // timestamp of this size will be padded to each user key while parsing
  // blocks wherever it applies. This includes the keys in data blocks, index
  // blocks for data blocks, the top-level index for index partitions (if the
  // index type is `kTwoLevelIndexSearch`), the top-level index for filter
  // partitions (if using partitioned filters), the `first_internal_key` in
  // `IndexValue`, and the `end_key` for range deletion entries.
  const bool user_defined_timestamps_persisted;

  const bool fs_prefetch_support;

  // Set to >0 when the file is known to be obsolete and should have its block
  // cache entries evicted on close. NOTE: when the file becomes obsolete,
  // there could be multiple table cache references that all mark this file as
  // obsolete. An atomic resolves the race quite reasonably. Even in the rare
  // case of such a race, they will most likely be storing the same value.
  RelaxedAtomic<uint32_t> uncache_aggressiveness{0};

  std::unique_ptr<CacheReservationManager::CacheReservationHandle>
      table_reader_cache_res_handle = nullptr;

  CachableEntry<Block_kUserDefinedIndex> udi_block;

  SequenceNumber get_global_seqno(BlockType block_type) const {
    return (block_type == BlockType::kFilterPartitionIndex ||
            block_type == BlockType::kCompressionDictionary)
               ? kDisableGlobalSequenceNumber
               : global_seqno;
  }
  uint64_t cf_id_for_tracing() const {
    return table_properties
               ? table_properties->column_family_id
               : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
                     kUnknownColumnFamily;
  }

  Slice cf_name_for_tracing() const {
    return table_properties ? table_properties->column_family_name
                            : BlockCacheTraceHelper::kUnknownColumnFamilyName;
  }

  uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }

  uint64_t sst_number_for_tracing() const {
    return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
  }

  void CreateFilePrefetchBuffer(
      const ReadaheadParams& readahead_params,
      std::unique_ptr<FilePrefetchBuffer>* fpb,
      const std::function<void(bool, uint64_t&, uint64_t&)>& readaheadsize_cb,
      FilePrefetchBufferUsage usage) const {
    fpb->reset(new FilePrefetchBuffer(
        readahead_params, !ioptions.allow_mmap_reads /* enable */,
        false /* track_min_offset */, ioptions.fs.get(), ioptions.clock,
        ioptions.stats, readaheadsize_cb, usage));
  }

  void CreateFilePrefetchBufferIfNotExists(
      const ReadaheadParams& readahead_params,
      std::unique_ptr<FilePrefetchBuffer>* fpb,
      const std::function<void(bool, uint64_t&, uint64_t&)>& readaheadsize_cb,
      FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) const {
    if (!(*fpb)) {
      CreateFilePrefetchBuffer(readahead_params, fpb, readaheadsize_cb, usage);
    }
  }

  std::size_t ApproximateMemoryUsage() const {
    std::size_t usage = 0;
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
    usage += malloc_usable_size(const_cast<BlockBasedTable::Rep*>(this));
#else
    usage += sizeof(*this);
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    return usage;
  }
};
// This is an adapter class for `WritableFile` to be used for `std::ostream`.
// The adapter wraps a `WritableFile`, which can be passed to a `std::ostream`
// constructor for storing streaming data.
// Note:
//  * This adapter doesn't provide any buffering; each write is forwarded to
//    `WritableFile->Append()` directly.
//  * For a failed write, the user needs to check the status by
//    `ostream.good()`.
class WritableFileStringStreamAdapter : public std::stringbuf {
 public:
  explicit WritableFileStringStreamAdapter(WritableFile* writable_file)
      : file_(writable_file) {}

  // Override overflow() to handle `sputc()`. There are cases that will not go
  // through `xsputn()`, e.g. when `std::endl` or an unsigned long long is
  // written by `os.put()` directly, which calls `sputc()`. By internal
  // implementation:
  //    int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) {  // put a character
  //      return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch)
  //                            : overflow(_Traits::to_int_type(_Ch));
  //    }
  // As we explicitly disabled buffering (_Pnavail() is always 0), every write
  // not captured by xsputn() becomes an overflow here.
  int overflow(int ch = EOF) override {
    if (ch != EOF) {
      Status s = file_->Append(Slice(reinterpret_cast<char*>(&ch), 1));
      if (s.ok()) {
        return ch;
      }
    }
    return EOF;
  }

  std::streamsize xsputn(char const* p, std::streamsize n) override {
    Status s = file_->Append(Slice(p, n));
    if (!s.ok()) {
      return 0;
    }
    return n;
  }

 private:
  WritableFile* file_;
};
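
// A minimal usage sketch for the adapter above (illustrative only; `file` is
// assumed to be a valid, open WritableFile*):
//   WritableFileStringStreamAdapter buf(file);
//   std::ostream out(&buf);
//   out << "some dump text" << std::endl;
//   if (!out.good()) {
//     // A write failed; the underlying Append() returned a non-OK status.
//   }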
}  // namespace ROCKSDB_NAMESPACE