- // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
- // This source code is licensed under both the GPLv2 (found in the
- // COPYING file in the root directory) and Apache 2.0 License
- // (found in the LICENSE.Apache file in the root directory).
- //
- // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file. See the AUTHORS file for names of contributors.
- #pragma once
- #include <stdint.h>
- #include <memory>
- #include <set>
- #include <string>
- #include <utility>
- #include <vector>
- #include "db/range_tombstone_fragmenter.h"
- #include "file/filename.h"
- #include "file/random_access_file_reader.h"
- #include "options/cf_options.h"
- #include "rocksdb/options.h"
- #include "rocksdb/persistent_cache.h"
- #include "rocksdb/statistics.h"
- #include "rocksdb/status.h"
- #include "rocksdb/table.h"
- #include "table/block_based/block.h"
- #include "table/block_based/block_based_table_factory.h"
- #include "table/block_based/block_type.h"
- #include "table/block_based/cachable_entry.h"
- #include "table/block_based/filter_block.h"
- #include "table/block_based/uncompression_dict_reader.h"
- #include "table/format.h"
- #include "table/get_context.h"
- #include "table/multiget_context.h"
- #include "table/persistent_cache_helper.h"
- #include "table/table_properties_internal.h"
- #include "table/table_reader.h"
- #include "table/two_level_iterator.h"
- #include "trace_replay/block_cache_tracer.h"
- #include "util/coding.h"
- #include "util/user_comparator_wrapper.h"
- namespace ROCKSDB_NAMESPACE {
- class Cache;
- class FilterBlockReader;
- class BlockBasedFilterBlockReader;
- class FullFilterBlockReader;
- class Footer;
- class InternalKeyComparator;
- class Iterator;
- class FSRandomAccessFile;
- class TableCache;
- class TableReader;
- class WritableFile;
- struct BlockBasedTableOptions;
- struct EnvOptions;
- struct ReadOptions;
- class GetContext;
- typedef std::vector<std::pair<std::string, std::string>> KVPairBlock;
- // Reader class for BlockBasedTable format.
- // For the format of BlockBasedTable refer to
- // https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format.
- // This is the default table type. Data is chunked into fixed-size blocks and
- // each block in turn stores entries. When storing data, we can compress and/or
- // encode data efficiently within a block, which often results in a much smaller
- // data size compared with the raw data size. To retrieve a record, we first
- // locate the block where the target record may reside, then read the block into
- // memory, and finally search for that record within the block. To avoid
- // frequent reads of the same block, the block cache keeps loaded blocks in
- // memory.
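- //
- // A simplified sketch of that lookup flow, roughly as it appears inside the
- // reader's member functions (hypothetical local names; the real code also
- // consults the filter block and the block cache):
- //
- //   std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter(
- //       NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
- //                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
- //                        /*lookup_context=*/nullptr));
- //   index_iter->Seek(target_internal_key);  // 1) locate the candidate block
- //   if (index_iter->Valid()) {
- //     DataBlockIter block_iter;
- //     NewDataBlockIterator<DataBlockIter>(     // 2) read the block
- //         read_options, index_iter->value().handle, &block_iter,
- //         BlockType::kData, /*get_context=*/nullptr,
- //         /*lookup_context=*/nullptr, Status(), /*prefetch_buffer=*/nullptr);
- //     block_iter.Seek(target_internal_key);    // 3) search within the block
- //   }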
- class BlockBasedTable : public TableReader {
- public:
- static const std::string kFilterBlockPrefix;
- static const std::string kFullFilterBlockPrefix;
- static const std::string kPartitionedFilterBlockPrefix;
- // The longest prefix of the cache key used to identify blocks.
- // For Posix files the unique ID is three varints.
- static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
- // All the below fields control iterator readahead
- static const size_t kInitAutoReadaheadSize = 8 * 1024;
- // Based on experiments, a 256 KB readahead size provides the best performance
- // for auto readahead. Experiment data is in PR #3282.
- static const size_t kMaxAutoReadaheadSize;
- static const int kMinNumFileReadsToStartAutoReadahead = 2;
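- //
- // A minimal sketch of the auto-readahead heuristic these constants drive
- // (assumed from the constant names; the real logic lives in the iterator's
- // block-loading path):
- //
- //   num_file_reads_++;
- //   if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
- //     // Double the readahead window on each read, capped at the maximum.
- //     readahead_size_ =
- //         std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
- //   }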
- // Attempt to open the table that is stored in bytes [0..file_size)
- // of "file", and read the metadata entries necessary to allow
- // retrieving data from the table.
- //
- // If successful, returns ok and sets "*table_reader" to the newly opened
- // table. The client should delete "*table_reader" when no longer needed.
- // If there was an error while initializing the table, sets "*table_reader"
- // to nullptr and returns a non-ok status.
- //
- // @param file must remain live while this Table is in use.
- // @param prefetch_index_and_filter_in_cache can be used to disable
- // prefetching of
- // index and filter blocks into block cache at startup
- // @param skip_filters Disables loading/accessing the filter block. Overrides
- // prefetch_index_and_filter_in_cache, so filter will be skipped if both
- // are set.
- static Status Open(const ImmutableCFOptions& ioptions,
- const EnvOptions& env_options,
- const BlockBasedTableOptions& table_options,
- const InternalKeyComparator& internal_key_comparator,
- std::unique_ptr<RandomAccessFileReader>&& file,
- uint64_t file_size,
- std::unique_ptr<TableReader>* table_reader,
- const SliceTransform* prefix_extractor = nullptr,
- bool prefetch_index_and_filter_in_cache = true,
- bool skip_filters = false, int level = -1,
- const bool immortal_table = false,
- const SequenceNumber largest_seqno = 0,
- TailPrefetchStats* tail_prefetch_stats = nullptr,
- BlockCacheTracer* const block_cache_tracer = nullptr);
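- //
- // Example (a sketch; assumes the caller has already constructed the option
- // objects, `file`, and `file_size`):
- //
- //   std::unique_ptr<TableReader> table_reader;
- //   Status s = BlockBasedTable::Open(ioptions, env_options, table_options,
- //                                    internal_key_comparator, std::move(file),
- //                                    file_size, &table_reader);
- //   if (s.ok()) {
- //     // table_reader owns the opened table and can serve reads.
- //   }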
- bool PrefixMayMatch(const Slice& internal_key,
- const ReadOptions& read_options,
- const SliceTransform* options_prefix_extractor,
- const bool need_upper_bound_check,
- BlockCacheLookupContext* lookup_context) const;
- // Returns a new iterator over the table contents.
- // The result of NewIterator() is initially invalid (caller must
- // call one of the Seek methods on the iterator before using it).
- // @param skip_filters Disables loading/accessing the filter block
- // compaction_readahead_size: its value will only be used if caller =
- // kCompaction.
- InternalIterator* NewIterator(const ReadOptions&,
- const SliceTransform* prefix_extractor,
- Arena* arena, bool skip_filters,
- TableReaderCaller caller,
- size_t compaction_readahead_size = 0) override;
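- //
- // Example (a sketch; `table` is an open BlockBasedTable, and the keys the
- // iterator returns are internal keys):
- //
- //   std::unique_ptr<InternalIterator> it(table->NewIterator(
- //       read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
- //       /*skip_filters=*/false, TableReaderCaller::kUserIterator));
- //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
- //     // Process it->key() / it->value().
- //   }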
- FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
- const ReadOptions& read_options) override;
- // @param skip_filters Disables loading/accessing the filter block
- Status Get(const ReadOptions& readOptions, const Slice& key,
- GetContext* get_context, const SliceTransform* prefix_extractor,
- bool skip_filters = false) override;
- void MultiGet(const ReadOptions& readOptions,
- const MultiGetContext::Range* mget_range,
- const SliceTransform* prefix_extractor,
- bool skip_filters = false) override;
- // Pre-fetch the disk blocks that correspond to the key range specified by
- // (begin, end). The call will return an error status in the event of an
- // IO or iteration error.
- Status Prefetch(const Slice* begin, const Slice* end) override;
- // Given a key, return an approximate byte offset in the file where
- // the data for that key begins (or would begin if the key were
- // present in the file). The returned value is in terms of file
- // bytes, and so includes effects like compression of the underlying data.
- // E.g., the approximate offset of the last key in the table will
- // be close to the file length.
- uint64_t ApproximateOffsetOf(const Slice& key,
- TableReaderCaller caller) override;
- // Given start and end keys, return the approximate data size in the file
- // between the keys. The returned value is in terms of file bytes, and so
- // includes effects like compression of the underlying data.
- // The start key must not be greater than the end key.
- uint64_t ApproximateSize(const Slice& start, const Slice& end,
- TableReaderCaller caller) override;
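- //
- // Example (a sketch; `start_ikey` and `end_ikey` are internal keys with
- // start_ikey <= end_ikey):
- //
- //   uint64_t approx_bytes = table->ApproximateSize(
- //       start_ikey, end_ikey, TableReaderCaller::kUserApproximateSize);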
- bool TEST_BlockInCache(const BlockHandle& handle) const;
- // Returns true if the block for the specified key is in cache.
- // REQUIRES: key is in this table && block cache enabled
- bool TEST_KeyInCache(const ReadOptions& options, const Slice& key);
- // Set up the table for Compaction. Might change some parameters with
- // posix_fadvise
- void SetupForCompaction() override;
- std::shared_ptr<const TableProperties> GetTableProperties() const override;
- size_t ApproximateMemoryUsage() const override;
- // convert SST file to a human readable form
- Status DumpTable(WritableFile* out_file) override;
- Status VerifyChecksum(const ReadOptions& readOptions,
- TableReaderCaller caller) override;
- ~BlockBasedTable();
- bool TEST_FilterBlockInCache() const;
- bool TEST_IndexBlockInCache() const;
- // IndexReader is the interface that provides the functionality for index
- // access.
- class IndexReader {
- public:
- virtual ~IndexReader() = default;
- // Create an iterator for index access. If iter is null, then a new object
- // is created on the heap, and the caller will have the ownership.
- // If a non-null iter is passed in, it will be used, and the returned value
- // is either the same as iter or a new on-heap object that
- // wraps the passed iter. In the latter case the return value points
- // to a different object than iter, and the caller has the ownership of the
- // returned object. See the example sketch after this class.
- virtual InternalIteratorBase<IndexValue>* NewIterator(
- const ReadOptions& read_options, bool disable_prefix_seek,
- IndexBlockIter* iter, GetContext* get_context,
- BlockCacheLookupContext* lookup_context) = 0;
- // Report an approximation of how much memory has been used other than
- // memory that was allocated in block cache.
- virtual size_t ApproximateMemoryUsage() const = 0;
- // Cache the dependencies of the index reader (e.g. the partitions
- // of a partitioned index).
- virtual void CacheDependencies(bool /* pin */) {}
- };
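- //
- // Example of the ownership rule above (a sketch; `index_reader` is any
- // concrete IndexReader implementation):
- //
- //   std::unique_ptr<InternalIteratorBase<IndexValue>> owned(
- //       index_reader->NewIterator(read_options,
- //                                 /*disable_prefix_seek=*/false,
- //                                 /*iter=*/nullptr, /*get_context=*/nullptr,
- //                                 /*lookup_context=*/nullptr));
- //   // Because `iter` was nullptr, the returned object was allocated on the
- //   // heap and the caller is responsible for deleting it.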
- class IndexReaderCommon;
- static Slice GetCacheKey(const char* cache_key_prefix,
- size_t cache_key_prefix_size,
- const BlockHandle& handle, char* cache_key);
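- //
- // Example (a sketch, roughly as used inside the reader; `rep` is this table's
- // Rep and `handle` is the block's BlockHandle):
- //
- //   char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
- //   Slice cache_key =
- //       GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, handle,
- //                   cache_key_buf);
- //   // cache_key is backed by cache_key_buf and identifies the block in the
- //   // block cache.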
- // Retrieve all key value pairs from data blocks in the table.
- // The keys retrieved are internal keys.
- Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks);
- struct Rep;
- Rep* get_rep() { return rep_; }
- const Rep* get_rep() const { return rep_; }
- // input_iter: if it is not null, update this one and return it as Iterator
- template <typename TBlockIter>
- TBlockIter* NewDataBlockIterator(
- const ReadOptions& ro, const BlockHandle& block_handle,
- TBlockIter* input_iter, BlockType block_type, GetContext* get_context,
- BlockCacheLookupContext* lookup_context, Status s,
- FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;
- // input_iter: if it is not null, update this one and return it as Iterator
- template <typename TBlockIter>
- TBlockIter* NewDataBlockIterator(const ReadOptions& ro,
- CachableEntry<Block>& block,
- TBlockIter* input_iter, Status s) const;
- class PartitionedIndexIteratorState;
- template <typename TBlocklike>
- friend class FilterBlockReaderCommon;
- friend class PartitionIndexReader;
- friend class UncompressionDictReader;
- protected:
- Rep* rep_;
- explicit BlockBasedTable(Rep* rep, BlockCacheTracer* const block_cache_tracer)
- : rep_(rep), block_cache_tracer_(block_cache_tracer) {}
- // No copying allowed
- explicit BlockBasedTable(const TableReader&) = delete;
- void operator=(const TableReader&) = delete;
- private:
- friend class MockedBlockBasedTable;
- static std::atomic<uint64_t> next_cache_key_id_;
- BlockCacheTracer* const block_cache_tracer_;
- void UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context,
- size_t usage) const;
- void UpdateCacheMissMetrics(BlockType block_type,
- GetContext* get_context) const;
- void UpdateCacheInsertionMetrics(BlockType block_type,
- GetContext* get_context, size_t usage) const;
- Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
- BlockType block_type,
- GetContext* get_context) const;
- // Either Block::NewDataIterator() or Block::NewIndexIterator().
- template <typename TBlockIter>
- static TBlockIter* InitBlockIterator(const Rep* rep, Block* block,
- TBlockIter* input_iter,
- bool block_contents_pinned);
- // If block cache enabled (compressed or uncompressed), looks for the block
- // identified by handle in (1) uncompressed cache, (2) compressed cache, and
- // then (3) file. If found, inserts into the cache(s) that were searched
- // unsuccessfully (e.g., if found in file, will add to both uncompressed and
- // compressed caches if they're enabled).
- //
- // @param block_entry value is set to the uncompressed block if found. If
- // in uncompressed block cache, also sets cache_handle to reference that
- // block.
- template <typename TBlocklike>
- Status MaybeReadBlockAndLoadToCache(
- FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
- const BlockHandle& handle, const UncompressionDict& uncompression_dict,
- CachableEntry<TBlocklike>* block_entry, BlockType block_type,
- GetContext* get_context, BlockCacheLookupContext* lookup_context,
- BlockContents* contents) const;
- // Similar to the above, with one crucial difference: it will retrieve the
- // block from the file even if there are no caches configured (assuming the
- // read options allow I/O).
- template <typename TBlocklike>
- Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer,
- const ReadOptions& ro, const BlockHandle& handle,
- const UncompressionDict& uncompression_dict,
- CachableEntry<TBlocklike>* block_entry,
- BlockType block_type, GetContext* get_context,
- BlockCacheLookupContext* lookup_context,
- bool for_compaction, bool use_cache) const;
- void RetrieveMultipleBlocks(
- const ReadOptions& options, const MultiGetRange* batch,
- const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
- autovector<Status, MultiGetContext::MAX_BATCH_SIZE>* statuses,
- autovector<CachableEntry<Block>, MultiGetContext::MAX_BATCH_SIZE>*
- results,
- char* scratch, const UncompressionDict& uncompression_dict) const;
- // Get the iterator from the index reader.
- //
- // If input_iter is not set, return a new Iterator.
- // If input_iter is set, try to update it and return it as Iterator.
- // However note that in some cases the returned iterator may be different
- // from input_iter. In such case the returned iterator should be freed.
- //
- // Note: ErrorIterator with Status::Incomplete shall be returned if all the
- // following conditions are met:
- // 1. We enabled table_options.cache_index_and_filter_blocks.
- // 2. index is not present in block cache.
- // 3. We disallowed any IO from being performed, that is,
- // read_options.read_tier == kBlockCacheTier (see the example sketch below).
- InternalIteratorBase<IndexValue>* NewIndexIterator(
- const ReadOptions& read_options, bool need_upper_bound_check,
- IndexBlockIter* input_iter, GetContext* get_context,
- BlockCacheLookupContext* lookup_context) const;
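- //
- // Example of the no-IO case described above, as seen through the public
- // iterator API (a sketch):
- //
- //   ReadOptions no_io_options;
- //   no_io_options.read_tier = kBlockCacheTier;  // disallow file reads
- //   std::unique_ptr<InternalIterator> iter(table->NewIterator(
- //       no_io_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
- //       /*skip_filters=*/false, TableReaderCaller::kUserIterator));
- //   iter->SeekToFirst();
- //   if (iter->status().IsIncomplete()) {
- //     // The index block was not cached and IO was disallowed.
- //   }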
- // Read a block from the block caches, if they are set: block_cache and
- // block_cache_compressed.
- // On success, Status::OK will be returned and @block will be populated with
- // a pointer to the block as well as its block handle.
- // @param uncompression_dict Data for presetting the compression library's
- // dictionary.
- template <typename TBlocklike>
- Status GetDataBlockFromCache(
- const Slice& block_cache_key, const Slice& compressed_block_cache_key,
- Cache* block_cache, Cache* block_cache_compressed,
- const ReadOptions& read_options, CachableEntry<TBlocklike>* block,
- const UncompressionDict& uncompression_dict, BlockType block_type,
- GetContext* get_context) const;
- // Put a raw block (maybe compressed) into the corresponding block caches.
- // This method will decompress raw_block if needed and then populate the
- // block caches.
- // On success, Status::OK will be returned; also @block will be populated with
- // the uncompressed block and its cache handle.
- //
- // Allocated memory managed by raw_block_contents will be transferred to
- // PutDataBlockToCache(). After the call, the object will be invalid.
- // @param uncompression_dict Data for presetting the compression library's
- // dictionary.
- template <typename TBlocklike>
- Status PutDataBlockToCache(
- const Slice& block_cache_key, const Slice& compressed_block_cache_key,
- Cache* block_cache, Cache* block_cache_compressed,
- CachableEntry<TBlocklike>* cached_block,
- BlockContents* raw_block_contents, CompressionType raw_block_comp_type,
- const UncompressionDict& uncompression_dict, SequenceNumber seq_no,
- MemoryAllocator* memory_allocator, BlockType block_type,
- GetContext* get_context) const;
- // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
- // after a call to Seek(key), until handle_result returns false.
- // May not make such a call if filter policy says that key is not present.
- friend class TableCache;
- friend class BlockBasedTableBuilder;
- // Create an index reader based on the index type stored in the table.
- // Optionally, the user can pass a preloaded meta_index_iter for index types
- // that need to access extra meta blocks during index construction. This
- // parameter helps avoid re-reading the meta index block if the caller has
- // already created one.
- Status CreateIndexReader(FilePrefetchBuffer* prefetch_buffer,
- InternalIterator* preloaded_meta_index_iter,
- bool use_cache, bool prefetch, bool pin,
- BlockCacheLookupContext* lookup_context,
- std::unique_ptr<IndexReader>* index_reader);
- bool FullFilterKeyMayMatch(const ReadOptions& read_options,
- FilterBlockReader* filter, const Slice& user_key,
- const bool no_io,
- const SliceTransform* prefix_extractor,
- GetContext* get_context,
- BlockCacheLookupContext* lookup_context) const;
- void FullFilterKeysMayMatch(const ReadOptions& read_options,
- FilterBlockReader* filter, MultiGetRange* range,
- const bool no_io,
- const SliceTransform* prefix_extractor,
- BlockCacheLookupContext* lookup_context) const;
- static Status PrefetchTail(
- RandomAccessFileReader* file, uint64_t file_size,
- TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all,
- const bool preload_all,
- std::unique_ptr<FilePrefetchBuffer>* prefetch_buffer);
- Status ReadMetaIndexBlock(FilePrefetchBuffer* prefetch_buffer,
- std::unique_ptr<Block>* metaindex_block,
- std::unique_ptr<InternalIterator>* iter);
- Status TryReadPropertiesWithGlobalSeqno(FilePrefetchBuffer* prefetch_buffer,
- const Slice& handle_value,
- TableProperties** table_properties);
- Status ReadPropertiesBlock(FilePrefetchBuffer* prefetch_buffer,
- InternalIterator* meta_iter,
- const SequenceNumber largest_seqno);
- Status ReadRangeDelBlock(FilePrefetchBuffer* prefetch_buffer,
- InternalIterator* meta_iter,
- const InternalKeyComparator& internal_comparator,
- BlockCacheLookupContext* lookup_context);
- Status PrefetchIndexAndFilterBlocks(
- FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter,
- BlockBasedTable* new_table, bool prefetch_all,
- const BlockBasedTableOptions& table_options, const int level,
- BlockCacheLookupContext* lookup_context);
- static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
- Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
- Status VerifyChecksumInBlocks(const ReadOptions& read_options,
- InternalIteratorBase<IndexValue>* index_iter);
- // Create the filter from the filter block.
- std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
- FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
- bool pin, BlockCacheLookupContext* lookup_context);
- static void SetupCacheKeyPrefix(Rep* rep);
- // Generate a cache key prefix from the file
- static void GenerateCachePrefix(Cache* cc, FSRandomAccessFile* file,
- char* buffer, size_t* size);
- static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
- size_t* size);
- // Given an index iterator, return its offset in the file.
- uint64_t ApproximateOffsetOf(
- const InternalIteratorBase<IndexValue>& index_iter) const;
- // Helper functions for DumpTable()
- Status DumpIndexBlock(WritableFile* out_file);
- Status DumpDataBlocks(WritableFile* out_file);
- void DumpKeyValue(const Slice& key, const Slice& value,
- WritableFile* out_file);
- // A cumulative data block read in MultiGet smaller than this size will use a
- // stack buffer
- static constexpr size_t kMultiGetReadStackBufSize = 8192;
- friend class PartitionedFilterBlockReader;
- friend class PartitionedFilterBlockTest;
- friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
- };
- // Maintains the state of a two-level iteration on a partitioned index structure.
- class BlockBasedTable::PartitionedIndexIteratorState
- : public TwoLevelIteratorState {
- public:
- PartitionedIndexIteratorState(
- const BlockBasedTable* table,
- std::unordered_map<uint64_t, CachableEntry<Block>>* block_map);
- InternalIteratorBase<IndexValue>* NewSecondaryIterator(
- const BlockHandle& index_value) override;
- private:
- // Don't own table_
- const BlockBasedTable* table_;
- std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
- };
- // Stores all the properties associated with a BlockBasedTable.
- // These are immutable.
- struct BlockBasedTable::Rep {
- Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
- const BlockBasedTableOptions& _table_opt,
- const InternalKeyComparator& _internal_comparator, bool skip_filters,
- int _level, const bool _immortal_table)
- : ioptions(_ioptions),
- env_options(_env_options),
- table_options(_table_opt),
- filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
- internal_comparator(_internal_comparator),
- filter_type(FilterType::kNoFilter),
- index_type(BlockBasedTableOptions::IndexType::kBinarySearch),
- hash_index_allow_collision(false),
- whole_key_filtering(_table_opt.whole_key_filtering),
- prefix_filtering(true),
- global_seqno(kDisableGlobalSequenceNumber),
- level(_level),
- immortal_table(_immortal_table) {}
- const ImmutableCFOptions& ioptions;
- const EnvOptions& env_options;
- const BlockBasedTableOptions table_options;
- const FilterPolicy* const filter_policy;
- const InternalKeyComparator& internal_comparator;
- Status status;
- std::unique_ptr<RandomAccessFileReader> file;
- char cache_key_prefix[kMaxCacheKeyPrefixSize];
- size_t cache_key_prefix_size = 0;
- char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
- size_t persistent_cache_key_prefix_size = 0;
- char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
- size_t compressed_cache_key_prefix_size = 0;
- PersistentCacheOptions persistent_cache_options;
- // Footer contains the fixed table information
- Footer footer;
- std::unique_ptr<IndexReader> index_reader;
- std::unique_ptr<FilterBlockReader> filter;
- std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
- enum class FilterType {
- kNoFilter,
- kFullFilter,
- kBlockFilter,
- kPartitionedFilter,
- };
- FilterType filter_type;
- BlockHandle filter_handle;
- BlockHandle compression_dict_handle;
- std::shared_ptr<const TableProperties> table_properties;
- BlockBasedTableOptions::IndexType index_type;
- bool hash_index_allow_collision;
- bool whole_key_filtering;
- bool prefix_filtering;
- // TODO(kailiu) It is very ugly to use internal keys in the table layer, since
- // the table module should not rely on the db module. However, to make things
- // easier and compatible with existing code, we introduce a wrapper that allows
- // a block to extract a prefix without knowing whether a key is internal.
- // null if no prefix_extractor is passed in when opening the table reader.
- std::unique_ptr<SliceTransform> internal_prefix_transform;
- std::shared_ptr<const SliceTransform> table_prefix_extractor;
- std::shared_ptr<const FragmentedRangeTombstoneList> fragmented_range_dels;
- // If global_seqno is used, all keys in this file will have the same
- // seqno, with value `global_seqno`.
- //
- // A value of kDisableGlobalSequenceNumber means that this feature is disabled
- // and every key has its own seqno.
- SequenceNumber global_seqno;
- // The level at which the table was opened; could potentially change when a
- // trivial move is involved
- int level;
- // If false, blocks in this file are definitely all uncompressed. Knowing this
- // before reading individual blocks enables certain optimizations.
- bool blocks_maybe_compressed = true;
- // If true, data blocks in this file are definitely ZSTD compressed. If false
- // they might not be. When false we skip creating a ZSTD digested
- // uncompression dictionary. Even if we get a false negative, things should
- // still work, just not as quickly.
- bool blocks_definitely_zstd_compressed = false;
- // These describe how index is encoded.
- bool index_has_first_key = false;
- bool index_key_includes_seq = true;
- bool index_value_is_full = true;
- const bool immortal_table;
- SequenceNumber get_global_seqno(BlockType block_type) const {
- return (block_type == BlockType::kFilter ||
- block_type == BlockType::kCompressionDictionary)
- ? kDisableGlobalSequenceNumber
- : global_seqno;
- }
- uint64_t cf_id_for_tracing() const {
- return table_properties
- ? table_properties->column_family_id
- : ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory::Context::
- kUnknownColumnFamily;
- }
- Slice cf_name_for_tracing() const {
- return table_properties ? table_properties->column_family_name
- : BlockCacheTraceHelper::kUnknownColumnFamilyName;
- }
- uint32_t level_for_tracing() const { return level >= 0 ? level : UINT32_MAX; }
- uint64_t sst_number_for_tracing() const {
- return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX;
- }
- void CreateFilePrefetchBuffer(
- size_t readahead_size, size_t max_readahead_size,
- std::unique_ptr<FilePrefetchBuffer>* fpb) const {
- fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size,
- max_readahead_size,
- !ioptions.allow_mmap_reads /* enable */));
- }
- };
- // Iterates over the contents of BlockBasedTable.
- template <class TBlockIter, typename TValue = Slice>
- class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
- // compaction_readahead_size: its value will only be used if for_compaction =
- // true
- public:
- BlockBasedTableIterator(const BlockBasedTable* table,
- const ReadOptions& read_options,
- const InternalKeyComparator& icomp,
- InternalIteratorBase<IndexValue>* index_iter,
- bool check_filter, bool need_upper_bound_check,
- const SliceTransform* prefix_extractor,
- BlockType block_type, TableReaderCaller caller,
- size_t compaction_readahead_size = 0)
- : table_(table),
- read_options_(read_options),
- icomp_(icomp),
- user_comparator_(icomp.user_comparator()),
- index_iter_(index_iter),
- pinned_iters_mgr_(nullptr),
- block_iter_points_to_real_block_(false),
- check_filter_(check_filter),
- need_upper_bound_check_(need_upper_bound_check),
- prefix_extractor_(prefix_extractor),
- block_type_(block_type),
- lookup_context_(caller),
- compaction_readahead_size_(compaction_readahead_size) {}
- ~BlockBasedTableIterator() { delete index_iter_; }
- void Seek(const Slice& target) override;
- void SeekForPrev(const Slice& target) override;
- void SeekToFirst() override;
- void SeekToLast() override;
- void Next() final override;
- bool NextAndGetResult(IterateResult* result) override;
- void Prev() override;
- bool Valid() const override {
- return !is_out_of_bound_ &&
- (is_at_first_key_from_index_ ||
- (block_iter_points_to_real_block_ && block_iter_.Valid()));
- }
- Slice key() const override {
- assert(Valid());
- if (is_at_first_key_from_index_) {
- return index_iter_->value().first_internal_key;
- } else {
- return block_iter_.key();
- }
- }
- Slice user_key() const override {
- assert(Valid());
- if (is_at_first_key_from_index_) {
- return ExtractUserKey(index_iter_->value().first_internal_key);
- } else {
- return block_iter_.user_key();
- }
- }
- TValue value() const override {
- assert(Valid());
- // Load current block if not loaded.
- if (is_at_first_key_from_index_ &&
- !const_cast<BlockBasedTableIterator*>(this)
- ->MaterializeCurrentBlock()) {
- // Oops, index is not consistent with block contents, but we have
- // no good way to report error at this point. Let's return empty value.
- return TValue();
- }
- return block_iter_.value();
- }
- Status status() const override {
- // The prefix index sets the status to NotFound when the prefix does not exist.
- if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) {
- return index_iter_->status();
- } else if (block_iter_points_to_real_block_) {
- return block_iter_.status();
- } else {
- return Status::OK();
- }
- }
- // Whether the iterator has been invalidated for being out of bound.
- bool IsOutOfBound() override { return is_out_of_bound_; }
- inline bool MayBeOutOfUpperBound() override {
- assert(Valid());
- return !data_block_within_upper_bound_;
- }
- void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
- pinned_iters_mgr_ = pinned_iters_mgr;
- }
- bool IsKeyPinned() const override {
- // Our key comes either from block_iter_'s current key
- // or index_iter_'s current *value*.
- return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
- ((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
- (block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
- }
- bool IsValuePinned() const override {
- // Load current block if not loaded.
- if (is_at_first_key_from_index_) {
- const_cast<BlockBasedTableIterator*>(this)->MaterializeCurrentBlock();
- }
- // BlockIter::IsValuePinned() is always true. No need to check
- return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
- block_iter_points_to_real_block_;
- }
- void ResetDataIter() {
- if (block_iter_points_to_real_block_) {
- if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
- block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
- }
- block_iter_.Invalidate(Status::OK());
- block_iter_points_to_real_block_ = false;
- }
- }
- void SavePrevIndexValue() {
- if (block_iter_points_to_real_block_) {
- // Remember the offset before a reseek. If the reseek ends up in the same
- // data block, we shouldn't re-fetch it.
- prev_block_offset_ = index_iter_->value().handle.offset();
- }
- }
- private:
- enum class IterDirection {
- kForward,
- kBackward,
- };
- const BlockBasedTable* table_;
- const ReadOptions read_options_;
- const InternalKeyComparator& icomp_;
- UserComparatorWrapper user_comparator_;
- InternalIteratorBase<IndexValue>* index_iter_;
- PinnedIteratorsManager* pinned_iters_mgr_;
- TBlockIter block_iter_;
- // True if block_iter_ is initialized and points to the same block
- // as index iterator.
- bool block_iter_points_to_real_block_;
- // See InternalIteratorBase::IsOutOfBound().
- bool is_out_of_bound_ = false;
- // Whether the current data block is fully within the iterate upper bound.
- bool data_block_within_upper_bound_ = false;
- // True if we're standing at the first key of a block, and we haven't loaded
- // that block yet. A call to value() will trigger loading the block.
- bool is_at_first_key_from_index_ = false;
- bool check_filter_;
- // TODO(Zhongyi): pick a better name
- bool need_upper_bound_check_;
- const SliceTransform* prefix_extractor_;
- BlockType block_type_;
- uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
- BlockCacheLookupContext lookup_context_;
- // Readahead size used in compaction; its value is used only if
- // lookup_context_.caller == kCompaction.
- size_t compaction_readahead_size_;
- size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
- size_t readahead_limit_ = 0;
- int64_t num_file_reads_ = 0;
- std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
- // If `target` is null, seek to first.
- void SeekImpl(const Slice* target);
- void InitDataBlock();
- bool MaterializeCurrentBlock();
- void FindKeyForward();
- void FindBlockForward();
- void FindKeyBackward();
- void CheckOutOfBound();
- // Check if data block is fully within iterate_upper_bound.
- //
- // Note that MyRocks may update the iterate bounds between seeks. To work
- // around this, we need to check and update data_block_within_upper_bound_
- // accordingly.
- void CheckDataBlockWithinUpperBound();
- bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) {
- if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
- // The upper bound check isn't sufficient in the backward direction to
- // guarantee the same result as total order, so disable the prefix
- // check.
- return true;
- }
- if (check_filter_ &&
- !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_,
- need_upper_bound_check_, &lookup_context_)) {
- // TODO: remember that the iterator was invalidated because of the prefix
- // check. This can prevent the upper-level file iterator from falsely
- // believing the position is the end of the SST file and moving to
- // the first key of the next file.
- ResetDataIter();
- return false;
- }
- return true;
- }
- };
- } // namespace ROCKSDB_NAMESPACE