plain_table_reader.h 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  3. // Use of this source code is governed by a BSD-style license that can be
  4. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  5. #pragma once
  6. #include <stdint.h>
  7. #include <memory>
  8. #include <string>
  9. #include <unordered_map>
  10. #include <vector>
  11. #include "file/random_access_file_reader.h"
  12. #include "memory/arena.h"
  13. #include "rocksdb/env.h"
  14. #include "rocksdb/iterator.h"
  15. #include "rocksdb/slice_transform.h"
  16. #include "rocksdb/table.h"
  17. #include "rocksdb/table_properties.h"
  18. #include "table/plain/plain_table_bloom.h"
  19. #include "table/plain/plain_table_factory.h"
  20. #include "table/plain/plain_table_index.h"
  21. #include "table/table_reader.h"
  22. namespace ROCKSDB_NAMESPACE {
  // Forward declarations of types referenced in this header.
  class Block;
  class BlockHandle;
  class Footer;
  class GetContext;
  class InternalKeyComparator;
  class PlainTableKeyDecoder;
  class RandomAccessFile;
  class TableCache;
  class TableReader;

  struct BlockContents;
  struct Options;
  struct ReadOptions;

  // Defined elsewhere. PlainTableReader::IsFixedLength() compares a table's
  // user key length against this value to tell fixed-length keys from
  // variable-length ones.
  extern const uint32_t kPlainTableVariableLength;
  36. struct PlainTableReaderFileInfo {
  37. bool is_mmap_mode;
  38. Slice file_data;
  39. uint32_t data_end_offset;
  40. std::unique_ptr<RandomAccessFileReader> file;
  41. PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
  42. const EnvOptions& storage_options,
  43. uint32_t _data_size_offset)
  44. : is_mmap_mode(storage_options.use_mmap_reads),
  45. data_end_offset(_data_size_offset),
  46. file(std::move(_file)) {}
  47. };
  48. // The reader class of PlainTable. For description of PlainTable format
  49. // See comments of class PlainTableFactory, where instances of
  50. // PlainTableReader are created.
  51. class PlainTableReader : public TableReader {
  52. public:
  53. // Based on following output file format shown in plain_table_factory.h
  54. // When opening the output file, PlainTableReader creates a hash table
  55. // from key prefixes to offset of the output file. PlainTable will decide
  56. // whether it points to the data offset of the first key with the key prefix
  57. // or the offset of it. If there are too many keys share this prefix, it will
  58. // create a binary search-able index from the suffix to offset on disk.
  59. static Status Open(const ImmutableOptions& ioptions,
  60. const EnvOptions& env_options,
  61. const InternalKeyComparator& internal_comparator,
  62. std::unique_ptr<RandomAccessFileReader>&& file,
  63. uint64_t file_size, std::unique_ptr<TableReader>* table,
  64. const int bloom_bits_per_key, double hash_table_ratio,
  65. size_t index_sparseness, size_t huge_page_tlb_size,
  66. bool full_scan_mode, const bool immortal_table = false,
  67. const SliceTransform* prefix_extractor = nullptr);
  68. // Returns new iterator over table contents
  69. // compaction_readahead_size: its value will only be used if for_compaction =
  70. // true
  71. InternalIterator* NewIterator(const ReadOptions&,
  72. const SliceTransform* prefix_extractor,
  73. Arena* arena, bool skip_filters,
  74. TableReaderCaller caller,
  75. size_t compaction_readahead_size = 0,
  76. bool allow_unprepared_value = false) override;
  77. void Prepare(const Slice& target) override;
  78. Status Get(const ReadOptions& readOptions, const Slice& key,
  79. GetContext* get_context, const SliceTransform* prefix_extractor,
  80. bool skip_filters = false) override;
  81. uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
  82. const Slice& key,
  83. TableReaderCaller caller) override;
  84. uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start,
  85. const Slice& end, TableReaderCaller caller) override;
  86. uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
  87. void SetupForCompaction() override;
  88. std::shared_ptr<const TableProperties> GetTableProperties() const override {
  89. return table_properties_;
  90. }
  91. size_t ApproximateMemoryUsage() const override {
  92. return arena_.MemoryAllocatedBytes();
  93. }
  94. PlainTableReader(const ImmutableOptions& ioptions,
  95. std::unique_ptr<RandomAccessFileReader>&& file,
  96. const EnvOptions& env_options,
  97. const InternalKeyComparator& internal_comparator,
  98. EncodingType encoding_type, uint64_t file_size,
  99. const TableProperties* table_properties,
  100. const SliceTransform* prefix_extractor);
  101. virtual ~PlainTableReader();
  102. protected:
  103. // Check bloom filter to see whether it might contain this prefix.
  104. // The hash of the prefix is given, since it can be reused for index lookup
  105. // too.
  106. virtual bool MatchBloom(uint32_t hash) const;
  107. // PopulateIndex() builds index of keys. It must be called before any query
  108. // to the table.
  109. //
  110. // props: the table properties object that need to be stored. Ownership of
  111. // the object will be passed.
  112. //
  113. Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
  114. double hash_table_ratio, size_t index_sparseness,
  115. size_t huge_page_tlb_size);
  116. Status MmapDataIfNeeded();
  117. private:
  118. const InternalKeyComparator internal_comparator_;
  119. EncodingType encoding_type_;
  120. // represents plain table's current status.
  121. Status status_;
  122. PlainTableIndex index_;
  123. bool full_scan_mode_;
  124. // data_start_offset_ and data_end_offset_ defines the range of the
  125. // sst file that stores data.
  126. const uint32_t data_start_offset_ = 0;
  127. const uint32_t user_key_len_;
  128. const SliceTransform* prefix_extractor_;
  129. static const size_t kNumInternalBytes = 8;
  130. // Bloom filter is used to rule out non-existent key
  131. bool enable_bloom_;
  132. PlainTableBloomV1 bloom_;
  133. PlainTableReaderFileInfo file_info_;
  134. Arena arena_;
  135. CacheAllocationPtr index_block_alloc_;
  136. CacheAllocationPtr bloom_block_alloc_;
  137. const ImmutableOptions& ioptions_;
  138. std::unique_ptr<Cleanable> dummy_cleanable_;
  139. uint64_t file_size_;
  140. protected: // for testing
  141. std::shared_ptr<const TableProperties> table_properties_;
  142. private:
  143. bool IsFixedLength() const {
  144. return user_key_len_ != kPlainTableVariableLength;
  145. }
  146. size_t GetFixedInternalKeyLength() const {
  147. return user_key_len_ + kNumInternalBytes;
  148. }
  149. Slice GetPrefix(const Slice& target) const {
  150. assert(target.size() >= 8); // target is internal key
  151. return GetPrefixFromUserKey(ExtractUserKey(target));
  152. }
  153. Slice GetPrefix(const ParsedInternalKey& target) const {
  154. return GetPrefixFromUserKey(target.user_key);
  155. }
  156. Slice GetPrefixFromUserKey(const Slice& user_key) const {
  157. if (!IsTotalOrderMode()) {
  158. return prefix_extractor_->Transform(user_key);
  159. } else {
  160. // Use empty slice as prefix if prefix_extractor is not set.
  161. // In that case,
  162. // it falls back to pure binary search and
  163. // total iterator seek is supported.
  164. return Slice();
  165. }
  166. }
  167. friend class TableCache;
  168. friend class PlainTableIterator;
  169. // Internal helper function to generate an IndexRecordList object from all
  170. // the rows, which contains index records as a list.
  171. // If bloom_ is not null, all the keys' full-key hash will be added to the
  172. // bloom filter.
  173. Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
  174. std::vector<uint32_t>* prefix_hashes);
  175. // Internal helper function to allocate memory for bloom filter
  176. void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
  177. size_t huge_page_tlb_size);
  178. void FillBloom(const std::vector<uint32_t>& prefix_hashes);
  179. // Read the key and value at `offset` to parameters for keys, the and
  180. // `seekable`.
  181. // On success, `offset` will be updated as the offset for the next key.
  182. // `parsed_key` will be key in parsed format.
  183. // if `internal_key` is not empty, it will be filled with key with slice
  184. // format.
  185. // if `seekable` is not null, it will return whether we can directly read
  186. // data using this offset.
  187. Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
  188. ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
  189. bool* seekable = nullptr) const;
  190. // Get file offset for key target.
  191. // return value prefix_matched is set to true if the offset is confirmed
  192. // for a key with the same prefix as target.
  193. Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
  194. const Slice& prefix, uint32_t prefix_hash,
  195. bool& prefix_matched, uint32_t* offset) const;
  196. bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
  197. // No copying allowed
  198. explicit PlainTableReader(const TableReader&) = delete;
  199. void operator=(const TableReader&) = delete;
  200. };
  201. } // namespace ROCKSDB_NAMESPACE