plain_table_reader.h 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  3. // Use of this source code is governed by a BSD-style license that can be
  4. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  5. #pragma once
  6. #ifndef ROCKSDB_LITE
  7. #include <unordered_map>
  8. #include <memory>
  9. #include <vector>
  10. #include <string>
  11. #include <stdint.h>
  12. #include "db/dbformat.h"
  13. #include "file/random_access_file_reader.h"
  14. #include "memory/arena.h"
  15. #include "rocksdb/env.h"
  16. #include "rocksdb/iterator.h"
  17. #include "rocksdb/slice_transform.h"
  18. #include "rocksdb/table.h"
  19. #include "rocksdb/table_properties.h"
  20. #include "table/plain/plain_table_bloom.h"
  21. #include "table/plain/plain_table_factory.h"
  22. #include "table/plain/plain_table_index.h"
  23. #include "table/table_reader.h"
  24. namespace ROCKSDB_NAMESPACE {
  // Forward declarations of types referenced in this header, grouped by
  // class-key and listed alphabetically within each group.
  class Block;
  class BlockHandle;
  class Footer;
  class GetContext;
  class InternalKeyComparator;
  class PlainTableKeyDecoder;
  class RandomAccessFile;
  class TableCache;
  class TableReader;

  struct BlockContents;
  struct Options;
  struct ReadOptions;

  // Declared here and defined elsewhere. PlainTableReader::IsFixedLength()
  // compares the table's user key length against this value.
  extern const uint32_t kPlainTableVariableLength;
  38. struct PlainTableReaderFileInfo {
  39. bool is_mmap_mode;
  40. Slice file_data;
  41. uint32_t data_end_offset;
  42. std::unique_ptr<RandomAccessFileReader> file;
  43. PlainTableReaderFileInfo(std::unique_ptr<RandomAccessFileReader>&& _file,
  44. const EnvOptions& storage_options,
  45. uint32_t _data_size_offset)
  46. : is_mmap_mode(storage_options.use_mmap_reads),
  47. data_end_offset(_data_size_offset),
  48. file(std::move(_file)) {}
  49. };
  50. // The reader class of PlainTable. For description of PlainTable format
  51. // See comments of class PlainTableFactory, where instances of
  52. // PlainTableReader are created.
  53. class PlainTableReader: public TableReader {
  54. public:
  55. // Based on following output file format shown in plain_table_factory.h
  56. // When opening the output file, PlainTableReader creates a hash table
  57. // from key prefixes to offset of the output file. PlainTable will decide
  58. // whether it points to the data offset of the first key with the key prefix
  59. // or the offset of it. If there are too many keys share this prefix, it will
  60. // create a binary search-able index from the suffix to offset on disk.
  61. static Status Open(const ImmutableCFOptions& ioptions,
  62. const EnvOptions& env_options,
  63. const InternalKeyComparator& internal_comparator,
  64. std::unique_ptr<RandomAccessFileReader>&& file,
  65. uint64_t file_size, std::unique_ptr<TableReader>* table,
  66. const int bloom_bits_per_key, double hash_table_ratio,
  67. size_t index_sparseness, size_t huge_page_tlb_size,
  68. bool full_scan_mode, const bool immortal_table = false,
  69. const SliceTransform* prefix_extractor = nullptr);
  70. // Returns new iterator over table contents
  71. // compaction_readahead_size: its value will only be used if for_compaction =
  72. // true
  73. InternalIterator* NewIterator(const ReadOptions&,
  74. const SliceTransform* prefix_extractor,
  75. Arena* arena, bool skip_filters,
  76. TableReaderCaller caller,
  77. size_t compaction_readahead_size = 0) override;
  78. void Prepare(const Slice& target) override;
  79. Status Get(const ReadOptions& readOptions, const Slice& key,
  80. GetContext* get_context, const SliceTransform* prefix_extractor,
  81. bool skip_filters = false) override;
  82. uint64_t ApproximateOffsetOf(const Slice& key,
  83. TableReaderCaller caller) override;
  84. uint64_t ApproximateSize(const Slice& start, const Slice& end,
  85. TableReaderCaller caller) override;
  86. uint32_t GetIndexSize() const { return index_.GetIndexSize(); }
  87. void SetupForCompaction() override;
  88. std::shared_ptr<const TableProperties> GetTableProperties() const override {
  89. return table_properties_;
  90. }
  91. virtual size_t ApproximateMemoryUsage() const override {
  92. return arena_.MemoryAllocatedBytes();
  93. }
  94. PlainTableReader(const ImmutableCFOptions& ioptions,
  95. std::unique_ptr<RandomAccessFileReader>&& file,
  96. const EnvOptions& env_options,
  97. const InternalKeyComparator& internal_comparator,
  98. EncodingType encoding_type, uint64_t file_size,
  99. const TableProperties* table_properties,
  100. const SliceTransform* prefix_extractor);
  101. virtual ~PlainTableReader();
  102. protected:
  103. // Check bloom filter to see whether it might contain this prefix.
  104. // The hash of the prefix is given, since it can be reused for index lookup
  105. // too.
  106. virtual bool MatchBloom(uint32_t hash) const;
  107. // PopulateIndex() builds index of keys. It must be called before any query
  108. // to the table.
  109. //
  110. // props: the table properties object that need to be stored. Ownership of
  111. // the object will be passed.
  112. //
  113. Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
  114. double hash_table_ratio, size_t index_sparseness,
  115. size_t huge_page_tlb_size);
  116. Status MmapDataIfNeeded();
  117. private:
  118. const InternalKeyComparator internal_comparator_;
  119. EncodingType encoding_type_;
  120. // represents plain table's current status.
  121. Status status_;
  122. PlainTableIndex index_;
  123. bool full_scan_mode_;
  124. // data_start_offset_ and data_end_offset_ defines the range of the
  125. // sst file that stores data.
  126. const uint32_t data_start_offset_ = 0;
  127. const uint32_t user_key_len_;
  128. const SliceTransform* prefix_extractor_;
  129. static const size_t kNumInternalBytes = 8;
  130. // Bloom filter is used to rule out non-existent key
  131. bool enable_bloom_;
  132. PlainTableBloomV1 bloom_;
  133. PlainTableReaderFileInfo file_info_;
  134. Arena arena_;
  135. CacheAllocationPtr index_block_alloc_;
  136. CacheAllocationPtr bloom_block_alloc_;
  137. const ImmutableCFOptions& ioptions_;
  138. std::unique_ptr<Cleanable> dummy_cleanable_;
  139. uint64_t file_size_;
  140. protected: // for testing
  141. std::shared_ptr<const TableProperties> table_properties_;
  142. private:
  143. bool IsFixedLength() const {
  144. return user_key_len_ != kPlainTableVariableLength;
  145. }
  146. size_t GetFixedInternalKeyLength() const {
  147. return user_key_len_ + kNumInternalBytes;
  148. }
  149. Slice GetPrefix(const Slice& target) const {
  150. assert(target.size() >= 8); // target is internal key
  151. return GetPrefixFromUserKey(GetUserKey(target));
  152. }
  153. Slice GetPrefix(const ParsedInternalKey& target) const {
  154. return GetPrefixFromUserKey(target.user_key);
  155. }
  156. Slice GetUserKey(const Slice& key) const {
  157. return Slice(key.data(), key.size() - 8);
  158. }
  159. Slice GetPrefixFromUserKey(const Slice& user_key) const {
  160. if (!IsTotalOrderMode()) {
  161. return prefix_extractor_->Transform(user_key);
  162. } else {
  163. // Use empty slice as prefix if prefix_extractor is not set.
  164. // In that case,
  165. // it falls back to pure binary search and
  166. // total iterator seek is supported.
  167. return Slice();
  168. }
  169. }
  170. friend class TableCache;
  171. friend class PlainTableIterator;
  172. // Internal helper function to generate an IndexRecordList object from all
  173. // the rows, which contains index records as a list.
  174. // If bloom_ is not null, all the keys' full-key hash will be added to the
  175. // bloom filter.
  176. Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder,
  177. std::vector<uint32_t>* prefix_hashes);
  178. // Internal helper function to allocate memory for bloom filter
  179. void AllocateBloom(int bloom_bits_per_key, int num_prefixes,
  180. size_t huge_page_tlb_size);
  181. void FillBloom(const std::vector<uint32_t>& prefix_hashes);
  182. // Read the key and value at `offset` to parameters for keys, the and
  183. // `seekable`.
  184. // On success, `offset` will be updated as the offset for the next key.
  185. // `parsed_key` will be key in parsed format.
  186. // if `internal_key` is not empty, it will be filled with key with slice
  187. // format.
  188. // if `seekable` is not null, it will return whether we can directly read
  189. // data using this offset.
  190. Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
  191. ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
  192. bool* seekable = nullptr) const;
  193. // Get file offset for key target.
  194. // return value prefix_matched is set to true if the offset is confirmed
  195. // for a key with the same prefix as target.
  196. Status GetOffset(PlainTableKeyDecoder* decoder, const Slice& target,
  197. const Slice& prefix, uint32_t prefix_hash,
  198. bool& prefix_matched, uint32_t* offset) const;
  199. bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
  200. // No copying allowed
  201. explicit PlainTableReader(const TableReader&) = delete;
  202. void operator=(const TableReader&) = delete;
  203. };
  204. } // namespace ROCKSDB_NAMESPACE
  205. #endif // ROCKSDB_LITE