table_cache.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. //
  10. // Thread-safe (provides internal synchronization)
  11. #pragma once
  12. #include <string>
  13. #include <vector>
  14. #include <stdint.h>
  15. #include "db/dbformat.h"
  16. #include "db/range_del_aggregator.h"
  17. #include "options/cf_options.h"
  18. #include "port/port.h"
  19. #include "rocksdb/cache.h"
  20. #include "rocksdb/env.h"
  21. #include "rocksdb/options.h"
  22. #include "rocksdb/table.h"
  23. #include "table/table_reader.h"
  24. #include "trace_replay/block_cache_tracer.h"
  25. namespace ROCKSDB_NAMESPACE {
  // Forward declarations. This header refers to these types (where it refers
  // to them at all) only through pointers or references, so their full
  // definitions are not required here.
  class Arena;
  class Env;
  class GetContext;
  class HistogramImpl;
  struct FileDescriptor;
  31. // Manages caching for TableReader objects for a column family. The actual
  32. // cache is allocated separately and passed to the constructor. TableCache
  33. // wraps around the underlying SST file readers by providing Get(),
  34. // MultiGet() and NewIterator() methods that hide the instantiation,
  35. // caching and access to the TableReader. The main purpose of this is
  36. // performance - by caching the TableReader, it avoids unnecessary file opens
  37. // and object allocation and instantiation. One exception is compaction, where
  38. // a new TableReader may be instantiated - see NewIterator() comments
  39. //
  40. // Another service provided by TableCache is managing the row cache - if the
  41. // DB is configured with a row cache, and the lookup key is present in the row
  42. // cache, lookup is very fast. The row cache is obtained from
  43. // ioptions.row_cache
  44. class TableCache {
  45. public:
  46. TableCache(const ImmutableCFOptions& ioptions,
  47. const FileOptions& storage_options, Cache* cache,
  48. BlockCacheTracer* const block_cache_tracer);
  49. ~TableCache();
  50. // Return an iterator for the specified file number (the corresponding
  51. // file length must be exactly "file_size" bytes). If "table_reader_ptr"
  52. // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
  53. // underlying the returned iterator, or nullptr if no Table object underlies
  54. // the returned iterator. The returned "*table_reader_ptr" object is owned
  55. // by the cache and should not be deleted, and is valid for as long as the
  56. // returned iterator is live.
  57. // @param range_del_agg If non-nullptr, adds range deletions to the
  58. // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
  59. // @param for_compaction If true, a new TableReader may be allocated (but
  60. // not cached), depending on the CF options
  61. // @param skip_filters Disables loading/accessing the filter block
  62. // @param level The level this table is at, -1 for "not set / don't know"
  63. InternalIterator* NewIterator(
  64. const ReadOptions& options, const FileOptions& toptions,
  65. const InternalKeyComparator& internal_comparator,
  66. const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
  67. const SliceTransform* prefix_extractor, TableReader** table_reader_ptr,
  68. HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena,
  69. bool skip_filters, int level, const InternalKey* smallest_compaction_key,
  70. const InternalKey* largest_compaction_key);
  71. // If a seek to internal key "k" in specified file finds an entry,
  72. // call get_context->SaveValue() repeatedly until
  73. // it returns false. As a side effect, it will insert the TableReader
  74. // into the cache and potentially evict another entry
  75. // @param get_context Context for get operation. The result of the lookup
  76. // can be retrieved by calling get_context->State()
  77. // @param file_read_hist If non-nullptr, the file reader statistics are
  78. // recorded
  79. // @param skip_filters Disables loading/accessing the filter block
  80. // @param level The level this table is at, -1 for "not set / don't know"
  81. Status Get(const ReadOptions& options,
  82. const InternalKeyComparator& internal_comparator,
  83. const FileMetaData& file_meta, const Slice& k,
  84. GetContext* get_context,
  85. const SliceTransform* prefix_extractor = nullptr,
  86. HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
  87. int level = -1);
  88. // Return the range delete tombstone iterator of the file specified by
  89. // `file_meta`.
  90. Status GetRangeTombstoneIterator(
  91. const ReadOptions& options,
  92. const InternalKeyComparator& internal_comparator,
  93. const FileMetaData& file_meta,
  94. std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
  95. // If a seek to internal key "k" in specified file finds an entry,
  96. // call get_context->SaveValue() repeatedly until
  97. // it returns false. As a side effect, it will insert the TableReader
  98. // into the cache and potentially evict another entry
  99. // @param mget_range Pointer to the structure describing a batch of keys to
  100. // be looked up in this table file. The result is stored
  101. // in the embedded GetContext
  102. // @param skip_filters Disables loading/accessing the filter block
  103. // @param level The level this table is at, -1 for "not set / don't know"
  104. Status MultiGet(const ReadOptions& options,
  105. const InternalKeyComparator& internal_comparator,
  106. const FileMetaData& file_meta,
  107. const MultiGetContext::Range* mget_range,
  108. const SliceTransform* prefix_extractor = nullptr,
  109. HistogramImpl* file_read_hist = nullptr,
  110. bool skip_filters = false, int level = -1);
  111. // Evict any entry for the specified file number
  112. static void Evict(Cache* cache, uint64_t file_number);
  113. // Clean table handle and erase it from the table cache
  114. // Used in DB close, or the file is not live anymore.
  115. void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
  116. // Find table reader
  117. // @param skip_filters Disables loading/accessing the filter block
  118. // @param level == -1 means not specified
  119. Status FindTable(const FileOptions& toptions,
  120. const InternalKeyComparator& internal_comparator,
  121. const FileDescriptor& file_fd, Cache::Handle**,
  122. const SliceTransform* prefix_extractor = nullptr,
  123. const bool no_io = false, bool record_read_stats = true,
  124. HistogramImpl* file_read_hist = nullptr,
  125. bool skip_filters = false, int level = -1,
  126. bool prefetch_index_and_filter_in_cache = true);
  127. // Get TableReader from a cache handle.
  128. TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
  129. // Get the table properties of a given table.
  130. // @no_io: indicates if we should load table to the cache if it is not present
  131. // in table cache yet.
  132. // @returns: `properties` will be reset on success. Please note that we will
  133. // return Status::Incomplete() if table is not present in cache and
  134. // we set `no_io` to be true.
  135. Status GetTableProperties(const FileOptions& toptions,
  136. const InternalKeyComparator& internal_comparator,
  137. const FileDescriptor& file_meta,
  138. std::shared_ptr<const TableProperties>* properties,
  139. const SliceTransform* prefix_extractor = nullptr,
  140. bool no_io = false);
  141. // Return total memory usage of the table reader of the file.
  142. // 0 if table reader of the file is not loaded.
  143. size_t GetMemoryUsageByTableReader(
  144. const FileOptions& toptions,
  145. const InternalKeyComparator& internal_comparator,
  146. const FileDescriptor& fd,
  147. const SliceTransform* prefix_extractor = nullptr);
  148. // Returns approximated offset of a key in a file represented by fd.
  149. uint64_t ApproximateOffsetOf(
  150. const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
  151. const InternalKeyComparator& internal_comparator,
  152. const SliceTransform* prefix_extractor = nullptr);
  153. // Returns approximated data size between start and end keys in a file
  154. // represented by fd (the start key must not be greater than the end key).
  155. uint64_t ApproximateSize(const Slice& start, const Slice& end,
  156. const FileDescriptor& fd, TableReaderCaller caller,
  157. const InternalKeyComparator& internal_comparator,
  158. const SliceTransform* prefix_extractor = nullptr);
  159. // Release the handle from a cache
  160. void ReleaseHandle(Cache::Handle* handle);
  161. Cache* get_cache() const { return cache_; }
  162. // Capacity of the backing Cache that indicates inifinite TableCache capacity.
  163. // For example when max_open_files is -1 we set the backing Cache to this.
  164. static const int kInfiniteCapacity = 0x400000;
  165. // The tables opened with this TableCache will be immortal, i.e., their
  166. // lifetime is as long as that of the DB.
  167. void SetTablesAreImmortal() {
  168. if (cache_->GetCapacity() >= kInfiniteCapacity) {
  169. immortal_tables_ = true;
  170. }
  171. }
  172. private:
  173. // Build a table reader
  174. Status GetTableReader(const FileOptions& file_options,
  175. const InternalKeyComparator& internal_comparator,
  176. const FileDescriptor& fd, bool sequential_mode,
  177. bool record_read_stats, HistogramImpl* file_read_hist,
  178. std::unique_ptr<TableReader>* table_reader,
  179. const SliceTransform* prefix_extractor = nullptr,
  180. bool skip_filters = false, int level = -1,
  181. bool prefetch_index_and_filter_in_cache = true);
  182. // Create a key prefix for looking up the row cache. The prefix is of the
  183. // format row_cache_id + fd_number + seq_no. Later, the user key can be
  184. // appended to form the full key
  185. void CreateRowCacheKeyPrefix(const ReadOptions& options,
  186. const FileDescriptor& fd,
  187. const Slice& internal_key,
  188. GetContext* get_context, IterKey& row_cache_key);
  189. // Helper function to lookup the row cache for a key. It appends the
  190. // user key to row_cache_key at offset prefix_size
  191. bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
  192. size_t prefix_size, GetContext* get_context);
  193. const ImmutableCFOptions& ioptions_;
  194. const FileOptions& file_options_;
  195. Cache* const cache_;
  196. std::string row_cache_id_;
  197. bool immortal_tables_;
  198. BlockCacheTracer* const block_cache_tracer_;
  199. };
  200. } // namespace ROCKSDB_NAMESPACE