table_cache.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. //
  10. // Thread-safe (provides internal synchronization)
  11. #pragma once
  12. #include <cstdint>
  13. #include <string>
  14. #include <vector>
  15. #include "cache/typed_cache.h"
  16. #include "db/dbformat.h"
  17. #include "db/range_del_aggregator.h"
  18. #include "options/cf_options.h"
  19. #include "port/port.h"
  20. #include "rocksdb/cache.h"
  21. #include "rocksdb/env.h"
  22. #include "rocksdb/options.h"
  23. #include "rocksdb/table.h"
  24. #include "table/table_reader.h"
  25. #include "trace_replay/block_cache_tracer.h"
  26. #include "util/coro_utils.h"
  27. namespace ROCKSDB_NAMESPACE {
  28. class Env;
  29. class Arena;
  30. struct FileDescriptor;
  31. class GetContext;
  32. class HistogramImpl;
  33. // Manages caching for TableReader objects for a column family. The actual
  34. // cache is allocated separately and passed to the constructor. TableCache
  35. // wraps around the underlying SST file readers by providing Get(),
  36. // MultiGet() and NewIterator() methods that hide the instantiation,
  37. // caching and access to the TableReader. The main purpose of this is
  38. // performance - by caching the TableReader, it avoids unnecessary file opens
  39. // and object allocation and instantiation. One exception is compaction, where
  40. // a new TableReader may be instantiated - see NewIterator() comments
  41. //
  42. // Another service provided by TableCache is managing the row cache - if the
  43. // DB is configured with a row cache, and the lookup key is present in the row
  44. // cache, lookup is very fast. The row cache is obtained from
  45. // ioptions.row_cache
  46. class TableCache {
  47. public:
  48. TableCache(const ImmutableOptions& ioptions,
  49. const FileOptions* storage_options, Cache* cache,
  50. BlockCacheTracer* const block_cache_tracer,
  51. const std::shared_ptr<IOTracer>& io_tracer,
  52. const std::string& db_session_id);
  53. ~TableCache();
  54. // Cache interface for table cache
  55. using CacheInterface =
  56. BasicTypedCacheInterface<TableReader, CacheEntryRole::kMisc>;
  57. using TypedHandle = CacheInterface::TypedHandle;
  58. // Cache interface for row cache
  59. using RowCacheInterface =
  60. BasicTypedCacheInterface<std::string, CacheEntryRole::kMisc>;
  61. using RowHandle = RowCacheInterface::TypedHandle;
  62. // Return an iterator for the specified file number (the corresponding
  63. // file length must be exactly "file_size" bytes). If "table_reader_ptr"
  64. // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
  65. // underlying the returned iterator, or nullptr if no Table object underlies
  66. // the returned iterator. The returned "*table_reader_ptr" object is owned
  67. // by the cache and should not be deleted, and is valid for as long as the
  68. // returned iterator is live.
  69. // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
  70. // then range_del_iter is set to a TruncatedRangeDelIterator for range
  71. // tombstones in the SST file corresponding to the specified file number. The
  72. // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
  73. // file's boundary.
  74. // @param options Must outlive the returned iterator.
  75. // @param range_del_agg If non-nullptr, adds range deletions to the
  76. // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
  77. // @param for_compaction If true, a new TableReader may be allocated (but
  78. // not cached), depending on the CF options
  79. // @param skip_filters Disables loading/accessing the filter block
  80. // @param level The level this table is at, -1 for "not set / don't know"
  81. // @param range_del_read_seqno If non-nullptr, will be used to create
  82. // *range_del_iter.
  83. InternalIterator* NewIterator(
  84. const ReadOptions& options, const FileOptions& toptions,
  85. const InternalKeyComparator& internal_comparator,
  86. const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
  87. const MutableCFOptions& mutable_cf_options,
  88. TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
  89. TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
  90. size_t max_file_size_for_l0_meta_pin,
  91. const InternalKey* smallest_compaction_key,
  92. const InternalKey* largest_compaction_key, bool allow_unprepared_value,
  93. const SequenceNumber* range_del_read_seqno = nullptr,
  94. std::unique_ptr<TruncatedRangeDelIterator>* range_del_iter = nullptr);
  95. // If a seek to internal key "k" in specified file finds an entry,
  96. // call get_context->SaveValue() repeatedly until
  97. // it returns false. As a side effect, it will insert the TableReader
  98. // into the cache and potentially evict another entry
  99. // @param get_context Context for get operation. The result of the lookup
  100. // can be retrieved by calling get_context->State()
  101. // @param file_read_hist If non-nullptr, the file reader statistics are
  102. // recorded
  103. // @param skip_filters Disables loading/accessing the filter block
  104. // @param level The level this table is at, -1 for "not set / don't know"
  105. Status Get(const ReadOptions& options,
  106. const InternalKeyComparator& internal_comparator,
  107. const FileMetaData& file_meta, const Slice& k,
  108. GetContext* get_context,
  109. const MutableCFOptions& mutable_cf_options,
  110. HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
  111. int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
  112. // Return the range delete tombstone iterator of the file specified by
  113. // `file_meta`.
  114. Status GetRangeTombstoneIterator(
  115. const ReadOptions& options,
  116. const InternalKeyComparator& internal_comparator,
  117. const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options,
  118. std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
  119. // Call table reader's MultiGetFilter to use the bloom filter to filter out
  120. // keys. Returns Status::NotSupported() if row cache needs to be checked.
  121. // If the table cache is looked up to get the table reader, the cache handle
  122. // is returned in table_handle. This handle should be passed back to
  123. // MultiGet() so it can be released.
  124. Status MultiGetFilter(const ReadOptions& options,
  125. const InternalKeyComparator& internal_comparator,
  126. const FileMetaData& file_meta,
  127. const MutableCFOptions& mutable_cf_options,
  128. HistogramImpl* file_read_hist, int level,
  129. MultiGetContext::Range* mget_range,
  130. TypedHandle** table_handle);
  131. // If a seek to internal key "k" in specified file finds an entry,
  132. // call get_context->SaveValue() repeatedly until
  133. // it returns false. As a side effect, it will insert the TableReader
  134. // into the cache and potentially evict another entry
  135. // @param mget_range Pointer to the structure describing a batch of keys to
  136. // be looked up in this table file. The result is stored
  137. // in the embedded GetContext
  138. // @param skip_filters Disables loading/accessing the filter block
  139. // @param level The level this table is at, -1 for "not set / don't know"
  140. DECLARE_SYNC_AND_ASYNC(Status, MultiGet, const ReadOptions& options,
  141. const InternalKeyComparator& internal_comparator,
  142. const FileMetaData& file_meta,
  143. const MultiGetContext::Range* mget_range,
  144. const MutableCFOptions& mutable_cf_options,
  145. HistogramImpl* file_read_hist = nullptr,
  146. bool skip_filters = false,
  147. bool skip_range_deletions = false, int level = -1,
  148. TypedHandle* table_handle = nullptr);
  149. // Evict any entry for the specified file number. ReleaseObsolete() is
  150. // preferred for cleaning up from obsolete files.
  151. static void Evict(Cache* cache, uint64_t file_number);
  152. // Handles releasing, erasing, etc. of what should be the last reference
  153. // to an obsolete file. `handle` may be nullptr if no prior handle is known.
  154. static void ReleaseObsolete(Cache* cache, uint64_t file_number,
  155. Cache::Handle* handle,
  156. uint32_t uncache_aggressiveness);
  157. // Return handle to an existing cache entry if there is one
  158. static Cache::Handle* Lookup(Cache* cache, uint64_t file_number);
  159. // Find table reader
  160. // @param skip_filters Disables loading/accessing the filter block
  161. // @param level == -1 means not specified
  162. Status FindTable(const ReadOptions& ro, const FileOptions& toptions,
  163. const InternalKeyComparator& internal_comparator,
  164. const FileMetaData& file_meta, TypedHandle**,
  165. const MutableCFOptions& mutable_cf_options,
  166. const bool no_io = false,
  167. HistogramImpl* file_read_hist = nullptr,
  168. bool skip_filters = false, int level = -1,
  169. bool prefetch_index_and_filter_in_cache = true,
  170. size_t max_file_size_for_l0_meta_pin = 0,
  171. Temperature file_temperature = Temperature::kUnknown);
  172. // Get the table properties of a given table.
  173. // @no_io: indicates if we should load table to the cache if it is not present
  174. // in table cache yet.
  175. // @returns: `properties` will be reset on success. Please note that we will
  176. // return Status::Incomplete() if table is not present in cache and
  177. // we set `no_io` to be true.
  178. Status GetTableProperties(const FileOptions& toptions,
  179. const ReadOptions& read_options,
  180. const InternalKeyComparator& internal_comparator,
  181. const FileMetaData& file_meta,
  182. std::shared_ptr<const TableProperties>* properties,
  183. const MutableCFOptions& mutable_cf_options,
  184. bool no_io = false);
  185. Status ApproximateKeyAnchors(const ReadOptions& ro,
  186. const InternalKeyComparator& internal_comparator,
  187. const FileMetaData& file_meta,
  188. const MutableCFOptions& mutable_cf_options,
  189. std::vector<TableReader::Anchor>& anchors);
  190. // Return total memory usage of the table reader of the file.
  191. // 0 if table reader of the file is not loaded.
  192. size_t GetMemoryUsageByTableReader(
  193. const FileOptions& toptions, const ReadOptions& read_options,
  194. const InternalKeyComparator& internal_comparator,
  195. const FileMetaData& file_meta,
  196. const MutableCFOptions& mutable_cf_options);
  197. // Returns approximated offset of a key in a file represented by fd.
  198. uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
  199. const Slice& key, const FileMetaData& file_meta,
  200. TableReaderCaller caller,
  201. const InternalKeyComparator& internal_comparator,
  202. const MutableCFOptions& mutable_cf_options);
  203. // Returns approximated data size between start and end keys in a file
  204. // represented by fd (the start key must not be greater than the end key).
  205. uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start,
  206. const Slice& end, const FileMetaData& file_meta,
  207. TableReaderCaller caller,
  208. const InternalKeyComparator& internal_comparator,
  209. const MutableCFOptions& mutable_cf_options);
  210. CacheInterface& get_cache() { return cache_; }
  211. // Capacity of the backing Cache that indicates infinite TableCache capacity.
  212. // For example when max_open_files is -1 we set the backing Cache to this.
  213. static const int kInfiniteCapacity = 0x400000;
  214. // The tables opened with this TableCache will be immortal, i.e., their
  215. // lifetime is as long as that of the DB.
  216. void SetTablesAreImmortal() {
  217. if (cache_.get()->GetCapacity() >= kInfiniteCapacity) {
  218. immortal_tables_ = true;
  219. }
  220. }
  221. private:
  222. // Build a table reader
  223. Status GetTableReader(const ReadOptions& ro, const FileOptions& file_options,
  224. const InternalKeyComparator& internal_comparator,
  225. const FileMetaData& file_meta, bool sequential_mode,
  226. HistogramImpl* file_read_hist,
  227. std::unique_ptr<TableReader>* table_reader,
  228. const MutableCFOptions& mutable_cf_options,
  229. bool skip_filters = false, int level = -1,
  230. bool prefetch_index_and_filter_in_cache = true,
  231. size_t max_file_size_for_l0_meta_pin = 0,
  232. Temperature file_temperature = Temperature::kUnknown);
  233. // Update the max_covering_tombstone_seq in the GetContext for each key based
  234. // on the range deletions in the table
  235. void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
  236. MultiGetContext::Range& table_range);
  237. // Create a key prefix for looking up the row cache. The prefix is of the
  238. // format row_cache_id + fd_number + seq_no. Later, the user key can be
  239. // appended to form the full key
  240. // Return the sequence number that determines the visibility of row_cache_key
  241. uint64_t CreateRowCacheKeyPrefix(const ReadOptions& options,
  242. const FileDescriptor& fd,
  243. const Slice& internal_key,
  244. GetContext* get_context,
  245. IterKey& row_cache_key);
  246. // Helper function to lookup the row cache for a key. It appends the
  247. // user key to row_cache_key at offset prefix_size
  248. bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
  249. size_t prefix_size, GetContext* get_context,
  250. Status* read_status,
  251. SequenceNumber seq_no = kMaxSequenceNumber);
  252. const ImmutableOptions& ioptions_;
  253. const FileOptions& file_options_;
  254. CacheInterface cache_;
  255. std::string row_cache_id_;
  256. bool immortal_tables_;
  257. BlockCacheTracer* const block_cache_tracer_;
  258. Striped<CacheAlignedWrapper<port::Mutex>> loader_mutex_;
  259. std::shared_ptr<IOTracer> io_tracer_;
  260. std::string db_session_id_;
  261. };
  262. } // namespace ROCKSDB_NAMESPACE