block_cache_tracer.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  #include <atomic>
  #include <fstream>
  #include <utility>

  #include "monitoring/instrumented_mutex.h"
  #include "rocksdb/env.h"
  #include "rocksdb/options.h"
  #include "rocksdb/trace_reader_writer.h"
  #include "table/table_reader_caller.h"
  #include "trace_replay/trace_replay.h"
  14. namespace ROCKSDB_NAMESPACE {
  // Forward declaration so that the helper class below can refer to the record
  // type before it is fully defined.
  struct BlockCacheTraceRecord;

  // Time-unit conversion constants. Only declared here; the values are defined
  // elsewhere.
  extern const uint64_t kMicrosInSecond;
  extern const uint64_t kSecondInMinute;
  extern const uint64_t kSecondInHour;
  19. class BlockCacheTraceHelper {
  20. public:
  21. static bool IsGetOrMultiGetOnDataBlock(TraceType block_type,
  22. TableReaderCaller caller);
  23. static bool IsGetOrMultiGet(TableReaderCaller caller);
  24. static bool IsUserAccess(TableReaderCaller caller);
  25. // Row key is a concatenation of the access's fd_number and the referenced
  26. // user key.
  27. static std::string ComputeRowKey(const BlockCacheTraceRecord& access);
  28. // The first four bytes of the referenced key in a Get request is the table
  29. // id.
  30. static uint64_t GetTableId(const BlockCacheTraceRecord& access);
  31. // The sequence number of a get request is the last part of the referenced
  32. // key.
  33. static uint64_t GetSequenceNumber(const BlockCacheTraceRecord& access);
  34. // Block offset in a file is the last varint64 in the block key.
  35. static uint64_t GetBlockOffsetInFile(const BlockCacheTraceRecord& access);
  36. static const std::string kUnknownColumnFamilyName;
  37. static const uint64_t kReservedGetId;
  38. };
  39. // Lookup context for tracing block cache accesses.
  40. // We trace block accesses at five places:
  41. // 1. BlockBasedTable::GetFilter
  42. // 2. BlockBasedTable::GetUncompressedDict.
  43. // 3. BlockBasedTable::MaybeReadAndLoadToCache. (To trace access on data, index,
  44. // and range deletion block.)
  45. // 4. BlockBasedTable::Get. (To trace the referenced key and whether the
  46. // referenced key exists in a fetched data block.)
  47. // 5. BlockBasedTable::MultiGet. (To trace the referenced key and whether the
  48. // referenced key exists in a fetched data block.)
  49. // The context is created at:
  50. // 1. BlockBasedTable::Get. (kUserGet)
  51. // 2. BlockBasedTable::MultiGet. (kUserMGet)
  52. // 3. BlockBasedTable::NewIterator. (either kUserIterator, kCompaction, or
  53. // external SST ingestion calls this function.)
  54. // 4. BlockBasedTable::Open. (kPrefetch)
  55. // 5. Index/Filter::CacheDependencies. (kPrefetch)
  56. // 6. BlockBasedTable::ApproximateOffsetOf. (kCompaction or
  57. // kUserApproximateSize).
  58. struct BlockCacheLookupContext {
  59. BlockCacheLookupContext(const TableReaderCaller& _caller) : caller(_caller) {}
  60. BlockCacheLookupContext(const TableReaderCaller& _caller, uint64_t _get_id,
  61. bool _get_from_user_specified_snapshot)
  62. : caller(_caller),
  63. get_id(_get_id),
  64. get_from_user_specified_snapshot(_get_from_user_specified_snapshot) {}
  65. const TableReaderCaller caller;
  66. // These are populated when we perform lookup/insert on block cache. The block
  67. // cache tracer uses these inforation when logging the block access at
  68. // BlockBasedTable::GET and BlockBasedTable::MultiGet.
  69. bool is_cache_hit = false;
  70. bool no_insert = false;
  71. TraceType block_type = TraceType::kTraceMax;
  72. uint64_t block_size = 0;
  73. std::string block_key;
  74. uint64_t num_keys_in_block = 0;
  75. // The unique id associated with Get and MultiGet. This enables us to track
  76. // how many blocks a Get/MultiGet request accesses. We can also measure the
  77. // impact of row cache vs block cache.
  78. uint64_t get_id = 0;
  79. std::string referenced_key;
  80. bool get_from_user_specified_snapshot = false;
  81. void FillLookupContext(bool _is_cache_hit, bool _no_insert,
  82. TraceType _block_type, uint64_t _block_size,
  83. const std::string& _block_key,
  84. uint64_t _num_keys_in_block) {
  85. is_cache_hit = _is_cache_hit;
  86. no_insert = _no_insert;
  87. block_type = _block_type;
  88. block_size = _block_size;
  89. block_key = _block_key;
  90. num_keys_in_block = _num_keys_in_block;
  91. }
  92. };
  // One-byte boolean (underlying type char) used for the flag fields of
  // BlockCacheTraceRecord.
  enum Boolean : char {
    kFalse = 0,
    kTrue = 1,
  };
  94. struct BlockCacheTraceRecord {
  95. // Required fields for all accesses.
  96. uint64_t access_timestamp = 0;
  97. std::string block_key;
  98. TraceType block_type = TraceType::kTraceMax;
  99. uint64_t block_size = 0;
  100. uint64_t cf_id = 0;
  101. std::string cf_name;
  102. uint32_t level = 0;
  103. uint64_t sst_fd_number = 0;
  104. TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
  105. Boolean is_cache_hit = Boolean::kFalse;
  106. Boolean no_insert = Boolean::kFalse;
  107. // Required field for Get and MultiGet
  108. uint64_t get_id = BlockCacheTraceHelper::kReservedGetId;
  109. Boolean get_from_user_specified_snapshot = Boolean::kFalse;
  110. std::string referenced_key;
  111. // Required fields for data block and user Get/Multi-Get only.
  112. uint64_t referenced_data_size = 0;
  113. uint64_t num_keys_in_block = 0;
  114. Boolean referenced_key_exist_in_block = Boolean::kFalse;
  115. BlockCacheTraceRecord() {}
  116. BlockCacheTraceRecord(
  117. uint64_t _access_timestamp, std::string _block_key, TraceType _block_type,
  118. uint64_t _block_size, uint64_t _cf_id, std::string _cf_name,
  119. uint32_t _level, uint64_t _sst_fd_number, TableReaderCaller _caller,
  120. bool _is_cache_hit, bool _no_insert,
  121. uint64_t _get_id = BlockCacheTraceHelper::kReservedGetId,
  122. bool _get_from_user_specified_snapshot = false,
  123. std::string _referenced_key = "", uint64_t _referenced_data_size = 0,
  124. uint64_t _num_keys_in_block = 0,
  125. bool _referenced_key_exist_in_block = false)
  126. : access_timestamp(_access_timestamp),
  127. block_key(_block_key),
  128. block_type(_block_type),
  129. block_size(_block_size),
  130. cf_id(_cf_id),
  131. cf_name(_cf_name),
  132. level(_level),
  133. sst_fd_number(_sst_fd_number),
  134. caller(_caller),
  135. is_cache_hit(_is_cache_hit ? Boolean::kTrue : Boolean::kFalse),
  136. no_insert(_no_insert ? Boolean::kTrue : Boolean::kFalse),
  137. get_id(_get_id),
  138. get_from_user_specified_snapshot(_get_from_user_specified_snapshot
  139. ? Boolean::kTrue
  140. : Boolean::kFalse),
  141. referenced_key(_referenced_key),
  142. referenced_data_size(_referenced_data_size),
  143. num_keys_in_block(_num_keys_in_block),
  144. referenced_key_exist_in_block(
  145. _referenced_key_exist_in_block ? Boolean::kTrue : Boolean::kFalse) {
  146. }
  147. };
  // Metadata stored at the beginning of a block cache trace.
  // All fields default to zero so that a header that was never filled in (for
  // example because reading it failed) does not hold indeterminate values.
  struct BlockCacheTraceHeader {
    uint64_t start_time = 0;
    uint32_t rocksdb_major_version = 0;
    uint32_t rocksdb_minor_version = 0;
  };
  153. // BlockCacheTraceWriter captures all RocksDB block cache accesses using a
  154. // user-provided TraceWriter. Every RocksDB operation is written as a single
  155. // trace. Each trace will have a timestamp and type, followed by the trace
  156. // payload.
  157. class BlockCacheTraceWriter {
  158. public:
  159. BlockCacheTraceWriter(Env* env, const TraceOptions& trace_options,
  160. std::unique_ptr<TraceWriter>&& trace_writer);
  161. ~BlockCacheTraceWriter() = default;
  162. // No copy and move.
  163. BlockCacheTraceWriter(const BlockCacheTraceWriter&) = delete;
  164. BlockCacheTraceWriter& operator=(const BlockCacheTraceWriter&) = delete;
  165. BlockCacheTraceWriter(BlockCacheTraceWriter&&) = delete;
  166. BlockCacheTraceWriter& operator=(BlockCacheTraceWriter&&) = delete;
  167. // Pass Slice references to avoid copy.
  168. Status WriteBlockAccess(const BlockCacheTraceRecord& record,
  169. const Slice& block_key, const Slice& cf_name,
  170. const Slice& referenced_key);
  171. // Write a trace header at the beginning, typically on initiating a trace,
  172. // with some metadata like a magic number and RocksDB version.
  173. Status WriteHeader();
  174. private:
  175. Env* env_;
  176. TraceOptions trace_options_;
  177. std::unique_ptr<TraceWriter> trace_writer_;
  178. };
  179. // Write a trace record in human readable format, see
  180. // https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
  181. // for details.
  182. class BlockCacheHumanReadableTraceWriter {
  183. public:
  184. ~BlockCacheHumanReadableTraceWriter();
  185. Status NewWritableFile(const std::string& human_readable_trace_file_path,
  186. ROCKSDB_NAMESPACE::Env* env);
  187. Status WriteHumanReadableTraceRecord(const BlockCacheTraceRecord& access,
  188. uint64_t block_id, uint64_t get_key_id);
  189. private:
  190. char trace_record_buffer_[1024 * 1024];
  191. std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile>
  192. human_readable_trace_file_writer_;
  193. };
  194. // BlockCacheTraceReader helps read the trace file generated by
  195. // BlockCacheTraceWriter using a user provided TraceReader.
  196. class BlockCacheTraceReader {
  197. public:
  198. BlockCacheTraceReader(std::unique_ptr<TraceReader>&& reader);
  199. ~BlockCacheTraceReader() = default;
  200. // No copy and move.
  201. BlockCacheTraceReader(const BlockCacheTraceReader&) = delete;
  202. BlockCacheTraceReader& operator=(const BlockCacheTraceReader&) = delete;
  203. BlockCacheTraceReader(BlockCacheTraceReader&&) = delete;
  204. BlockCacheTraceReader& operator=(BlockCacheTraceReader&&) = delete;
  205. Status ReadHeader(BlockCacheTraceHeader* header);
  206. Status ReadAccess(BlockCacheTraceRecord* record);
  207. private:
  208. std::unique_ptr<TraceReader> trace_reader_;
  209. };
  210. // Read a trace record in human readable format, see
  211. // https://github.com/facebook/rocksdb/wiki/Block-cache-analysis-and-simulation-tools#trace-format
  212. // for detailed.
  213. class BlockCacheHumanReadableTraceReader : public BlockCacheTraceReader {
  214. public:
  215. BlockCacheHumanReadableTraceReader(const std::string& trace_file_path);
  216. ~BlockCacheHumanReadableTraceReader();
  217. Status ReadHeader(BlockCacheTraceHeader* header);
  218. Status ReadAccess(BlockCacheTraceRecord* record);
  219. private:
  220. std::ifstream human_readable_trace_reader_;
  221. };
  222. // A block cache tracer. It downsamples the accesses according to
  223. // trace_options and uses BlockCacheTraceWriter to write the access record to
  224. // the trace file.
  225. class BlockCacheTracer {
  226. public:
  227. BlockCacheTracer();
  228. ~BlockCacheTracer();
  229. // No copy and move.
  230. BlockCacheTracer(const BlockCacheTracer&) = delete;
  231. BlockCacheTracer& operator=(const BlockCacheTracer&) = delete;
  232. BlockCacheTracer(BlockCacheTracer&&) = delete;
  233. BlockCacheTracer& operator=(BlockCacheTracer&&) = delete;
  234. // Start writing block cache accesses to the trace_writer.
  235. Status StartTrace(Env* env, const TraceOptions& trace_options,
  236. std::unique_ptr<TraceWriter>&& trace_writer);
  237. // Stop writing block cache accesses to the trace_writer.
  238. void EndTrace();
  239. bool is_tracing_enabled() const {
  240. return writer_.load(std::memory_order_relaxed);
  241. }
  242. Status WriteBlockAccess(const BlockCacheTraceRecord& record,
  243. const Slice& block_key, const Slice& cf_name,
  244. const Slice& referenced_key);
  245. // GetId cycles from 1 to port::kMaxUint64.
  246. uint64_t NextGetId();
  247. private:
  248. TraceOptions trace_options_;
  249. // A mutex protects the writer_.
  250. InstrumentedMutex trace_writer_mutex_;
  251. std::atomic<BlockCacheTraceWriter*> writer_;
  252. std::atomic<uint64_t> get_id_counter_;
  253. };
  254. } // namespace ROCKSDB_NAMESPACE