persistent_cache_tier.h

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#pragma once

#include <atomic>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "monitoring/histogram.h"
#include "rocksdb/env.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/status.h"
#include "rocksdb/system_clock.h"
// Persistent Cache
//
// Persistent cache is a tiered key-value cache that can use a persistent
// medium. It is a generic design and can leverage any storage medium --
// disk/SSD/NVM/RAM. The code has been kept generic, but significant
// benchmark/design/development time has been spent to make sure the cache
// performs appropriately for the respective storage medium.
//
// The file defines
// PersistentCacheTier   : Implementation that handles an individual cache tier
// PersistentTieredCache : Implementation that handles all tiers as a logical
//                         unit
//
// PersistentTieredCache architecture:
// +--------------------------+ PersistentCacheTier that handles multiple tiers
// | +----------------+       |
// | |      RAM       |       | PersistentCacheTier that handles RAM
// | +----------------+       | (VolatileCacheImpl)
// |        | next            |
// |        v                 |
// | +----------------+       |
// | |      NVM       |       | PersistentCacheTier implementation that
// | +----------------+       | handles NVM (BlockCacheImpl)
// |        | next            |
// |        v                 |
// | +----------------+       |
// | |     LE-SSD     |       | PersistentCacheTier implementation that
// | +----------------+       | handles LE-SSD (BlockCacheImpl)
// |        |                 |
// |        v                 |
// |       null               |
// +--------------------------+
//              |
//              v
//             null
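//
// Example (illustrative sketch only): the tiers in the diagram are chained
// through PersistentCacheTier::set_next_tier(). A RAM tier backed by an NVM
// tier could be wired up roughly as follows, assuming `ram_tier` and
// `nvm_tier` are already-constructed PersistentCacheTier instances (their
// concrete factory functions are not part of this header):
//
//   std::shared_ptr<PersistentCacheTier> ram_tier = /* volatile tier */;
//   std::shared_ptr<PersistentCacheTier> nvm_tier = /* block cache tier */;
//   ram_tier->set_next_tier(nvm_tier);  // misses in RAM fall through to NVM
//   // nvm_tier's next tier stays null, terminating the chain.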
namespace ROCKSDB_NAMESPACE {

// Persistent Cache Config
//
// This struct captures all the options that are used to configure the
// persistent cache. Some of the terminology used in naming the options is
// explained below. (An illustrative construction example follows the struct
// definition.)
//
// dispatch size :
// This is the size in which IO is dispatched to the device
//
// write buffer size :
// This is the size of an individual write buffer. Write buffers are
// grouped to form a buffered file.
//
// cache size :
// This is the logical maximum for the cache size
//
// qdepth :
// This is the max number of IOs that can be issued to the device in parallel
//
// pipelining :
// The writer code path follows a pipelined architecture, which means the
// operations are handed off from one stage to another
//
// pipelining backlog size :
// With the pipelined architecture, there can always be a backlog of ops in
// the pipeline queues. This is the maximum backlog size after which ops are
// dropped from the queue
struct PersistentCacheConfig {
  explicit PersistentCacheConfig(
      Env* const _env, const std::string& _path, const uint64_t _cache_size,
      const std::shared_ptr<Logger>& _log,
      const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) {
    env = _env;
    clock = (env != nullptr) ? env->GetSystemClock().get()
                             : SystemClock::Default().get();
    path = _path;
    log = _log;
    cache_size = _cache_size;
    writer_dispatch_size = write_buffer_size = _write_buffer_size;
  }

  //
  // Validate the settings. Our intention is to catch erroneous settings ahead
  // of time instead of violating invariants or causing deadlocks.
  //
  Status ValidateSettings() const {
    // (1) check pre-conditions for variables
    if (!env || path.empty()) {
      return Status::InvalidArgument("empty or null args");
    }

    // (2) assert size related invariants
    // - cache size cannot be less than cache file size
    // - individual write buffer size must be less than cache file size
    // - total write buffer size cannot be less than 2X the cache file size
    if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
        write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
      return Status::InvalidArgument("invalid cache size");
    }

    // (3) check writer settings
    // - queue depth cannot be 0
    // - writer_dispatch_size cannot be greater than write_buffer_size
    // - dispatch size and buffer size need to be aligned
    if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
        write_buffer_size % writer_dispatch_size) {
      return Status::InvalidArgument("invalid writer settings");
    }

    return Status::OK();
  }
  //
  // Env abstraction to use for system level operations
  //
  Env* env;
  SystemClock* clock;

  //
  // Path for the block cache where blocks are persisted
  //
  std::string path;

  //
  // Log handle for logging messages
  //
  std::shared_ptr<Logger> log;

  //
  // Enable direct IO for reading
  //
  bool enable_direct_reads = true;

  //
  // Enable direct IO for writing
  //
  bool enable_direct_writes = false;

  //
  // Logical cache size
  //
  uint64_t cache_size = std::numeric_limits<uint64_t>::max();

  // cache-file-size
  //
  // The cache consists of multiple small files. This parameter defines the
  // size of an individual cache file
  //
  // default: 100MB
  uint32_t cache_file_size = 100ULL * 1024 * 1024;

  // writer-qdepth
  //
  // The writers can issue IO to the device in parallel. This parameter
  // controls the max number of IOs that can be issued in parallel to the
  // block device
  //
  // default: 1
  uint32_t writer_qdepth = 1;

  // pipeline-writes
  //
  // The writes optionally follow a pipelined architecture. This helps
  // avoid regression in the eviction code path of the primary tier. This
  // parameter defines if pipelining is enabled or disabled
  //
  // default: true
  bool pipeline_writes = true;

  // max-write-pipeline-backlog-size
  //
  // Max pipeline buffer size. This is the maximum backlog we can accumulate
  // while waiting for writes. After the limit, new ops will be dropped.
  //
  // Default: 1GiB
  uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;

  // write-buffer-size
  //
  // This is the size in which buffer slabs are allocated.
  //
  // Default: 1M
  uint32_t write_buffer_size = 1ULL * 1024 * 1024;

  // write-buffer-count
  //
  // This is the total number of buffer slabs. It is calculated as a factor of
  // the cache file size in order to avoid deadlock.
  size_t write_buffer_count() const {
    assert(write_buffer_size);
    return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
                               write_buffer_size);
  }
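  // For illustration (not normative): with the defaults above --
  // writer_qdepth = 1, cache_file_size = 100MB, write_buffer_size = 1MB --
  // this evaluates to (1 + 1.2) * 100MB / 1MB = 220 buffer slabs, i.e. 220MB
  // of buffer space, which satisfies the "total write buffer size >= 2X
  // cache file size" invariant enforced by ValidateSettings().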
  // writer-dispatch-size
  //
  // The writer thread will dispatch the IO at the specified IO size
  //
  // default: 1M
  uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;

  // is_compressed
  //
  // This option determines if the cache will run in compressed mode or
  // uncompressed mode
  bool is_compressed = true;

  PersistentCacheConfig MakePersistentCacheConfig(
      const std::string& path, const uint64_t size,
      const std::shared_ptr<Logger>& log);

  std::string ToString() const;
};
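// Example (illustrative sketch only; the path and sizes are made up): a
// config for a 4GB cache under "/tmp/cache" could be built and validated
// using only the constructor and ValidateSettings() declared above:
//
//   std::shared_ptr<Logger> log;  // optional info log
//   PersistentCacheConfig cfg(Env::Default(), "/tmp/cache",
//                             4ULL * 1024 * 1024 * 1024 /*4GB*/, log);
//   cfg.writer_qdepth = 2;        // allow two parallel IOs to the device
//   Status s = cfg.ValidateSettings();
//   assert(s.ok());  // defaults keep the size/alignment invariants intact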
// Persistent Cache Tier
//
// This is a logical abstraction that defines a tier of the persistent cache.
// Tiers can be stacked over one another. PersistentCache provides the basic
// definition for accessing/storing in the cache. PersistentCacheTier extends
// the interface to enable management and stacking of tiers.
class PersistentCacheTier : public PersistentCache {
 public:
  using Tier = std::shared_ptr<PersistentCacheTier>;

  virtual ~PersistentCacheTier() {}

  // Open the persistent cache tier
  virtual Status Open();

  // Close the persistent cache tier
  virtual Status Close();

  // Reserve space up to 'size' bytes
  virtual bool Reserve(const size_t size);

  // Erase a key from the cache
  virtual bool Erase(const Slice& key);

  // Print stats to string recursively
  virtual std::string PrintStats();

  PersistentCache::StatsType Stats() override;

  // Insert to page cache
  Status Insert(const Slice& page_key, const char* data,
                const size_t size) override = 0;

  // Lookup page cache by page identifier
  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
                size_t* size) override = 0;

  // Does it store compressed data?
  bool IsCompressed() override = 0;

  std::string GetPrintableOptions() const override = 0;

  uint64_t NewId() override;

  // Return a reference to the next tier
  virtual Tier& next_tier() { return next_tier_; }

  // Set the value for the next tier (can only be set while unset; see assert)
  virtual void set_next_tier(const Tier& tier) {
    assert(!next_tier_);
    next_tier_ = tier;
  }

  virtual void TEST_Flush() {
    if (next_tier_) {
      next_tier_->TEST_Flush();
    }
  }

 private:
  Tier next_tier_;  // next tier
  std::atomic<uint64_t> last_id_{1};
};
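// Example (illustrative sketch only; `tier`, the key, and `payload` are made
// up): basic access through the interface above, independent of which
// concrete tier implementation sits behind the pointer:
//
//   std::shared_ptr<PersistentCacheTier> tier = /* concrete tier */;
//   std::string payload = /* block contents */;
//   Status s = tier->Open();
//   s = tier->Insert("block-123", payload.data(), payload.size());
//   std::unique_ptr<char[]> data;
//   size_t size = 0;
//   s = tier->Lookup("block-123", &data, &size);  // a miss may be served by
//                                                 // the next tier in the stack
//   s = tier->Close();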
// PersistentTieredCache
//
// Abstraction that helps you construct the tiers of persistent caches as a
// unified cache. The tier(s) of the cache act as a single tier for ease of
// management and support the PersistentCache methods for accessing data.
class PersistentTieredCache : public PersistentCacheTier {
 public:
  virtual ~PersistentTieredCache();

  Status Open() override;
  Status Close() override;
  bool Erase(const Slice& key) override;
  std::string PrintStats() override;
  PersistentCache::StatsType Stats() override;
  Status Insert(const Slice& page_key, const char* data,
                const size_t size) override;
  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
                size_t* size) override;
  bool IsCompressed() override;

  std::string GetPrintableOptions() const override {
    return "PersistentTieredCache";
  }

  // Append a tier to the bottom of the stack
  void AddTier(const Tier& tier);

  // The next tier of the stack is the next tier of its bottom-most member
  Tier& next_tier() override {
    assert(!tiers_.empty());
    return tiers_.back()->next_tier();
  }

  void set_next_tier(const Tier& tier) override {
    assert(!tiers_.empty());
    tiers_.back()->set_next_tier(tier);
  }

  void TEST_Flush() override {
    assert(!tiers_.empty());
    tiers_.front()->TEST_Flush();
    PersistentCacheTier::TEST_Flush();
  }

 protected:
  std::list<Tier> tiers_;  // list of tiers top-down
};
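// Example (illustrative sketch only; the tier objects are assumed to exist):
// the same stacking as in the diagram at the top of this file, but managed
// through the tiered wrapper instead of manual set_next_tier() calls:
//
//   auto tiered = std::make_shared<PersistentTieredCache>();
//   tiered->AddTier(ram_tier);  // top tier (e.g. volatile/RAM)
//   tiered->AddTier(nvm_tier);  // lower tier (e.g. block cache on NVM/SSD)
//   Status s = tiered->Open();  // the stack opens/closes as a single unit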

}  // namespace ROCKSDB_NAMESPACE