cache_key.cc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #include "cache/cache_key.h"
  6. #include <algorithm>
  7. #include <atomic>
  8. #include "rocksdb/advanced_cache.h"
  9. #include "table/unique_id_impl.h"
  10. #include "util/hash.h"
  11. #include "util/math.h"
  12. namespace ROCKSDB_NAMESPACE {
  13. // Value space plan for CacheKey:
  14. //
  15. // file_num_etc64_ | offset_etc64_ | Only generated by
  16. // ---------------+---------------+------------------------------------------
  17. // 0 | 0 | Reserved for "empty" CacheKey()
  18. // 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime
  19. // 0 | >= 1<<63 | CreateUniqueForProcessLifetime
  20. // > 0 | any | OffsetableCacheKey.WithOffset
  21. CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
  22. // +1 so that we can reserve all zeros for "unset" cache key
  23. uint64_t id = cache->NewId() + 1;
  24. // Ensure we don't collide with CreateUniqueForProcessLifetime
  25. assert((id >> 63) == 0U);
  26. return CacheKey(0, id);
  27. }
  28. CacheKey CacheKey::CreateUniqueForProcessLifetime() {
  29. // To avoid colliding with CreateUniqueForCacheLifetime, assuming
  30. // Cache::NewId counts up from zero, here we count down from UINT64_MAX.
  31. // If this ever becomes a point of contention, we could sub-divide the
  32. // space and use CoreLocalArray.
  33. static std::atomic<uint64_t> counter{UINT64_MAX};
  34. uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
  35. // Ensure we don't collide with CreateUniqueForCacheLifetime
  36. assert((id >> 63) == 1U);
  37. return CacheKey(0, id);
  38. }
  39. // How we generate CacheKeys and base OffsetableCacheKey, assuming that
  40. // db_session_ids are generated from a base_session_id and
  41. // session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
  42. // in DBImpl::GenerateDbSessionId):
  43. //
  44. // Conceptual inputs:
  45. // db_id (unstructured, from GenerateRawUniqueId or equiv)
  46. // * could be shared between cloned DBs but rare
  47. // * could be constant, if session id suffices
  48. // base_session_id (unstructured, from GenerateRawUniqueId)
  49. // session_id_counter (structured)
  50. // * usually much smaller than 2**24
  51. // orig_file_number (structured)
  52. // * usually smaller than 2**24
  53. // offset_in_file (structured, might skip lots of values)
  54. // * usually smaller than 2**32
  55. //
  56. // Overall approach (see https://github.com/pdillinger/unique_id for
  57. // background):
  58. //
  59. // First, we have three "structured" values, up to 64 bits each, that we
  60. // need to fit, without losses, into 128 bits. In practice, the values will
  61. // be small enough that they should fit. For example, applications generating
  62. // large SST files (large offsets) will naturally produce fewer files (small
  63. // file numbers). But we don't know ahead of time what bounds the values will
  64. // have.
  65. //
  66. // Second, we have unstructured inputs that enable distinct RocksDB processes
  67. // to pick a random point in space, likely very different from others. Xoring
  68. // the structured with the unstructured give us a cache key that is
  69. // structurally distinct between related keys (e.g. same file or same RocksDB
  70. // process) and distinct with high probability between unrelated keys.
  71. //
  72. // The problem of packing three structured values into the space for two is
  73. // complicated by the fact that we want to derive cache keys from SST unique
  74. // IDs, which have already combined structured and unstructured inputs in a
  75. // practically inseparable way. And we want a base cache key that works
  76. // with an offset of any size. So basically, we need to encode these three
  77. // structured values, each up to 64 bits, into 128 bits without knowing any
  78. // of their sizes. The DownwardInvolution() function gives us a mechanism to
  79. // accomplish this. (See its properties in math.h.) Specifically, for inputs
  80. // a, b, and c:
  81. // lower64 = DownwardInvolution(a) ^ ReverseBits(b);
  82. // upper64 = c ^ ReverseBits(a);
  83. // The 128-bit output is unique assuming there exist some i, j, and k
  84. // where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
  85. // i + j + k <= 128. In other words, as long as there exist some bounds
  86. // that would allow us to pack the bits of a, b, and c into the output
  87. // if we know the bound, we can generate unique outputs without knowing
  88. // those bounds. To validate this claim, the inversion function (given
  89. // the bounds) has been implemented in CacheKeyDecoder in
  90. // db_block_cache_test.cc.
  91. //
  92. // With that in mind, the outputs in terms of the conceptual inputs look
  93. // like this, using bitwise-xor of the constituent pieces, low bits on left:
  94. //
  95. // |------------------------- file_num_etc64 -------------------------|
  96. // | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
  97. // |-----------------------------------------------------------------|
  98. // | session_id_counter (involution) ..... | |
  99. // |-----------------------------------------------------------------|
  100. // | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
  101. // | * base_session_id (upper ~39 bits) |
  102. // | * db_id (~122 bits entropy) |
  103. // |-----------------------------------------------------------------|
  104. // | | ..... orig_file_number (reversed) |
  105. // |-----------------------------------------------------------------|
  106. //
  107. //
  108. // |------------------------- offset_etc64 --------------------------|
  109. // | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
  110. // |-----------------------------------------------------------------|
  111. // | | ..... session_id_counter (reversed) |
  112. // |-----------------------------------------------------------------|
  113. // | offset_in_file ............... | |
  114. // |-----------------------------------------------------------------|
  115. //
  116. // Some oddities or inconveniences of this layout are due to deriving
  117. // the "base" cache key (without offset) from the SST unique ID (see
  118. // GetSstInternalUniqueId). Specifically,
  119. // * Lower 64 of base_session_id occurs in both output words (ok but
  120. // weird)
  121. // * The inclusion of db_id is bad for the conditions under which we
  122. // can guarantee uniqueness, but could be useful in some cases with
  123. // few small files per process, to make up for db session id only having
  124. // ~103 bits of entropy.
  125. //
  126. // In fact, if DB ids were not involved, we would be guaranteed unique
  127. // cache keys for files generated in a single process until total bits for
  128. // biggest session_id_counter, orig_file_number, and offset_in_file
  129. // reach 128 bits.
  130. //
  131. // With the DB id limitation, we only have nice guaranteed unique cache
  132. // keys for files generated in a single process until biggest
  133. // session_id_counter and offset_in_file reach combined 64 bits. This
  134. // is quite good in practice because we can have millions of DB Opens
  135. // with terabyte size SST files, or billions of DB Opens with gigabyte
  136. // size SST files.
  137. //
  138. // One of the considerations in the translation between existing SST unique
  139. // IDs and base cache keys is supporting better SST unique IDs in a future
  140. // format_version. If we use a process-wide file counter instead of
  141. // session counter and file numbers, we only need to combine two 64-bit values
  142. // instead of three. But we don't want to track unique ID versions in the
  143. // manifest, so we want to keep the same translation layer between SST unique
  144. // IDs and base cache keys, even with updated SST unique IDs. If the new
  145. // unique IDs put the file counter where the orig_file_number was, and
  146. // use no structured field where session_id_counter was, then our translation
  147. // layer works fine for two structured fields as well as three (for
  148. // compatibility). The small computation for the translation (one
  149. // DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
  150. // is negligible for computing as part of SST file reader open.
  151. //
  152. // More on how https://github.com/pdillinger/unique_id applies here:
  153. // Every bit of output always includes "unstructured" uniqueness bits and
  154. // often combines with "structured" uniqueness bits. The "unstructured" bits
  155. // change infrequently: only when we cannot guarantee our state tracking for
  156. // "structured" uniqueness hasn't been cloned. Using a static
  157. // SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
  158. // "all new" session id when a new process uses RocksDB. (Between processes,
  159. // we don't know if a DB or other persistent storage has been cloned. We
  160. // assume that if VM hot cloning is used, subsequently generated SST files
  161. // do not interact.) Within a process, only the session_lower of the
  162. // db_session_id changes incrementally ("structured" uniqueness).
  163. //
  164. // This basically means that our offsets, counters and file numbers allow us
  165. // to do somewhat "better than random" (birthday paradox) while in the
  166. // degenerate case of completely new session for each tiny file, we still
  167. // have strong uniqueness properties from the birthday paradox, with ~103
  168. // bit session IDs or up to 128 bits entropy with different DB IDs sharing a
  169. // cache.
  170. //
  171. // More collision probability analysis:
  172. // Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD)
  173. // with average process/session lifetime of (pessimistically) 4 minutes.
  174. // In 180 days (generous allowable data lifespan), we generate 31 million GB
  175. // of data, or 2^55 bytes, and 2^16 "all new" session IDs.
  176. //
  177. // First, suppose this is in a single DB (lifetime 180 days):
  178. // 128 bits cache key size
  179. // - 55 <- ideal size for byte offsets + file numbers
  180. // - 2 <- bits for offsets and file numbers not exactly powers of two
  181. // + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
  182. // ----
  183. // 73 <- bits remaining for distinguishing session IDs
  184. // The probability of a collision in 73 bits of session ID data is less than
  185. // 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
  186. // data from the last 180 days is in cache for potential collision, and that
  187. // cache keys under each session id exhaustively cover the remaining 57 bits
  188. // while in reality they'll only cover a small fraction of it.
  189. //
  190. // Although data could be transferred between hosts, each host has its own
  191. // cache and we are already assuming a high rate of "all new" session ids.
  192. // So this doesn't really change the collision calculation. Across a fleet
  193. // of 1 million, each with <1 in a trillion collision possibility,
  194. // fleetwide collision probability is <1 in a million.
  195. //
  196. // Now suppose we have many DBs per host, say 2**10, with same host-wide write
  197. // rate and process/session lifetime. File numbers will be ~10 bits smaller
  198. // and we will have 2**10 times as many session IDs because of simultaneous
  199. // lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
  200. // or roughly 1 in a billion.
  201. //
  202. // Suppose instead we generated random or hashed cache keys for each
  203. // (compressed) block. For 1KB compressed block size, that is 2^45 cache keys
  204. // in 180 days. Collision probability is more easily estimated at roughly
  205. // 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all
  206. // data from the last 180 days is in cache, but NOT the other assumption
  207. // for the 1 in a trillion estimate above).
  208. //
  209. //
  210. // Collision probability estimation through simulation:
  211. // A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
  212. // activity over many months, by making some pessimistic simplifying
  213. // assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
  214. // Here is some sample output with
  215. // `./cache_bench -stress_cache_key -sck_keep_bits=43`:
  216. //
  217. // Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
  218. // Multiply by 1.15292e+18 to correct for simulation losses (but still
  219. // assume whole file cached)
  220. //
  221. // These come from default settings of 2.5M files per day of 32 MB each, and
  222. // `-sck_keep_bits=43` means that to represent a single file, we are only
  223. // keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
  224. // contiguous keys (pessimistic), our simulation is about 2**(128-43-25) or
  225. // about 1 billion billion times more prone to collision than reality.
  226. //
  227. // More default assumptions, relatively pessimistic:
  228. // * 100 DBs in same process (doesn't matter much)
  229. // * Re-open DB in same process (new session ID related to old session ID) on
  230. // average every 100 files generated
  231. // * Restart process (all new session IDs unrelated to old) 24 times per day
  232. //
  233. // After enough data, we get a result at the end (-sck_keep_bits=43):
  234. //
  235. // (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between
  236. // (1.15292e+19 corrected)
  237. //
  238. // If we believe the (pessimistic) simulation and the mathematical
  239. // extrapolation, we would need to run a billion machines all for 11 billion
  240. // days to expect a cache key collision. To help verify that our extrapolation
  241. // ("corrected") is robust, we can make our simulation more precise by
  242. // increasing the "keep" bits, which takes more running time to get enough
  243. // collision data:
  244. //
  245. // (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between
  246. // (1.6213e+19 corrected)
  247. // (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between
  248. // (1.21057e+19 corrected)
  249. // (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between
  250. // (1.46997e+19 corrected)
  251. // (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between
  252. // (2.11849e+19 corrected)
  253. //
  254. // The extrapolated prediction seems to be within noise (sampling error).
  255. //
  256. // With the `-sck_randomize` option, we can see that typical workloads like
  257. // above have lower collision probability than "random" cache keys (note:
  258. // offsets still non-randomized) by a modest amount (roughly 2-3x less
  259. // collision prone than random), which should make us reasonably comfortable
  260. // even in "degenerate" cases (e.g. repeatedly launch a process to generate
  261. // one file with SstFileWriter):
  262. //
  263. // (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between
  264. // (4.7165e+18 corrected)
  265. //
  266. // We can see that with more frequent process restarts,
  267. // -sck_restarts_per_day=5000, which means more all-new session IDs, we get
  268. // closer to the "random" cache key performance:
  269. //
  270. // 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
  271. //
  272. // And with less frequent process restarts and re-opens,
  273. // -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
  274. // probability:
  275. //
  276. // 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
  277. //
  278. // Other tests have been run to validate other conditions behave as expected,
  279. // never behaving "worse than random" unless we start chopping off structured
  280. // data.
  281. //
  282. // Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
  283. // that only arise when a new process is started, the chance of any cache key
  284. // collisions in a giant fleet of machines is negligible. Especially when
  285. // processes live for hours or days, the chance of a cache key collision is
  286. // likely more plausibly due to bad hardware than to bad luck in random
  287. // session ID data. Software defects are surely more likely to cause corruption
  288. // than both of those.
  289. //
  290. // TODO: Nevertheless / regardless, an efficient way to detect (and thus
  291. // quantify) block cache corruptions, including collisions, should be added.
  292. OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
  293. const std::string &db_session_id,
  294. uint64_t file_number) {
  295. UniqueId64x2 internal_id;
  296. Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
  297. &internal_id, /*force=*/true);
  298. assert(s.ok());
  299. *this = FromInternalUniqueId(&internal_id);
  300. }
// Translates an SST internal unique id into a base OffsetableCacheKey.
// See the large comment above for the bit-level layout. Invertible by
// ToInternalUniqueId (under the assumption documented below), and
// guarantees file_num_etc64_ != 0 for any non-empty input, so that the
// file_num_etc64_ == 0 space stays reserved for CacheKey::CreateUnique*.
OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
  uint64_t session_lower = id.ptr[0];
  uint64_t file_num_etc = id.ptr[1];
#ifndef NDEBUG
  // Tracked only for the assertions below: the "empty" input must map to
  // the "empty" output, which is exempt from the non-zero guarantees.
  bool is_empty = session_lower == 0 && file_num_etc == 0;
#endif
  // Although DBImpl guarantees (in recent versions) that session_lower is not
  // zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
  // not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
  // However, if we are given an "empty" id as input, then we should produce
  // "empty" as output.
  // As a consequence, this function is only bijective assuming
  // id[0] == 0 only if id[1] == 0.
  if (session_lower == 0U) {
    session_lower = file_num_etc;
  }

  // See comments above for how DownwardInvolution and ReverseBits
  // make this function invertible under various assumptions.
  OffsetableCacheKey rv;
  rv.file_num_etc64_ =
      DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
  rv.offset_etc64_ = ReverseBits(session_lower);

  // Because of these transformations and needing to allow arbitrary
  // offset (thus, second 64 bits of cache key might be 0), we need to
  // make some correction to ensure the first 64 bits is not 0.
  // Fortunately, the transformation ensures the second 64 bits is not 0
  // for non-empty base key, so we can swap in the case one is 0 without
  // breaking bijectivity (assuming condition above).
  assert(is_empty || rv.offset_etc64_ > 0);
  if (rv.file_num_etc64_ == 0) {
    std::swap(rv.file_num_etc64_, rv.offset_etc64_);
  }
  assert(is_empty || rv.file_num_etc64_ > 0);
  return rv;
}
  336. // Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
  337. // offset_etc64 == 0)
  338. UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
  339. uint64_t a = file_num_etc64_;
  340. uint64_t b = offset_etc64_;
  341. if (b == 0) {
  342. std::swap(a, b);
  343. }
  344. UniqueId64x2 rv;
  345. rv[0] = ReverseBits(b);
  346. rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
  347. return rv;
  348. }
  349. } // namespace ROCKSDB_NAMESPACE