// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <climits>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>

#include "cache/cache_key.h"
#include "cache/sharded_cache.h"
#include "port/mmap.h"
#include "rocksdb/cache.h"
#include "util/atomic.h"
#include "util/bit_fields.h"
#include "util/math.h"

namespace ROCKSDB_NAMESPACE {

namespace clock_cache {

// Forward declaration of friend class.
template <class ClockCache>
class ClockCacheTest;
// HyperClockCache is an alternative to LRUCache specifically tailored for
// use as BlockBasedTableOptions::block_cache
//
// Benefits
// --------
// * Lock/wait free (no waits or spins) for efficiency under high concurrency
//   * Fixed version (estimated_entry_charge > 0) is fully lock/wait free
//   * Automatic version (estimated_entry_charge = 0) has rare waits among
//     certain insertion or erase operations that involve the same very small
//     set of entries.
// * Optimized for hot path reads. For concurrency control, most Lookup() and
//   essentially all Release() are a single atomic add operation.
// * Eviction on insertion is fully parallel.
// * Uses a generalized + aging variant of CLOCK eviction that might
//   outperform LRU in some cases. (For background, see
//   https://en.wikipedia.org/wiki/Page_replacement_algorithm)
//
// Costs
// -----
// * FixedHyperClockCache (estimated_entry_charge > 0) - Hash table is not
//   resizable (for lock-free efficiency) so capacity is not dynamically
//   changeable. Rely on an estimated average value (block) size for
//   space+time efficiency. (See estimated_entry_charge option details.)
//   EXPERIMENTAL - This limitation is fixed in AutoHyperClockCache, activated
//   with estimated_entry_charge == 0.
// * Insert usually does not (but might) overwrite a previous entry associated
//   with a cache key. This is OK for RocksDB uses of Cache, though it does
//   mess up our REDUNDANT block cache insertion statistics.
// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for
//   block cache (but not row cache or table cache).
// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough
//   transient LOW or BOTTOM priority items can evict HIGH priority entries
//   that are not referenced recently (or often) enough.
// * If pinned entries leave little or nothing eligible for eviction,
//   performance can degrade substantially, because of clock eviction eating
//   CPU looking for evictable entries and because Release does not
//   pro-actively delete unreferenced entries when the cache is over-full.
//   Specifically, this makes this implementation more susceptible to the
//   following combination:
//   * num_shard_bits is high (e.g. 6)
//   * capacity small (e.g. some MBs)
//   * some large individual entries (e.g. non-partitioned filters)
//   where individual entries occupy a large portion of their shard capacity.
//   This should be mostly mitigated by the implementation picking a lower
//   number of cache shards than LRUCache for a given capacity (when
//   num_shard_bits is not overridden; see calls to
//   GetDefaultCacheShardBits()).
// * With strict_capacity_limit=false, respecting the capacity limit is not as
//   aggressive as LRUCache. The limit might be transiently exceeded by a very
//   small number of entries even when not strictly necessary, and slower to
//   recover after pinning forces limit to be substantially exceeded. (Even
//   with strict_capacity_limit=true, RocksDB will nevertheless transiently
//   allocate memory before discovering it is over the block cache capacity,
//   so this should not be a detectable regression in respecting memory
//   limits, except on exceptionally small caches.)
// * In some cases, erased or duplicated entries might not be freed
//   immediately. They will eventually be freed by eviction from further
//   Inserts.
// * Internal metadata can overflow if the number of simultaneous references
//   to a cache handle reaches many millions.
//
// High-level eviction algorithm
// -----------------------------
// A score (or "countdown") is maintained for each entry, initially determined
// by priority. The score is incremented on each Lookup, up to a max of 3,
// though is easily returned to previous state if useful=false with Release.
// During CLOCK-style eviction iteration, entries with score > 0 are
// decremented if currently unreferenced and entries with score == 0 are
// evicted if currently unreferenced. Note that scoring might not be perfect
// because entries can be referenced transiently within the cache even when
// there are no outside references to the entry.
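//
// Illustrative sketch of the per-entry decision during a CLOCK sweep (not
// the actual implementation; `countdown` and `is_referenced` stand in for
// values decoded from the entry's metadata word):
//
//   if (is_referenced) {
//     // skip: referenced entries are not aged or evicted
//   } else if (countdown > 0) {
//     countdown -= 1;  // age the entry
//   } else {
//     Evict();  // unreferenced with countdown == 0
//   }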
//
// Cache sharding like LRUCache is used to reduce contention on usage+eviction
// state, though here the performance improvement from more shards is small,
// and (as noted above) potentially detrimental if shard capacity is too close
// to largest entry size. Here cache sharding mostly only affects cache update
// (Insert / Erase) performance, not read performance.
//
// Read efficiency (hot path)
// --------------------------
// Mostly to minimize the cost of accessing metadata blocks with
// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and
// Release. In terms of concurrency, at a minimum, these operations have
// to do reference counting (and Lookup has to compare full keys in a safe
// way). Can we fold in all the other metadata tracking *for free* with
// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume
// for the moment that Lookup succeeds on the first probe.)
//
// We have a clever way of encoding an entry's reference count and countdown
// clock so that Lookup and Release are each usually a single atomic addition.
// In a single metadata word we have both an "acquire" count, incremented by
// Lookup, and a "release" count, incremented by Release. If useful=false,
// Release can instead decrement the acquire count. Thus the current ref
// count is (acquires - releases), and the countdown clock is min(3, acquires).
// Note that only unreferenced entries (acquires == releases) are eligible
// for CLOCK manipulation and eviction. We tolerate use of more expensive
// compare_exchange operations for cache writes (insertions and erasures).
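//
// Illustrative sketch of that counter arithmetic (not the actual code; the
// increment constants correspond to ClockHandle::kAcquireIncrement and
// kReleaseIncrement defined below):
//
//   meta.fetch_add(kAcquireIncrement);  // Lookup: acquires += 1
//   meta.fetch_add(kReleaseIncrement);  // Release(useful=true): releases += 1
//   meta.fetch_sub(kAcquireIncrement);  // Release(useful=false): undo Lookup
//   ref_count = acquires - releases;
//   countdown = min(3, acquires);       // meaningful only when ref_count == 0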
//
// In a cache receiving many reads and little or no writes, it is possible
// for the acquire and release counters to overflow. Assuming the *current*
// refcount never reaches many millions, we only have to correct for
// overflow in both counters in Release, not in Lookup. The overflow check
// should be only 1-2 CPU cycles per Release because it is a predictable
// branch on a simple condition on data already in registers.
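//
// Illustrative sketch of that check (not the actual code; the threshold
// `kSomeHighThreshold` and the exact correction are schematic):
//
//   uint64_t old_meta = meta.fetch_add(kReleaseIncrement);
//   uint64_t releases = (old_meta >> kReleaseCounterShift) & kCounterMask;
//   if (releases > kSomeHighThreshold) {  // predictable, rarely-taken branch
//     // Rare slow path: subtract the same large amount from both counters,
//     // preserving (acquires - releases).
//   }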
//
// Slot states
// -----------
// We encode a state indicator into the same metadata word with the
// acquire and release counters. This allows bigger state transitions to
// be atomic. States:
//
// * Empty - slot is not in use and unowned. All other metadata and data is
//   in an undefined state.
// * Construction - slot is exclusively owned by one thread, the thread
//   successfully entering this state, for populating or freeing data
//   (de-construction, same state marker).
// * Shareable (group) - slot holds an entry with counted references for
//   pinning and reading, including
//   * Visible - slot holds an entry that can be returned by Lookup
//   * Invisible - slot holds an entry that is not visible to Lookup
//     (erased by user) but can be read by existing references, and ref count
//     changed by Ref and Release.
//
// A special case is "standalone" entries, which are heap-allocated handles
// not in the table. They are always Invisible and freed on zero refs.
//
// State transitions:
// Empty -> Construction (in Insert): The encoding of state enables Insert to
// perform an optimistic atomic bitwise-or to take ownership if a slot is
// empty, or otherwise make no state change.
//
// Construction -> Visible (in Insert): This can be a simple assignment to the
// metadata word because the current thread has exclusive ownership and other
// metadata is meaningless.
//
// Visible -> Invisible (in Erase): This can be a bitwise-and while holding
// a shared reference, which is safe because the change is idempotent (in case
// of parallel Erase). By the way, we never go Invisible -> Visible.
//
// Shareable -> Construction (in Evict part of Insert, in Erase, and in
// Release if Invisible): This is for starting to free/delete an
// unreferenced entry. We have to use compare_exchange to ensure we only make
// this transition when there are zero refs.
//
// Construction -> Empty (in same places): This is for completing free/delete
// of an entry. A "release" atomic store suffices, as we have exclusive
// ownership of the slot but have to ensure none of the data member reads are
// re-ordered after committing the state transition.
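//
// Illustrative sketch of two of those transitions (not the actual code; the
// constants named are ClockHandle members defined below):
//
//   // Empty -> Construction: optimistic bitwise-or that only takes ownership
//   // if the slot really was empty (state bits all zero beforehand).
//   uint64_t old_meta =
//       meta.fetch_or(uint64_t{kStateOccupiedBit} << kStateShift);
//
//   // Shareable -> Construction: compare_exchange so we only proceed if the
//   // observed word still shows a shareable state with zero refs
//   // (acquire count == release count).
//   bool took_ownership = meta.compare_exchange_strong(expected, desired);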
//
// Insert
// ------
// If Insert were to guarantee replacing an existing entry for a key, there
// would be complications for concurrency and efficiency. First, consider how
// many probes to get to an entry. To ensure Lookup never waits and
// availability of a key is uninterrupted, we would need to use a different
// slot for a new entry for the same key. This means it is most likely in a
// later probing position than the old version, which should soon be removed.
// (Also, an entry is too big to replace atomically, even if no current refs.)
//
// However, overwrite capability is not really needed by RocksDB. Also, we
// know from our "redundant" stats that overwrites are very rare for the block
// cache, so we should not spend much to make them effective.
//
// FixedHyperClockCache: Instead we Insert as soon as we find an empty slot in
// the probing sequence without seeing an existing (visible) entry for the
// same key. This way we only insert if we can improve the probing
// performance, and we don't need to probe beyond our insert position,
// assuming we are willing to let the previous entry for the same key die of
// old age (eventual eviction from not being used). We can reach a similar
// state with concurrent insertions, where one will pass over the other while
// it is "under construction." This temporary duplication is acceptable for
// RocksDB block cache because we know redundant insertion is rare.
// AutoHyperClockCache: Similar, except we only notice and return an existing
// match if it is found in the search for a suitable empty slot (starting with
// the same slot as the head pointer), not by following the existing chain of
// entries. Insertions are always made to the head of the chain.
//
// Another problem to solve is what to return to the caller when we find an
// existing entry whose probing position we cannot improve on, or when the
// table occupancy limit has been reached. If strict_capacity_limit=false,
// we must never fail Insert, and if a Handle* is provided, we have to return
// a usable Cache handle on success. The solution to this (typically rare)
// problem is "standalone" handles, which are usable by the caller but not
// actually available for Lookup in the Cache. Standalone handles are
// allocated independently on the heap and specially marked so that they are
// freed on the heap when their last reference is released.
//
// Usage on capacity
// -----------------
// Insert takes different approaches to usage tracking depending on
// strict_capacity_limit setting. If true, we enforce a kind of strong
// consistency where compare-exchange is used to ensure the usage number never
// exceeds its limit, and provide threads with an authoritative signal on how
// much "usage" they have taken ownership of. With strict_capacity_limit=false,
// we use a kind of "eventual consistency" where all threads Inserting to the
// same cache shard might race on reserving the same space, but the
// over-commitment will be worked out in later insertions. It is kind of a
// dance because we don't want threads racing each other too much on paying
// down the over-commitment (with eviction) either.
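//
// Illustrative sketch of the two charging modes (not the actual code; `usage`
// stands in for the shard's atomic usage counter):
//
//   // strict_capacity_limit=true: CAS so usage never exceeds capacity
//   size_t old_usage = usage.load();
//   do {
//     if (old_usage + total_charge > capacity) {
//       // evict first, or fail with Status::MemoryLimit()
//     }
//   } while (
//       !usage.compare_exchange_weak(old_usage, old_usage + total_charge));
//
//   // strict_capacity_limit=false: just add, then evict to pay down any
//   // over-commitment, possibly racing with other inserters
//   usage.fetch_add(total_charge);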
//
// Eviction
// --------
// A key part of Insert is evicting some entries currently unreferenced to
// make room for new entries. The high-level eviction algorithm is described
// above, but the details are also interesting. A key part is parallelizing
// eviction with a single CLOCK pointer. This works by each thread working on
// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK-
// updating or evicting the incremented-over slot(s). To reduce contention at
// the cost of possibly evicting too much, each thread increments the clock
// pointer by 4, so commits to updating at least 4 slots per batch. As
// described above, a CLOCK update will decrement the "countdown" of
// unreferenced entries, or evict unreferenced entries with zero countdown.
// Referenced entries are not updated, because we (presumably) don't want
// long-referenced entries to age while referenced. Note however that we
// cannot distinguish transiently referenced entries from cache user
// references, so some CLOCK updates might be somewhat arbitrarily skipped.
// This is OK as long as it is rare enough that eviction order is still
// pretty good.
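//
// Illustrative sketch of claiming a batch of slots for CLOCK update (not the
// actual code; `clock_pointer` stands in for the shard's atomic sweep
// pointer, and the batch size of 4 matches the description above):
//
//   uint64_t old_pointer = clock_pointer.fetch_add(4);
//   for (uint64_t i = old_pointer; i < old_pointer + 4; ++i) {
//     // CLOCK-update or evict the unreferenced entry in slot
//     // (i mod table size), using an opportunistic compare_exchange
//     // (no retry loop).
//   }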
//
// There is no synchronization on the completion of the CLOCK updates, so it
// is theoretically possible for another thread to cycle back around and have
// two threads racing on CLOCK updates to the same slot. Thus, we cannot rely
// on any implied exclusivity to make the updates or eviction more efficient.
// These updates use an opportunistic compare-exchange (no loop), where a
// racing thread might cause the update to be skipped without retry, but in
// such case the update is likely not needed because the most likely update
// to an entry is that it has become referenced. (TODO: test efficiency of
// avoiding compare-exchange loop)
//
// Release
// -------
// In the common case, Release is a simple atomic increment of the release
// counter. There is a simple overflow check that only does another atomic
// update in extremely rare cases, so costs almost nothing.
//
// If the Release specifies "not useful", we can instead decrement the
// acquire counter, which returns to the same CLOCK state as before Lookup
// or Ref.
//
// Adding a check for over-full cache on every release to zero-refs would
// likely be somewhat expensive, increasing read contention on cache shard
// metadata. Instead we are less aggressive about deleting entries right
// away in those cases.
//
// However Release tries to immediately delete entries reaching zero refs
// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already
// marked invisible. Both of these are checks on values already in CPU
// registers so do not increase cross-CPU contention when not applicable.
// When applicable, they use a compare-exchange loop to take exclusive
// ownership of the slot for freeing the entry. These are rare cases
// that should not usually affect performance.
//
// Erase
// -----
// Searches for an entry like Lookup but moves it to Invisible state if found.
// This state transition is with bit operations so is idempotent and safely
// done while only holding a shared "read" reference. Like Release, it makes
// a best effort to immediately release an Invisible entry that reaches zero
// refs, but there are some corner cases where it will only be freed by the
// clock eviction process.

// ----------------------------------------------------------------------- //

struct ClockHandleBasicData : public Cache::Handle {
  Cache::ObjectPtr value = nullptr;
  const Cache::CacheItemHelper* helper = nullptr;
  // A lossless, reversible hash of the fixed-size (16 byte) cache key. This
  // eliminates the need to store a hash separately.
  UniqueId64x2 hashed_key = kNullUniqueId64x2;
  size_t total_charge = 0;

  inline size_t GetTotalCharge() const { return total_charge; }

  // Calls deleter (if non-null) on cache key and value
  void FreeData(MemoryAllocator* allocator) const;

  // Required by concept HandleImpl
  const UniqueId64x2& GetHash() const { return hashed_key; }
};

struct ClockHandle : public ClockHandleBasicData {
  // Constants for handling the atomic `meta` word, which tracks most of the
  // state of the handle. The meta word looks like this:
  // low bits                                                       high bits
  // -----------------------------------------------------------------------
  // | acquire counter | release counter | hit bit | state marker |
  // -----------------------------------------------------------------------

  // For reading or updating counters in meta word.
  static constexpr uint8_t kCounterNumBits = 30;
  static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;

  static constexpr uint8_t kAcquireCounterShift = 0;
  static constexpr uint64_t kAcquireIncrement = uint64_t{1}
                                                << kAcquireCounterShift;
  static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
  static constexpr uint64_t kReleaseIncrement = uint64_t{1}
                                                << kReleaseCounterShift;

  // For setting the hit bit
  static constexpr uint8_t kHitBitShift = 2U * kCounterNumBits;
  static constexpr uint64_t kHitBitMask = uint64_t{1} << kHitBitShift;

  // For reading or updating the state marker in meta word
  static constexpr uint8_t kStateShift = kHitBitShift + 1;

  // Bits contribution to state marker.
  // Occupied means any state other than empty
  static constexpr uint8_t kStateOccupiedBit = 0b100;
  // Shareable means the entry is reference counted (visible or invisible)
  // (only set if also occupied)
  static constexpr uint8_t kStateShareableBit = 0b010;
  // Visible is only set if also shareable
  static constexpr uint8_t kStateVisibleBit = 0b001;

  // Complete state markers (not shifted into full word)
  static constexpr uint8_t kStateEmpty = 0b000;
  static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
  static constexpr uint8_t kStateInvisible =
      kStateOccupiedBit | kStateShareableBit;
  static constexpr uint8_t kStateVisible =
      kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
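
  // Illustrative decoding of a meta word using the constants above (a
  // sketch, not a helper that exists in the implementation):
  //
  //   uint64_t acquires = (meta >> kAcquireCounterShift) & kCounterMask;
  //   uint64_t releases = (meta >> kReleaseCounterShift) & kCounterMask;
  //   bool hit = (meta & kHitBitMask) != 0;
  //   uint8_t state = static_cast<uint8_t>(meta >> kStateShift);
  //   // The entry is referenced iff acquires != releases (modulo 2^30).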

  // Constants for initializing the countdown clock. (Countdown clock is only
  // in effect with zero refs, acquire counter == release counter, and in that
  // case the countdown clock == both of those counters.)
  static constexpr uint8_t kHighCountdown = 3;
  static constexpr uint8_t kLowCountdown = 2;
  static constexpr uint8_t kBottomCountdown = 1;
  // During clock update, treat any countdown clock value greater than this
  // value the same as this value.
  static constexpr uint8_t kMaxCountdown = kHighCountdown;
  // TODO: make these countdown values tuning parameters for eviction?

  // See above. Mutable for read reference counting.
  mutable AcqRelAtomic<uint64_t> meta{};
};  // struct ClockHandle

class BaseClockTable {
 public:
  struct BaseOpts {
    explicit BaseOpts(int _eviction_effort_cap)
        : eviction_effort_cap(_eviction_effort_cap) {}
    explicit BaseOpts(const HyperClockCacheOptions& opts)
        : BaseOpts(opts.eviction_effort_cap) {}
    int eviction_effort_cap;
  };

  BaseClockTable(size_t capacity, bool strict_capacity_limit,
                 int eviction_effort_cap,
                 CacheMetadataChargePolicy metadata_charge_policy,
                 MemoryAllocator* allocator,
                 const Cache::EvictionCallback* eviction_callback,
                 const uint32_t* hash_seed);

  template <class Table>
  typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
                                               bool allow_uncharged);

  template <class Table>
  Status Insert(const ClockHandleBasicData& proto,
                typename Table::HandleImpl** handle, Cache::Priority priority);

  void Ref(ClockHandle& handle);

  size_t GetOccupancy() const { return occupancy_.LoadRelaxed(); }

  size_t GetUsage() const { return usage_.LoadRelaxed(); }

  size_t GetStandaloneUsage() const { return standalone_usage_.LoadRelaxed(); }

  size_t GetCapacity() const { return capacity_.LoadRelaxed(); }

  void SetCapacity(size_t capacity) { capacity_.StoreRelaxed(capacity); }

  void SetStrictCapacityLimit(bool strict_capacity_limit) {
    if (strict_capacity_limit) {
      eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::SetTransform());
    } else {
      eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::ClearTransform());
    }
  }

  uint32_t GetHashSeed() const { return hash_seed_; }

  uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); }

  uint64_t GetEvictionEffortExceededCount() const {
    return eviction_effort_exceeded_count_.LoadRelaxed();
  }

  struct EvictionData {
    size_t freed_charge = 0;
    size_t freed_count = 0;
    size_t seen_pinned_count = 0;
  };

  void TrackAndReleaseEvictedEntry(ClockHandle* h);

  bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data) const;

#ifndef NDEBUG
  // Acquire N references
  void TEST_RefN(ClockHandle& handle, size_t n);
  // Helper for TEST_ReleaseN
  void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
#endif

 private:  // fns
  // Creates a "standalone" handle for returning from an Insert operation that
  // cannot be completed by actually inserting into the table.
  // Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
  template <class HandleImpl>
  HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);

  // Helper for updating `usage_` for new entry with given `total_charge`
  // and evicting if needed under strict_capacity_limit=true rules. This
  // means the operation might fail with Status::MemoryLimit. If
  // `need_evict_for_occupancy`, then eviction of at least one entry is
  // required, and the operation should fail if not possible.
  // NOTE: Otherwise, occupancy_ is not managed in this function
  template <class Table>
  Status ChargeUsageMaybeEvictStrict(size_t total_charge,
                                     bool need_evict_for_occupancy,
                                     typename Table::InsertState& state);

  // Helper for updating `usage_` for new entry with given `total_charge`
  // and evicting if needed under strict_capacity_limit=false rules. This
  // means that updating `usage_` always succeeds even if forced to exceed
  // capacity. If `need_evict_for_occupancy`, then eviction of at least one
  // entry is required, and the operation should return false if such eviction
  // is not possible. `usage_` is not updated in that case. Otherwise, returns
  // true, indicating success.
  // NOTE: occupancy_ is not managed in this function
  template <class Table>
  bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
                                      bool need_evict_for_occupancy,
                                      typename Table::InsertState& state);

 protected:  // data
  // We partition the following members into different cache lines
  // to avoid false sharing among Lookup, Release, Erase and Insert
  // operations in ClockCacheShard.

  // Clock algorithm sweep pointer.
  // (Relaxed: only needs to be consistent with itself.)
  RelaxedAtomic<uint64_t> clock_pointer_{};

  // Counter for number of times we yield to wait on another thread.
  // It is normal for this to occur rarely in normal operation.
  // (Relaxed: a simple stat counter.)
  RelaxedAtomic<uint64_t> yield_count_{};

  // Counter for number of times eviction effort cap is exceeded.
  // It is normal for this to occur rarely in normal operation.
  // (Relaxed: a simple stat counter.)
  RelaxedAtomic<uint64_t> eviction_effort_exceeded_count_{};

  // TODO: is this separation needed if we don't do background evictions?
  ALIGN_AS(CACHE_LINE_SIZE)
  // Number of elements in the table.
  AcqRelAtomic<size_t> occupancy_{};

  // Memory usage by entries tracked by the cache (including standalone)
  AcqRelAtomic<size_t> usage_{};

  // Part of usage by standalone entries (not in table)
  AcqRelAtomic<size_t> standalone_usage_{};

  // Maximum total charge of all elements stored in the table.
  // (Relaxed: eventual consistency/update is OK)
  RelaxedAtomic<size_t> capacity_;

  // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit
  // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc.
  struct EecAndScl : public BitFields<uint32_t, EecAndScl> {
    uint32_t GetEffectiveEvictionEffortCap() const {
      // Because setting strict_capacity_limit is supposed to imply infinite
      // cap on eviction effort, we can let the bit for strict_capacity_limit
      // in the upper-most bit position be used as part of the effective cap.
      return underlying;
    }
  };
  using EvictionEffortCap = UnsignedBitField<EecAndScl, 31, NoPrevBitField>;
  using StrictCapacityLimit = BoolBitField<EecAndScl, EvictionEffortCap>;
  // (Relaxed: eventual consistency/update is OK)
  RelaxedBitFieldsAtomic<EecAndScl> eec_and_scl_;

  ALIGN_AS(CACHE_LINE_SIZE)
  const CacheMetadataChargePolicy metadata_charge_policy_;

  // From Cache, for deleter
  MemoryAllocator* const allocator_;

  // A reference to Cache::eviction_callback_
  const Cache::EvictionCallback& eviction_callback_;

  // A reference to ShardedCacheBase::hash_seed_
  const uint32_t& hash_seed_;
};

// Hash table for cache entries with size determined at creation time.
// Uses open addressing and double hashing. Since entries cannot be moved,
// the "displacements" count ensures probing sequences find entries even when
// entries earlier in the probing sequence have been removed.
class FixedHyperClockTable : public BaseClockTable {
 public:
  // Target size to be exactly a common cache line size (see static_assert in
  // clock_cache.cc)
  struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
    // The number of elements that hash to this slot or a lower one, but wind
    // up in this slot or a higher one.
    // (Relaxed: within a Cache op, does not need consistency with entries
    // inserted/removed during that op. For example, a Lookup() that
    // happens-after an Insert() will see an appropriate displacements value
    // for the entry to be in a published state.)
    RelaxedAtomic<uint32_t> displacements{};

    // Whether this is a "standalone" handle that is independently allocated
    // with `new` (so must be deleted with `delete`).
    // TODO: ideally this would be packed into some other data field, such
    // as upper bits of total_charge, but that incurs a measurable performance
    // regression.
    bool standalone = false;

    inline bool IsStandalone() const { return standalone; }

    inline void SetStandalone() { standalone = true; }
  };  // struct HandleImpl

  struct Opts : public BaseOpts {
    explicit Opts(size_t _estimated_value_size, int _eviction_effort_cap)
        : BaseOpts(_eviction_effort_cap),
          estimated_value_size(_estimated_value_size) {}
    explicit Opts(const HyperClockCacheOptions& opts)
        : BaseOpts(opts.eviction_effort_cap) {
      assert(opts.estimated_entry_charge > 0);
      estimated_value_size = opts.estimated_entry_charge;
    }
    size_t estimated_value_size;
  };

  FixedHyperClockTable(size_t capacity, bool strict_capacity_limit,
                       CacheMetadataChargePolicy metadata_charge_policy,
                       MemoryAllocator* allocator,
                       const Cache::EvictionCallback* eviction_callback,
                       const uint32_t* hash_seed, const Opts& opts);
  ~FixedHyperClockTable();

  // For BaseClockTable::Insert
  struct InsertState {};

  void StartInsert(InsertState& state);

  // Returns true iff there is room for the proposed number of entries.
  bool GrowIfNeeded(size_t new_occupancy, InsertState& state);

  HandleImpl* DoInsert(const ClockHandleBasicData& proto,
                       uint64_t initial_countdown, bool take_ref,
                       InsertState& state);

  // Runs the clock eviction algorithm trying to reclaim at least
  // requested_charge. Returns how much is evicted, which could be less
  // if it appears impossible to evict the requested amount without blocking.
  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);

  HandleImpl* Lookup(const UniqueId64x2& hashed_key);

  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);

  void Erase(const UniqueId64x2& hashed_key);

  void EraseUnRefEntries();

  size_t GetTableSize() const { return size_t{1} << length_bits_; }

  size_t GetOccupancyLimit() const { return occupancy_limit_; }

  const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }

#ifndef NDEBUG
  size_t& TEST_MutableOccupancyLimit() {
    return const_cast<size_t&>(occupancy_limit_);
  }

  // Release N references
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif

  // The load factor p is a real number in (0, 1) such that at all
  // times at most a fraction p of all slots, without counting tombstones,
  // are occupied by elements. This means that the probability that a random
  // probe hits an occupied slot is at most p, and thus at most 1/p probes
  // are required on average. For example, p = 70% implies that between 1 and
  // 2 probes are needed on average (bear in mind that this reasoning doesn't
  // consider the effects of clustering over time, which should be negligible
  // with double hashing).
  // Because the size of the hash table is always rounded up to the next
  // power of 2, p is really an upper bound on the actual load factor---the
  // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
  // but bear in mind that slots only hold metadata, not actual values.
  // Since space cost is dominated by the values (the LSM blocks),
  // overprovisioning the table with metadata only increases the total cache
  // space usage by a tiny fraction.
  static constexpr double kLoadFactor = 0.7;

  // The user can exceed kLoadFactor if the sizes of the inserted values don't
  // match estimated_value_size, or in some rare cases with
  // strict_capacity_limit == false. To avoid degenerate performance, we set a
  // strict upper bound on the load factor.
  static constexpr double kStrictLoadFactor = 0.84;
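
  // Rough sizing example (a sketch; the actual CalcHashBits() calculation
  // also accounts for metadata charging, so real numbers differ somewhat):
  //
  //   capacity = 8 MiB, estimated_value_size = 8 KiB
  //   => ~1024 entries expected to fit
  //   => 1024 / kLoadFactor ~= 1463 slots wanted
  //   => rounded up to 2048 slots (length_bits_ = 11), giving an actual load
  //      factor of ~0.5 when entry sizes match the estimate.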

 private:  // functions
  // Returns x mod 2^{length_bits_}.
  inline size_t ModTableSize(uint64_t x) {
    return BitwiseAnd(x, length_bits_mask_);
  }

  // Returns the first slot in the probe sequence with a handle e such that
  // match_fn(e) is true. At every step, the function first tests whether
  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
  // whether the search should be aborted, and if so, FindSlot immediately
  // returns nullptr. For every handle e that is not a match and not aborted,
  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
  // slot will be the last probed because the next would cycle back to the
  // first slot probed. This function uses templates instead of std::function
  // to minimize the risk of heap-allocated closures being created.
  template <typename MatchFn, typename AbortFn, typename UpdateFn>
  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
                              const MatchFn& match_fn, const AbortFn& abort_fn,
                              const UpdateFn& update_fn);

  // Re-decrement all displacements in probe path starting from beginning
  // until (not including) the given handle
  inline void Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h);

  // Subtracts `total_charge` from `usage_` and 1 from `occupancy_`.
  // Ideally this comes after releasing the entry itself so that we
  // actually have the available occupancy/usage that is claimed.
  // However, that means total_charge has to be saved from the handle
  // before releasing it so that it can be provided to this function.
  inline void ReclaimEntryUsage(size_t total_charge);

  MemoryAllocator* GetAllocator() const { return allocator_; }

  // Returns the number of bits used to hash an element in the hash
  // table.
  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
                          CacheMetadataChargePolicy metadata_charge_policy);

 private:  // data
  // Number of hash bits used for table index.
  // The size of the table is 1 << length_bits_.
  const int length_bits_;

  // For faster computation of ModTableSize.
  const size_t length_bits_mask_;

  // Maximum number of elements the user can store in the table.
  const size_t occupancy_limit_;

  // Array of slots comprising the hash table.
  const std::unique_ptr<HandleImpl[]> array_;
};  // class FixedHyperClockTable

// Hash table for cache entries that resizes automatically based on occupancy.
// However, it depends on a contiguous memory region to grow into
// incrementally, using linear hashing, so uses an anonymous mmap so that
// only the used portion of the memory region is mapped to physical memory
// (part of RSS).
//
// This table implementation uses the same "low-level protocol" for managing
// the contents of an entry slot as FixedHyperClockTable does, captured in the
// ClockHandle struct. That provides most of the essential data safety, but
// AutoHyperClockTable is another "high-level protocol" for organizing entries
// into a hash table, with automatic resizing.
//
// This implementation is not fully wait-free but we can call it "essentially
// wait-free," and here's why. First, like FixedHyperClockCache, there is no
// locking nor other forms of waiting at the cache or shard level. Also like
// FixedHCC there is essentially an entry-level read-write lock implemented
// with atomics, but our relaxed atomicity/consistency guarantees (e.g.
// duplicate inserts are possible) mean we do not need to wait for entry
// locking. Lookups, non-erasing Releases, and non-evicting non-growing
// Inserts are all fully wait-free. Of course, these waits are not dependent
// on any external factors such as I/O.
//
// For operations that remove entries from a chain or grow the table by
// splitting a chain, there is a chain-level locking mechanism that we call a
// "rewrite" lock, and the only waits are for these locks. On average, each
// chain lock is relevant to < 2 entries. (The average would be less than
// one entry each, but we do not lock when there's no entry to remove or
// migrate.) And a given thread can only hold two such chain locks at a time,
// more typically just one. So in that sense alone, the waiting that does
// exist is very localized.
//
// If we look closer at the operations utilizing that locking mechanism, we
// can see why it's "essentially wait-free."
// * Grow operations to increase the size of the table: each operation splits
//   an existing chain into two, and chains for splitting are chosen in table
//   order. Grow operations are fully parallel except for the chain locking,
//   but for one Grow operation to wait on another, it has to be feeding into
//   the other, which means the table has doubled in size already from other
//   Grow operations without the original one finishing. So Grow operations
//   are very low latency (unlike LRUCache doubling the table size in one
//   operation) and very parallelizable. (We use some tricks to break up
//   dependencies in updating metadata on the usable size of the table.) And
//   obviously Grow operations are very rare after the initial population of
//   the table.
// * Evict operations (part of many Inserts): clock updates and evictions
//   sweep through the structure in table order, so like Grow operations,
//   parallel Evict can only wait on each other if an Evict has lingered
//   (slept) long enough that the clock pointer has wrapped around the entire
//   structure.
// * Random erasures (Erase, Release with erase_if_last_ref, etc.): these
//   operations are rare and not really considered performance critical.
//   Currently they're mostly used for removing placeholder cache entries,
//   e.g. for memory tracking, though that could use standalone entries
//   instead to avoid potential contention in table operations. It's possible
//   that future enhancements could pro-actively remove cache entries from
//   obsolete files, but that's not yet implemented.
class AutoHyperClockTable : public BaseClockTable {
 public:
  // Target size to be exactly a common cache line size (see static_assert in
  // clock_cache.cc)
  struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
    // To organize AutoHyperClockTable entries into a hash table while
    // allowing the table size to grow without existing entries being moved,
    // a version of chaining is used. Rather than being heap allocated (and
    // incurring overheads to ensure memory safety) entries must go into
    // Handles ("slots") in the pre-allocated array. To improve CPU cache
    // locality, the chain head pointers are interleaved with the entries;
    // specifically, a Handle contains
    // * A head pointer for a chain of entries with this "home" location.
    // * A ClockHandle, for an entry that may or may not be in the chain
    //   starting from that head (but for performance ideally is on that
    //   chain).
    // * A next pointer for the continuation of the chain containing this
    //   entry.
    //
    // The pointers are not raw pointers, but are indices into the array,
    // and are decorated in two ways to help detect and recover from
    // relevant concurrent modifications during Lookup, so that Lookup is
    // fully wait-free:
    // * Each "with_shift" pointer contains a shift count that indicates
    //   how many hash bits were used in choosing the home address for the
    //   chain--specifically the next entry in the chain.
    // * The end of a chain is given a special "end" marker and refers back
    //   to the head of the chain.
    //
    // Why do we need shift on each pointer? To make Lookup wait-free, we
    // need to be able to query a chain without missing anything, and
    // preferably avoid synchronously double-checking the length_info.
    // Without the shifts, there is a risk that we start down a chain and
    // while paused on an entry that goes to a new home, we then follow the
    // rest of the partially-migrated chain to see the shared ending with the
    // old home, but for a time were following the chain for the new home,
    // missing some entries for the old home.
    //
    // Why do we need the end of the chain to loop back? If Lookup pauses
    // at an "under construction" entry, and sees that "next" is null after
    // waking up, we need something to tell whether the "under construction"
    // entry was freed and reused for another chain. Otherwise, we could
    // miss entries still on the original chain in the presence of a
    // concurrent modification. Until an entry is fully erased from a chain,
    // it is normal to see "under construction" entries on the chain, and it
    // is not safe to read their hashed key without either a read reference
    // on the entry or a rewrite lock on the chain.

    // Marker in a "with_shift" head pointer for some thread owning writes
    // to the chain structure (except for inserts), but only if not an
    // "end" pointer. Also called the "rewrite lock."
    static constexpr uint64_t kHeadLocked = uint64_t{1} << 7;
    // Marker in a "with_shift" pointer for the end of a chain. Must also
    // point back to the head of the chain (with end marker removed).
    // Also includes the "locked" bit so that attempting to lock an empty
    // chain has no effect (not needed, as the lock is only needed for
    // removals).
    static constexpr uint64_t kNextEndFlags = (uint64_t{1} << 6) | kHeadLocked;

    static inline bool IsEnd(uint64_t next_with_shift) {
      // Assuming certain values never used, suffices to check this one bit
      constexpr auto kCheckBit = kNextEndFlags ^ kHeadLocked;
      return next_with_shift & kCheckBit;
    }

    // Bottom bits to right shift away to get an array index from a
    // "with_shift" pointer.
    static constexpr int kNextShift = 8;

    // A bit mask for the "shift" associated with each "with_shift" pointer.
    // Always bottommost bits.
    static constexpr int kShiftMask = 63;

    // A marker for head_next_with_shift that indicates this HandleImpl is
    // heap allocated (standalone) rather than in the table.
    static constexpr uint64_t kStandaloneMarker = UINT64_MAX;

    // A marker for head_next_with_shift indicating the head is not yet part
    // of the usable table, or for chain_next_with_shift indicating that the
    // entry is not present or is not yet part of a chain (must not be
    // "shareable" state).
    static constexpr uint64_t kUnusedMarker = 0;
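
    // Illustrative decoding of a "with_shift" pointer using the constants
    // above (a sketch, not a helper that exists in the implementation):
    //
    //   size_t index = static_cast<size_t>(next_with_shift >> kNextShift);
    //   int shift = static_cast<int>(next_with_shift) & kShiftMask;
    //   bool is_end = IsEnd(next_with_shift);
    //   bool is_rewrite_locked = !is_end && (next_with_shift & kHeadLocked);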

    // See above. The head pointer is logically independent of the rest of
    // the entry, including the chain next pointer.
    AcqRelAtomic<uint64_t> head_next_with_shift{kUnusedMarker};
    AcqRelAtomic<uint64_t> chain_next_with_shift{kUnusedMarker};

    // For supporting CreateStandalone and some fallback cases.
    inline bool IsStandalone() const {
      return head_next_with_shift.Load() == kStandaloneMarker;
    }

    inline void SetStandalone() {
      head_next_with_shift.Store(kStandaloneMarker);
    }
  };  // struct HandleImpl

  struct Opts : public BaseOpts {
    explicit Opts(size_t _min_avg_value_size, int _eviction_effort_cap)
        : BaseOpts(_eviction_effort_cap),
          min_avg_value_size(_min_avg_value_size) {}
    explicit Opts(const HyperClockCacheOptions& opts)
        : BaseOpts(opts.eviction_effort_cap) {
      assert(opts.estimated_entry_charge == 0);
      min_avg_value_size = opts.min_avg_entry_charge;
    }
    size_t min_avg_value_size;
  };

  AutoHyperClockTable(size_t capacity, bool strict_capacity_limit,
                      CacheMetadataChargePolicy metadata_charge_policy,
                      MemoryAllocator* allocator,
                      const Cache::EvictionCallback* eviction_callback,
                      const uint32_t* hash_seed, const Opts& opts);
  ~AutoHyperClockTable();

  // For BaseClockTable::Insert
  struct InsertState {
    uint64_t saved_length_info = 0;
    size_t likely_empty_slot = 0;
  };

  void StartInsert(InsertState& state);

  // Does initial check for whether there's hash table room for another
  // inserted entry, possibly growing if needed. Returns true iff (after
  // the call) there is room for the proposed number of entries.
  bool GrowIfNeeded(size_t new_occupancy, InsertState& state);

  HandleImpl* DoInsert(const ClockHandleBasicData& proto,
                       uint64_t initial_countdown, bool take_ref,
                       InsertState& state);

  // Runs the clock eviction algorithm trying to reclaim at least
  // requested_charge. Returns how much is evicted, which could be less
  // if it appears impossible to evict the requested amount without blocking.
  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);

  HandleImpl* Lookup(const UniqueId64x2& hashed_key);

  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);

  void Erase(const UniqueId64x2& hashed_key);

  void EraseUnRefEntries();

  size_t GetTableSize() const;

  size_t GetOccupancyLimit() const;

  const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }

#ifndef NDEBUG
  size_t& TEST_MutableOccupancyLimit() {
    return *reinterpret_cast<size_t*>(&occupancy_limit_);
  }

  // Release N references
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif

  // Maximum ratio of number of occupied slots to number of usable slots. The
  // actual load factor should float pretty close to this number, which should
  // be a nice space/time trade-off, though large swings in WriteBufferManager
  // memory could lead to low (but very much safe) load factors (only after
  // seeing high load factors). Linear hashing along with (modified) linear
  // probing to find an available slot increases the potential risk of high
  // load factors, so high load factors are disallowed.
  static constexpr double kMaxLoadFactor = 0.60;

 private:  // functions
  // Returns true iff increased usable length. Due to load factor
  // considerations, GrowIfNeeded might call this more than once to make room
  // for one more entry.
  bool Grow(InsertState& state);

  // Operational details of splitting a chain into two for Grow().
  void SplitForGrow(size_t grow_home, size_t old_home, int old_shift);

  // Takes an "under construction" entry and ensures it is no longer connected
  // to its home chain (in preparation for completing erasure and freeing the
  // slot). Note that previous operations might have already noticed it being
  // "under (de)construction" and removed it from its chain.
  void Remove(HandleImpl* h);

  // Try to take ownership of an entry and erase+remove it from the table.
  // Returns true if successful. Could fail if
  // * There are other references to the entry
  // * Some other thread has exclusive ownership or has freed it.
  bool TryEraseHandle(HandleImpl* h, bool holding_ref, bool mark_invisible);

  // Calculates the appropriate maximum table size, for creating the memory
  // mapping.
  static size_t CalcMaxUsableLength(
      size_t capacity, size_t min_avg_value_size,
      CacheMetadataChargePolicy metadata_charge_policy);

  // Shared helper function that implements removing entries from a chain
  // with proper handling to ensure all existing data is seen even in the
  // presence of concurrent insertions, etc. (See implementation.)
  template <class OpData>
  void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX,
                 EvictionData* data = nullptr);

  // An RAII wrapper for locking a chain of entries for removals. See
  // implementation.
  class ChainRewriteLock;

  // Helper function for PurgeImpl while holding a ChainRewriteLock. See
  // implementation.
  template <class OpData>
  void PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock,
                       size_t home, EvictionData* data);

  // Update length_info_ as much as possible without waiting, given a known
  // usable (ready for inserts and lookups) grow_home. (Previous grow_homes
  // might not be usable yet, but we can check if they are by looking at
  // the corresponding old home.)
  void CatchUpLengthInfoNoWait(size_t known_usable_grow_home);

 private:  // data
  // mmaped area holding handles
  const TypedMemMapping<HandleImpl> array_;

  // Metadata for table size under linear hashing.
  //
  // Lowest 8 bits are the minimum number of lowest hash bits to use
  // ("min shift"). The upper 56 bits are a threshold. If that minimum number
  // of bits taken from a hash value is < this threshold, then one more bit of
  // hash value is taken and used.
  //
  // Other mechanisms (shift amounts on pointers) ensure complete availability
  // of data already in the table even if a reader only sees a completely
  // out-of-date version of this value. In the worst case, it could take
  // log time to find the correct chain, but normally this value enables
  // readers to find the correct chain on the first try.
  //
  // To maximize parallelization of Grow() operations, this field is only
  // updated opportunistically after Grow() operations and in DoInsert() where
  // it is found to be out-of-date. See CatchUpLengthInfoNoWait().
  AcqRelAtomic<uint64_t> length_info_;
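
  // Illustrative sketch of computing an entry's home index from a snapshot of
  // length_info_ (a sketch, not the actual code; `hash` stands in for bits
  // taken from the hashed key):
  //
  //   int min_shift = static_cast<int>(length_info & 255);
  //   uint64_t threshold = length_info >> 8;
  //   size_t home = hash & ((size_t{1} << min_shift) - 1);
  //   if (home < threshold) {
  //     // One more hash bit is already in use for this part of the table.
  //     home = hash & ((size_t{1} << (min_shift + 1)) - 1);
  //   }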

  // An already-computed version of the usable length times the max load
  // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle
  // that internally.
  // (Relaxed: allowed to lag behind length_info_ by a little)
  RelaxedAtomic<size_t> occupancy_limit_;

  // The next index to use from array_ upon the next Grow(). Might be ahead of
  // length_info_.
  // (Relaxed: self-contained source of truth for next grow home)
  RelaxedAtomic<size_t> grow_frontier_;

  // See explanation in AutoHyperClockTable::Evict
  // (Relaxed: allowed to lag behind clock_pointer_ and length_info_ state)
  RelaxedAtomic<size_t> clock_pointer_mask_;
};  // class AutoHyperClockTable

// A single shard of sharded cache.
template <class TableT>
class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
 public:
  using Table = TableT;
  ClockCacheShard(size_t capacity, bool strict_capacity_limit,
                  CacheMetadataChargePolicy metadata_charge_policy,
                  MemoryAllocator* allocator,
                  const Cache::EvictionCallback* eviction_callback,
                  const uint32_t* hash_seed, const typename Table::Opts& opts);

  // For CacheShard concept
  using HandleImpl = typename Table::HandleImpl;
  // Hash is lossless hash of 128-bit key
  using HashVal = UniqueId64x2;
  using HashCref = const HashVal&;
  static inline uint32_t HashPieceForSharding(HashCref hash) {
    return Upper32of64(hash[0]);
  }
  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
    assert(key.size() == kCacheKeySize);
    HashVal in;
    HashVal out;
    // NOTE: endian dependence
    // TODO: use GetUnaligned?
    std::memcpy(&in, key.data(), kCacheKeySize);
    BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]);
    return out;
  }

  // For reconstructing key from hashed_key. Requires the caller to provide
  // backing storage for the Slice in `unhashed`
  static inline Slice ReverseHash(const UniqueId64x2& hashed,
                                  UniqueId64x2* unhashed, uint32_t seed) {
    BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
    (*unhashed)[0] ^= seed;
    // NOTE: endian dependence
    return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
  }

  // Although capacity is dynamically changeable, the number of table slots is
  // not, so growing capacity substantially could lead to hitting occupancy
  // limit.
  void SetCapacity(size_t capacity);

  void SetStrictCapacityLimit(bool strict_capacity_limit);

  Status Insert(const Slice& key, const UniqueId64x2& hashed_key,
                Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
                size_t charge, HandleImpl** handle, Cache::Priority priority);

  HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key,
                               Cache::ObjectPtr obj,
                               const Cache::CacheItemHelper* helper,
                               size_t charge, bool allow_uncharged);

  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);

  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);

  bool Release(HandleImpl* handle, bool erase_if_last_ref = false);

  bool Ref(HandleImpl* handle);

  void Erase(const Slice& key, const UniqueId64x2& hashed_key);

  size_t GetCapacity() const;

  size_t GetUsage() const;

  size_t GetStandaloneUsage() const;

  size_t GetPinnedUsage() const;

  size_t GetOccupancyCount() const;

  size_t GetOccupancyLimit() const;

  size_t GetTableAddressCount() const;

  void ApplyToSomeEntries(
      const std::function<void(const Slice& key, Cache::ObjectPtr obj,
                               size_t charge,
                               const Cache::CacheItemHelper* helper)>& callback,
      size_t average_entries_per_lock, size_t* state);

  void EraseUnRefEntries();

  std::string GetPrintableOptions() const { return std::string{}; }

  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
                     const Cache::CacheItemHelper* /*helper*/,
                     Cache::CreateContext* /*create_context*/,
                     Cache::Priority /*priority*/, Statistics* /*stats*/) {
    return Lookup(key, hashed_key);
  }

  Table& GetTable() { return table_; }
  const Table& GetTable() const { return table_; }

#ifndef NDEBUG
  size_t& TEST_MutableOccupancyLimit() {
    return table_.TEST_MutableOccupancyLimit();
  }
  // Acquire/release N references
  void TEST_RefN(HandleImpl* handle, size_t n);
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif

 private:  // data
  Table table_;
};  // class ClockCacheShard

template <class Table>
class BaseHyperClockCache : public ShardedCache<ClockCacheShard<Table>> {
 public:
  using Shard = ClockCacheShard<Table>;
  using Handle = Cache::Handle;
  using CacheItemHelper = Cache::CacheItemHelper;

  explicit BaseHyperClockCache(const HyperClockCacheOptions& opts);

  Cache::ObjectPtr Value(Handle* handle) override;

  size_t GetCharge(Handle* handle) const override;

  const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;

  void ApplyToHandle(
      Cache* cache, Handle* handle,
      const std::function<void(const Slice& key, Cache::ObjectPtr obj,
                               size_t charge, const CacheItemHelper* helper)>&
          callback) override;

  void ReportProblems(
      const std::shared_ptr<Logger>& /*info_log*/) const override;
};

class FixedHyperClockCache
#ifdef NDEBUG
    final
#endif
    : public BaseHyperClockCache<FixedHyperClockTable> {
 public:
  using BaseHyperClockCache::BaseHyperClockCache;

  const char* Name() const override { return "FixedHyperClockCache"; }

  void ReportProblems(
      const std::shared_ptr<Logger>& /*info_log*/) const override;
};  // class FixedHyperClockCache

class AutoHyperClockCache
#ifdef NDEBUG
    final
#endif
    : public BaseHyperClockCache<AutoHyperClockTable> {
 public:
  using BaseHyperClockCache::BaseHyperClockCache;

  const char* Name() const override { return "AutoHyperClockCache"; }

  void ReportProblems(
      const std::shared_ptr<Logger>& /*info_log*/) const override;
};  // class AutoHyperClockCache

}  // namespace clock_cache

}  // namespace ROCKSDB_NAMESPACE