// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#ifdef GFLAGS
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <deque>
#include <limits>
#include <memory>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
  13. #include "cache/cache_key.h"
  14. #include "cache/sharded_cache.h"
  15. #include "db/db_impl/db_impl.h"
  16. #include "monitoring/histogram.h"
  17. #include "port/port.h"
  18. #include "port/stack_trace.h"
  19. #include "rocksdb/advanced_cache.h"
  20. #include "rocksdb/convenience.h"
  21. #include "rocksdb/db.h"
  22. #include "rocksdb/env.h"
  23. #include "rocksdb/secondary_cache.h"
  24. #include "rocksdb/system_clock.h"
  25. #include "rocksdb/table_properties.h"
  26. #include "table/block_based/block_based_table_reader.h"
  27. #include "table/block_based/cachable_entry.h"
  28. #include "util/coding.h"
  29. #include "util/distributed_mutex.h"
  30. #include "util/gflags_compat.h"
  31. #include "util/hash.h"
  32. #include "util/mutexlock.h"
  33. #include "util/random.h"
  34. #include "util/stderr_logger.h"
  35. #include "util/stop_watch.h"
  36. #include "util/string_util.h"
  37. using GFLAGS_NAMESPACE::ParseCommandLineFlags;
  38. static constexpr uint32_t KiB = uint32_t{1} << 10;
  39. static constexpr uint32_t MiB = KiB << 10;
  40. static constexpr uint64_t GiB = MiB << 10;
  41. DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
  42. DEFINE_uint64(cache_size, 1 * GiB,
  43. "Number of bytes to use as a cache of uncompressed data.");
  44. DEFINE_int32(num_shard_bits, -1,
  45. "ShardedCacheOptions::shard_bits. Default = auto");
  46. DEFINE_int32(
  47. eviction_effort_cap,
  48. ROCKSDB_NAMESPACE::HyperClockCacheOptions(1, 1).eviction_effort_cap,
  49. "HyperClockCacheOptions::eviction_effort_cap");
  50. DEFINE_double(resident_ratio, 0.25,
  51. "Ratio of keys fitting in cache to keyspace.");
  52. DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
  53. DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
  54. DEFINE_uint32(value_bytes_estimate, 0,
  55. "If > 0, overrides estimated_entry_charge or "
  56. "min_avg_entry_charge depending on cache_type.");
  57. DEFINE_double(compressible_to_ratio, 0.5,
  58. "Approximate size ratio that values can be compressed to.");
  59. DEFINE_int32(
  60. degenerate_hash_bits, 0,
  61. "With HCC, fix this many hash bits to increase table hash collisions");
  62. DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
  63. DEFINE_bool(populate_cache, true, "Populate cache before operations");
  64. DEFINE_double(pinned_ratio, 0.25,
  65. "Keep roughly this portion of entries pinned in cache.");
  66. DEFINE_double(
  67. vary_capacity_ratio, 0.0,
  68. "If greater than 0.0, will periodically vary the capacity between this "
  69. "ratio less than full size and full size. If vary_capacity_ratio + "
  70. "pinned_ratio is close to or exceeds 1.0, the cache might thrash.");
  71. DEFINE_uint32(lookup_insert_percent, 82,
  72. "Ratio of lookup (+ insert on not found) to total workload "
  73. "(expressed as a percentage)");
  74. DEFINE_uint32(insert_percent, 2,
  75. "Ratio of insert to total workload (expressed as a percentage)");
  76. DEFINE_uint32(blind_insert_percent, 5,
  77. "Ratio of insert without keeping handle to total workload "
  78. "(expressed as a percentage)");
  79. DEFINE_uint32(lookup_percent, 10,
  80. "Ratio of lookup to total workload (expressed as a percentage)");
  81. DEFINE_uint32(erase_percent, 1,
  82. "Ratio of erase to total workload (expressed as a percentage)");
  83. DEFINE_bool(gather_stats, false,
  84. "Whether to periodically simulate gathering block cache stats, "
  85. "using one more thread.");
  86. DEFINE_uint32(
  87. gather_stats_sleep_ms, 1000,
  88. "How many milliseconds to sleep between each gathering of stats.");
  89. DEFINE_uint32(gather_stats_entries_per_lock, 256,
  90. "For Cache::ApplyToAllEntries");
  91. DEFINE_uint32(usleep, 0, "Sleep up to this many microseconds after each op.");
  92. DEFINE_bool(lean, false,
  93. "If true, no additional computation is performed besides cache "
  94. "operations.");
DEFINE_bool(early_exit, false,
            "Exit before deallocating most memory. Good for malloc stats, "
            "e.g. MALLOC_CONF=\"stats_print:true\"");
DEFINE_bool(histograms, true,
            "Whether to track and print histogram statistics.");
DEFINE_bool(report_problems, true, "Whether to ReportProblems() at the end.");
DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
DEFINE_string(secondary_cache_uri, "",
              "Full URI for creating a custom secondary cache object");
DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
DEFINE_bool(use_jemalloc_no_dump_allocator, false,
            "Whether to use JemallocNoDumpAllocator");
DEFINE_uint32(jemalloc_no_dump_allocator_num_arenas,
              ROCKSDB_NAMESPACE::JemallocAllocatorOptions().num_arenas,
              "JemallocNodumpAllocator::num_arenas");
DEFINE_bool(jemalloc_no_dump_allocator_limit_tcache_size,
            ROCKSDB_NAMESPACE::JemallocAllocatorOptions().limit_tcache_size,
            "JemallocNodumpAllocator::limit_tcache_size");

// ## BEGIN stress_cache_key sub-tool options ##
// See class StressCacheKey below.
DEFINE_bool(stress_cache_key, false,
            "If true, run cache key stress test instead");
DEFINE_uint32(
    sck_files_per_day, 2500000,
    "(-stress_cache_key) Simulated files generated per simulated day");
// NOTE: Giving each run a specified lifetime, rather than e.g. "until
// first collision" ensures equal skew from start-up, when collisions are
// less likely.
DEFINE_uint32(sck_days_per_run, 90,
              "(-stress_cache_key) Number of days to simulate in each run");
// NOTE: The number of observed collisions directly affects the relative
// accuracy of the predicted probabilities. 15 observations should be well
// within factor-of-2 accuracy.
DEFINE_uint32(
    sck_min_collision, 15,
    "(-stress_cache_key) Keep running until this many collisions seen");
// sck_file_size_mb can be thought of as average file size. The simulation is
// not precise enough to care about the distribution of file sizes; other
// simulations (https://github.com/pdillinger/unique_id/tree/main/monte_carlo)
// indicate the distribution only makes a small difference (e.g. < 2x factor)
DEFINE_uint32(
    sck_file_size_mb, 32,
    "(-stress_cache_key) Simulated file size in MiB, for accounting purposes");
DEFINE_uint32(sck_reopen_nfiles, 100,
              "(-stress_cache_key) Simulate DB re-open average every n files");
DEFINE_uint32(sck_newdb_nreopen, 1000,
              "(-stress_cache_key) Simulate new DB average every n re-opens");
DEFINE_uint32(sck_restarts_per_day, 24,
              "(-stress_cache_key) Average simulated process restarts per day "
              "(across DBs)");
DEFINE_uint32(
    sck_db_count, 100,
    "(-stress_cache_key) Parallel DBs in simulation sharing a block cache");
DEFINE_uint32(
    sck_table_bits, 20,
    "(-stress_cache_key) Log2 number of tracked (live) files (across DBs)");
// sck_keep_bits being well below full 128 bits amplifies the collision
// probability so that the true probability can be estimated through observed
// collisions. (More explanation below.)
DEFINE_uint32(
    sck_keep_bits, 50,
    "(-stress_cache_key) Number of bits to keep from each cache key (<= 64)");
// sck_randomize is used to validate whether cache key is performing "better
// than random." Even with this setting, file offsets are not randomized.
DEFINE_bool(sck_randomize, false,
            "(-stress_cache_key) Randomize (hash) cache key");
// See https://github.com/facebook/rocksdb/pull/9058
DEFINE_bool(sck_footer_unique_id, false,
            "(-stress_cache_key) Simulate using proposed footer unique id");
// ## END stress_cache_key sub-tool options ##

// ## BEGIN stress_cache_instances sub-tool options ##
DEFINE_uint32(stress_cache_instances, 0,
              "If > 0, run cache instance stress test instead");
// Uses cache_size and cache_type, maybe more
// ## END stress_cache_instance sub-tool options ##
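
// Example invocations (illustrative only; flag values are arbitrary and the
// binary name assumes the usual cache_bench build target):
//
//   ./cache_bench -cache_type=auto_hyper_clock_cache -threads=32 \
//       -ops_per_thread=5000000 -cache_size=1073741824
//   ./cache_bench -stress_cache_key -sck_keep_bits=40
//   ./cache_bench -stress_cache_instances=1000 -cache_type=lru_cache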

namespace ROCKSDB_NAMESPACE {

class CacheBench;

namespace {
// State shared by all concurrent executions of the same benchmark.
class SharedState {
 public:
  explicit SharedState(CacheBench* cache_bench)
      : cv_(&mu_), cache_bench_(cache_bench) {}
  ~SharedState() = default;
  port::Mutex* GetMutex() { return &mu_; }
  port::CondVar* GetCondVar() { return &cv_; }
  CacheBench* GetCacheBench() const { return cache_bench_; }
  void IncInitialized() { num_initialized_++; }
  void IncDone() { num_done_++; }
  bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; }
  bool AllDone() const { return num_done_ >= FLAGS_threads; }
  void SetStart() { start_ = true; }
  bool Started() const { return start_; }
  void AddLookupStats(uint64_t hits, uint64_t misses, size_t pinned_count) {
    MutexLock l(&mu_);
    lookup_count_ += hits + misses;
    lookup_hits_ += hits;
    pinned_count_ += pinned_count;
  }
  double GetLookupHitRatio() const {
    return 1.0 * lookup_hits_ / lookup_count_;
  }
  size_t GetPinnedCount() const { return pinned_count_; }

 private:
  port::Mutex mu_;
  port::CondVar cv_;
  CacheBench* cache_bench_;
  uint64_t num_initialized_ = 0;
  bool start_ = false;
  uint64_t num_done_ = 0;
  uint64_t lookup_count_ = 0;
  uint64_t lookup_hits_ = 0;
  size_t pinned_count_ = 0;
};

// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
  uint32_t tid;
  Random64 rnd;
  SharedState* shared;
  HistogramImpl latency_ns_hist;
  uint64_t duration_us = 0;

  ThreadState(uint32_t index, SharedState* _shared)
      : tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
};

struct KeyGen {
  char key_data[27];

  Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
    uint64_t raw = rnd.Next();
    // Skew according to setting
    for (uint32_t i = 0; i < skew; ++i) {
      raw = std::min(raw, rnd.Next());
    }
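    // (Taking the minimum of skew+1 uniform draws biases `raw` toward small
    // values, so FastRange64 below picks low-numbered keys much more often
    // than high-numbered ones -- a rough hot-key distribution.)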
    uint64_t key = FastRange64(raw, max_key);
    if (FLAGS_degenerate_hash_bits) {
      uint64_t key_hash =
          Hash64(reinterpret_cast<const char*>(&key), sizeof(key));
      // HCC uses the high 64 bits and a lower bit mask for starting probe
      // location, so we fix hash bits starting at the bottom of that word.
      auto hi_hash = uint64_t{0x9e3779b97f4a7c13U} ^
                     (key_hash << 1 << (FLAGS_degenerate_hash_bits - 1));
      uint64_t un_hi, un_lo;
      BijectiveUnhash2x64(hi_hash, key_hash, &un_hi, &un_lo);
      un_lo ^= BitwiseAnd(FLAGS_seed, INT32_MAX);
      EncodeFixed64(key_data, un_lo);
      EncodeFixed64(key_data + 8, un_hi);
      return Slice(key_data, kCacheKeySize);
    }
    // Variable size and alignment
    size_t off = key % 8;
    key_data[0] = char{42};
    EncodeFixed64(key_data + 1, key);
    key_data[9] = char{11};
    EncodeFixed64(key_data + 10, key);
    key_data[18] = char{4};
    EncodeFixed64(key_data + 19, key);
    assert(27 >= kCacheKeySize);
    return Slice(&key_data[off], kCacheKeySize);
  }
};

Cache::ObjectPtr createValue(Random64& rnd, MemoryAllocator* alloc) {
  char* rv = AllocateBlock(FLAGS_value_bytes, alloc).release();
  // Fill with some filler data, and take some CPU time, but add redundancy
  // as requested for compressibility.
  uint32_t random_fill_size = std::max(
      uint32_t{1}, std::min(FLAGS_value_bytes,
                            static_cast<uint32_t>(FLAGS_compressible_to_ratio *
                                                  FLAGS_value_bytes)));
  uint32_t i = 0;
  for (; i < random_fill_size; i += 8) {
    EncodeFixed64(rv + i, rnd.Next());
  }
  for (; i < FLAGS_value_bytes; i++) {
    rv[i] = rv[i % random_fill_size];
  }
  return rv;
}

// Callbacks for secondary cache
size_t SizeFn(Cache::ObjectPtr /*obj*/) { return FLAGS_value_bytes; }

Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/,
                size_t length, char* out) {
  memcpy(out, from_obj, length);
  return Status::OK();
}

Status CreateFn(const Slice& data, CompressionType /*type*/,
                CacheTier /*source*/, Cache::CreateContext* /*context*/,
                MemoryAllocator* alloc, Cache::ObjectPtr* out_obj,
                size_t* out_charge) {
  *out_obj = AllocateBlock(data.size(), alloc).release();
  memcpy(*out_obj, data.data(), data.size());
  *out_charge = data.size();
  return Status::OK();
}

void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* alloc) {
  CacheAllocationDeleter{alloc}(static_cast<char*>(value));
}
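
// For each block role there are two helpers: a basic one (*_wos, presumably
// "without secondary") that only supports deletion, and a full one that adds
// the size/save/create callbacks used with a secondary cache and takes the
// basic helper as its final constructor argument, apparently as the
// no-secondary-support fallback.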
Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn);
Cache::CacheItemHelper helper1(CacheEntryRole::kDataBlock, DeleteFn, SizeFn,
                               SaveToFn, CreateFn, &helper1_wos);
Cache::CacheItemHelper helper2_wos(CacheEntryRole::kIndexBlock, DeleteFn);
Cache::CacheItemHelper helper2(CacheEntryRole::kIndexBlock, DeleteFn, SizeFn,
                               SaveToFn, CreateFn, &helper2_wos);
Cache::CacheItemHelper helper3_wos(CacheEntryRole::kFilterBlock, DeleteFn);
Cache::CacheItemHelper helper3(CacheEntryRole::kFilterBlock, DeleteFn, SizeFn,
                               SaveToFn, CreateFn, &helper3_wos);

void ConfigureSecondaryCache(ShardedCacheOptions& opts) {
  if (!FLAGS_secondary_cache_uri.empty()) {
    std::shared_ptr<SecondaryCache> secondary_cache;
    Status s = SecondaryCache::CreateFromString(
        ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
    if (secondary_cache == nullptr) {
      fprintf(stderr,
              "No secondary cache registered matching string: %s status=%s\n",
              FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
      exit(1);
    }
    opts.secondary_cache = secondary_cache;
  }
}
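
// When a secondary cache is configured, the cache built above is wrapped, so
// unwrap it before casting down to the sharded base (used by PrintEnv() to
// report the number of shard bits).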
ShardedCacheBase* AsShardedCache(Cache* c) {
  if (!FLAGS_secondary_cache_uri.empty()) {
    c = static_cast_with_check<CacheWrapper>(c)->GetTarget().get();
  }
  return static_cast_with_check<ShardedCacheBase>(c);
}
}  // namespace

class CacheBench {
  static constexpr uint64_t kHundredthUint64 =
      std::numeric_limits<uint64_t>::max() / 100U;
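  // (Each *_percent flag is scaled by kHundredthUint64 to form cumulative
  // thresholds spanning the full uint64_t range, so a single uniformly random
  // 64-bit draw per op selects the operation type; see OperateCache().)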

 public:
  CacheBench()
      : max_key_(static_cast<uint64_t>(FLAGS_cache_size /
                                       FLAGS_resident_ratio /
                                       FLAGS_value_bytes)),
        lookup_insert_threshold_(kHundredthUint64 *
                                 FLAGS_lookup_insert_percent),
        insert_threshold_(lookup_insert_threshold_ +
                          kHundredthUint64 * FLAGS_insert_percent),
        blind_insert_threshold_(insert_threshold_ +
                                kHundredthUint64 * FLAGS_blind_insert_percent),
        lookup_threshold_(blind_insert_threshold_ +
                          kHundredthUint64 * FLAGS_lookup_percent),
        erase_threshold_(lookup_threshold_ +
                         kHundredthUint64 * FLAGS_erase_percent) {
    if (erase_threshold_ != 100U * kHundredthUint64) {
      fprintf(stderr, "Percentages must add to 100.\n");
      exit(1);
    }
    cache_ = MakeCache();
  }

  ~CacheBench() = default;

  static std::shared_ptr<Cache> MakeCache() {
    std::shared_ptr<MemoryAllocator> allocator;
    if (FLAGS_use_jemalloc_no_dump_allocator) {
      JemallocAllocatorOptions opts;
      opts.num_arenas = FLAGS_jemalloc_no_dump_allocator_num_arenas;
      opts.limit_tcache_size =
          FLAGS_jemalloc_no_dump_allocator_limit_tcache_size;
      Status s = NewJemallocNodumpAllocator(opts, &allocator);
      assert(s.ok());
    }
    if (FLAGS_cache_type == "clock_cache") {
      fprintf(stderr, "Old clock cache implementation has been removed.\n");
      exit(1);
    } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) {
      HyperClockCacheOptions opts(FLAGS_cache_size,
                                  /*estimated_entry_charge=*/0,
                                  FLAGS_num_shard_bits);
      opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
      opts.memory_allocator = allocator;
      opts.eviction_effort_cap = FLAGS_eviction_effort_cap;
      if (FLAGS_cache_type == "fixed_hyper_clock_cache") {
        opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0
                                          ? FLAGS_value_bytes_estimate
                                          : FLAGS_value_bytes;
      } else if (FLAGS_cache_type == "auto_hyper_clock_cache" ||
                 FLAGS_cache_type == "hyper_clock_cache") {
        if (FLAGS_value_bytes_estimate > 0) {
          opts.min_avg_entry_charge = FLAGS_value_bytes_estimate;
        }
      } else {
        fprintf(stderr, "Cache type not supported.\n");
        exit(1);
      }
      ConfigureSecondaryCache(opts);
      return opts.MakeSharedCache();
    } else if (FLAGS_cache_type == "lru_cache") {
      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
                           false /* strict_capacity_limit */,
                           0.5 /* high_pri_pool_ratio */);
      opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
      opts.memory_allocator = allocator;
      ConfigureSecondaryCache(opts);
      return NewLRUCache(opts);
    } else {
      fprintf(stderr, "Cache type not supported.\n");
      exit(1);
    }
  }

  void PopulateCache() {
    Random64 rnd(FLAGS_seed);
    KeyGen keygen;
    size_t max_occ = 0;
    size_t inserts_since_max_occ_increase = 0;
    size_t keys_since_last_not_found = 0;
    // Avoid redundant insertions by checking Lookup before Insert.
    // Loop until insertions consistently fail to increase max occupancy or
    // it becomes difficult to find keys not already inserted.
    while (inserts_since_max_occ_increase < 100 &&
           keys_since_last_not_found < 100) {
      Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);
      Cache::Handle* handle = cache_->Lookup(key);
      if (handle != nullptr) {
        cache_->Release(handle);
        ++keys_since_last_not_found;
        continue;
      }
      keys_since_last_not_found = 0;
      Status s =
          cache_->Insert(key, createValue(rnd, cache_->memory_allocator()),
                         &helper1, FLAGS_value_bytes);
      assert(s.ok());
      handle = cache_->Lookup(key);
      if (!handle) {
        fprintf(stderr, "Failed to lookup key just inserted.\n");
        assert(false);
        exit(42);
      } else {
        cache_->Release(handle);
      }
      size_t occ = cache_->GetOccupancyCount();
      if (occ > max_occ) {
        max_occ = occ;
        inserts_since_max_occ_increase = 0;
      } else {
        ++inserts_since_max_occ_increase;
      }
    }
    printf("Population complete (%zu entries, %g average charge)\n", max_occ,
           1.0 * FLAGS_cache_size / max_occ);
  }

  bool Run() {
    const auto clock = SystemClock::Default().get();
    PrintEnv();
    SharedState shared(this);
    std::vector<std::unique_ptr<ThreadState>> threads(FLAGS_threads);
    for (uint32_t i = 0; i < FLAGS_threads; i++) {
      threads[i].reset(new ThreadState(i, &shared));
      std::thread(ThreadBody, threads[i].get()).detach();
    }
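    // Worker threads are detached rather than joined; completion is tracked
    // via SharedState's done counter under its mutex and condition variable.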
    HistogramImpl stats_hist;
    std::string stats_report;
    std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report);
    uint64_t start_time;
    {
      MutexLock l(shared.GetMutex());
      while (!shared.AllInitialized()) {
        shared.GetCondVar()->Wait();
      }
      // Record start time
      start_time = clock->NowMicros();
      // Start all threads
      shared.SetStart();
      shared.GetCondVar()->SignalAll();
      // Wait for threads to complete
      while (!shared.AllDone()) {
        shared.GetCondVar()->Wait();
      }
    }
    // Stats gathering is considered background work. This time measurement
    // is for foreground work, and not really ideal for that. See below.
    uint64_t end_time = clock->NowMicros();
    stats_thread.join();

    // Wall clock time - includes idle time if threads
    // finish at different times (not ideal).
    double elapsed_secs = static_cast<double>(end_time - start_time) * 1e-6;
    uint32_t ops_per_sec = static_cast<uint32_t>(
        1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs);
    printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs,
           ops_per_sec);

    // Total time in each thread (more accurate throughput measure)
    elapsed_secs = 0;
    for (uint32_t i = 0; i < FLAGS_threads; i++) {
      elapsed_secs += threads[i]->duration_us * 1e-6;
    }
    ops_per_sec = static_cast<uint32_t>(1.0 * FLAGS_threads *
                                        FLAGS_ops_per_thread / elapsed_secs);
    printf("Thread ops/sec = %u\n", ops_per_sec);

    printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());

    size_t occ = cache_->GetOccupancyCount();
    size_t slot = cache_->GetTableAddressCount();
    printf("Final load factor: %g (%zu / %zu)\n", 1.0 * occ / slot, occ, slot);

    printf("Final pinned count: %zu\n", shared.GetPinnedCount());

    if (FLAGS_histograms) {
      printf("\nOperation latency (ns):\n");
      HistogramImpl combined;
      for (uint32_t i = 0; i < FLAGS_threads; i++) {
        combined.Merge(threads[i]->latency_ns_hist);
      }
      printf("%s", combined.ToString().c_str());

      if (FLAGS_gather_stats) {
        printf("\nGather stats latency (us):\n");
        printf("%s", stats_hist.ToString().c_str());
      }
    }

    if (FLAGS_report_problems) {
      printf("\n");
      std::shared_ptr<Logger> logger =
          std::make_shared<StderrLogger>(InfoLogLevel::DEBUG_LEVEL);
      cache_->ReportProblems(logger);
    }

    printf("%s", stats_report.c_str());

    return true;
  }

 private:
  std::shared_ptr<Cache> cache_;
  const uint64_t max_key_;
  // Cumulative thresholds in the space of a random uint64_t
  const uint64_t lookup_insert_threshold_;
  const uint64_t insert_threshold_;
  const uint64_t blind_insert_threshold_;
  const uint64_t lookup_threshold_;
  const uint64_t erase_threshold_;

  // A benchmark version of gathering stats on an active block cache by
  // iterating over it. The primary purpose is to measure the impact of
  // gathering stats with ApplyToAllEntries on throughput- and
  // latency-sensitive Cache users. Performance of stats gathering is
  // also reported. The last set of gathered stats is also reported, for
  // manual sanity checking for logical errors or other unexpected
  // behavior of cache_bench or the underlying Cache.
  static void StatsBody(SharedState* shared, HistogramImpl* stats_hist,
                        std::string* stats_report) {
    if (!FLAGS_gather_stats) {
      return;
    }
    const auto clock = SystemClock::Default().get();
    uint64_t total_key_size = 0;
    uint64_t total_charge = 0;
    uint64_t total_entry_count = 0;
    uint64_t table_occupancy = 0;
    uint64_t table_size = 0;
    std::set<const Cache::CacheItemHelper*> helpers;
    StopWatchNano timer(clock);

    for (;;) {
      uint64_t time = clock->NowMicros();
      uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000;

      {
        MutexLock l(shared->GetMutex());
        for (;;) {
          if (shared->AllDone()) {
            std::ostringstream ostr;
            ostr << "\nMost recent cache entry stats:\n"
                 << "Number of entries: " << total_entry_count << "\n"
                 << "Table occupancy: " << table_occupancy << " / "
                 << table_size << " = "
                 << (100.0 * table_occupancy / table_size) << "%\n"
                 << "Total charge: " << BytesToHumanString(total_charge) << "\n"
                 << "Average key size: "
                 << (1.0 * total_key_size / total_entry_count) << "\n"
                 << "Average charge: "
                 << BytesToHumanString(static_cast<uint64_t>(
                        1.0 * total_charge / total_entry_count))
                 << "\n"
                 << "Unique helpers: " << helpers.size() << "\n";
            *stats_report = ostr.str();
            return;
          }
          if (clock->NowMicros() >= deadline) {
            break;
          }
          uint64_t diff = deadline - std::min(clock->NowMicros(), deadline);
          shared->GetCondVar()->TimedWait(diff + 1);
        }
      }

      // Now gather stats, outside of mutex
      total_key_size = 0;
      total_charge = 0;
      total_entry_count = 0;
      helpers.clear();
      auto fn = [&](const Slice& key, Cache::ObjectPtr /*value*/, size_t charge,
                    const Cache::CacheItemHelper* helper) {
        total_key_size += key.size();
        total_charge += charge;
        ++total_entry_count;
        // Something slightly more expensive as in stats by category
        helpers.insert(helper);
      };
      if (FLAGS_histograms) {
        timer.Start();
      }
      Cache::ApplyToAllEntriesOptions opts;
      opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
      shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
      table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
      table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
      if (FLAGS_histograms) {
        stats_hist->Add(timer.ElapsedNanos() / 1000);
      }
    }
  }

  static void ThreadBody(ThreadState* thread) {
    SharedState* shared = thread->shared;

    {
      MutexLock l(shared->GetMutex());
      shared->IncInitialized();
      if (shared->AllInitialized()) {
        shared->GetCondVar()->SignalAll();
      }
      while (!shared->Started()) {
        shared->GetCondVar()->Wait();
      }
    }
    thread->shared->GetCacheBench()->OperateCache(thread);

    {
      MutexLock l(shared->GetMutex());
      shared->IncDone();
      if (shared->AllDone()) {
        shared->GetCondVar()->SignalAll();
      }
    }
  }

  void OperateCache(ThreadState* thread) {
    // To use looked-up values
    uint64_t result = 0;
    uint64_t lookup_misses = 0;
    uint64_t lookup_hits = 0;
    // To hold handles for a non-trivial amount of time
    std::deque<Cache::Handle*> pinned;
    size_t total_pin_count = static_cast<size_t>(
        (FLAGS_cache_size * FLAGS_pinned_ratio) / FLAGS_value_bytes + 0.999999);
    // For this thread. Some round up, some round down, as appropriate
    size_t pin_count = (total_pin_count + thread->tid) / FLAGS_threads;
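    // ((total_pin_count + tid) / threads rounds up for some threads and down
    // for others; summed over all tids, the per-thread counts add up exactly
    // to total_pin_count.)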
    KeyGen gen;
    const auto clock = SystemClock::Default().get();
    uint64_t start_time = clock->NowMicros();
    StopWatchNano timer(clock);
    auto system_clock = SystemClock::Default();
    size_t steps_to_next_capacity_change = 0;

    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
      Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
      uint64_t random_op = thread->rnd.Next();

      if (FLAGS_vary_capacity_ratio > 0.0 && thread->tid == 0) {
        if (steps_to_next_capacity_change == 0) {
          double cut_ratio = static_cast<double>(thread->rnd.Next()) /
                             static_cast<double>(UINT64_MAX) *
                             FLAGS_vary_capacity_ratio;
          cache_->SetCapacity(FLAGS_cache_size * (1.0 - cut_ratio));
          steps_to_next_capacity_change =
              static_cast<size_t>(FLAGS_ops_per_thread / 100);
        } else {
          --steps_to_next_capacity_change;
        }
      }

      if (FLAGS_histograms) {
        timer.Start();
      }

      if (random_op < lookup_insert_threshold_) {
        // do lookup
        auto handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
                                     Cache::Priority::LOW);
        if (handle) {
          ++lookup_hits;
          if (!FLAGS_lean) {
            // do something with the data
            result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                               FLAGS_value_bytes);
          }
          pinned.push_back(handle);
        } else {
          ++lookup_misses;
          // do insert
          Status s = cache_->Insert(
              key, createValue(thread->rnd, cache_->memory_allocator()),
              &helper2, FLAGS_value_bytes, &pinned.emplace_back());
          assert(s.ok());
        }
      } else if (random_op < insert_threshold_) {
        // do insert
        Status s = cache_->Insert(
            key, createValue(thread->rnd, cache_->memory_allocator()),
            &helper3, FLAGS_value_bytes, &pinned.emplace_back());
        assert(s.ok());
      } else if (random_op < blind_insert_threshold_) {
        // insert without keeping a handle
        Status s = cache_->Insert(
            key, createValue(thread->rnd, cache_->memory_allocator()),
            &helper3, FLAGS_value_bytes);
        assert(s.ok());
      } else if (random_op < lookup_threshold_) {
        // do lookup
        auto handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
                                     Cache::Priority::LOW);
        if (handle) {
          ++lookup_hits;
          if (!FLAGS_lean) {
            // do something with the data
            result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                               FLAGS_value_bytes);
          }
          pinned.push_back(handle);
        } else {
          ++lookup_misses;
        }
      } else if (random_op < erase_threshold_) {
        // do erase
        cache_->Erase(key);
      } else {
        // Should be extremely unlikely (noop)
        assert(random_op >= kHundredthUint64 * 100U);
      }
      if (FLAGS_histograms) {
        thread->latency_ns_hist.Add(timer.ElapsedNanos());
      }
      if (FLAGS_usleep > 0) {
        unsigned us =
            static_cast<unsigned>(thread->rnd.Uniform(FLAGS_usleep + 1));
        if (us > 0) {
          system_clock->SleepForMicroseconds(us);
        }
      }
      while (pinned.size() > pin_count) {
        cache_->Release(pinned.front());
        pinned.pop_front();
      }
    }
    if (FLAGS_early_exit) {
      MutexLock l(thread->shared->GetMutex());
      exit(0);
    }
    thread->shared->AddLookupStats(lookup_hits, lookup_misses, pinned.size());
    for (auto handle : pinned) {
      cache_->Release(handle);
      handle = nullptr;
    }
    // Ensure computations on `result` are not optimized away.
    if (result == 1) {
      printf("You are extremely unlucky(2). Try again.\n");
      exit(1);
    }
    thread->duration_us = clock->NowMicros() - start_time;
  }

  void PrintEnv() const {
#if defined(__GNUC__) && !defined(__OPTIMIZE__)
    printf(
        "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n");
#endif
#ifndef NDEBUG
    printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
    printf("----------------------------\n");
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
    printf("Cache impl name     : %s\n", cache_->Name());
    printf("DMutex impl name    : %s\n", DMutex::kName());
    printf("Number of threads   : %u\n", FLAGS_threads);
    printf("Ops per thread      : %" PRIu64 "\n", FLAGS_ops_per_thread);
    printf("Cache size          : %s\n",
           BytesToHumanString(FLAGS_cache_size).c_str());
    printf("Num shard bits      : %d\n",
           AsShardedCache(cache_.get())->GetNumShardBits());
    printf("Max key             : %" PRIu64 "\n", max_key_);
    printf("Resident ratio      : %g\n", FLAGS_resident_ratio);
    printf("Skew degree         : %u\n", FLAGS_skew);
    printf("Populate cache      : %d\n", int{FLAGS_populate_cache});
    printf("Lookup+Insert pct   : %u%%\n", FLAGS_lookup_insert_percent);
    printf("Insert percentage   : %u%%\n", FLAGS_insert_percent);
    printf("Lookup percentage   : %u%%\n", FLAGS_lookup_percent);
    printf("Erase percentage    : %u%%\n", FLAGS_erase_percent);
    std::ostringstream stats;
    if (FLAGS_gather_stats) {
      stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, "
            << FLAGS_gather_stats_entries_per_lock << "/lock)";
    } else {
      stats << "disabled";
    }
    printf("Gather stats        : %s\n", stats.str().c_str());
    printf("----------------------------\n");
  }
};

// cache_bench -stress_cache_key is an independent embedded tool for
// estimating the probability of CacheKey collisions through simulation.
// At a high level, it simulates generating SST files over many months,
// keeping them in the DB and/or cache for some lifetime while staying
// under resource caps, and checking for any cache key collisions that
// arise among the set of live files. For efficient simulation, we make
// some simplifying "pessimistic" assumptions (that only increase the
// chance of the simulation reporting a collision relative to the chance
// of collision in practice):
// * Every generated file has a cache entry for every byte offset in the
//   file (contiguous range of cache keys)
// * All of every file is cached for its entire lifetime. (Here "lifetime"
//   is technically the union of DB and Cache lifetime, though we only
//   model a generous DB lifetime, where space usage is always maximized.
//   In an effective Cache, lifetime in cache can only substantially exceed
//   lifetime in DB if there is little cache activity; cache activity is
//   required to hit cache key collisions.)
//
// It would be possible to track an exact set of cache key ranges for the
// set of live files, but we would have no hope of observing collisions
// (overlap in live files) in our simulation. We need to employ some way
// of amplifying collision probability that allows us to predict the real
// collision probability by extrapolation from observed collisions. Our
// basic approach is to reduce each cache key range down to some smaller
// number of bits, limited to bits that are shared over the whole range.
// Now we can observe collisions using a set of smaller stripped-down
// (reduced) cache keys. Let's do some case analysis to understand why this
// works:
// * No collision in reduced key - because the reduction is a pure function,
//   this implies no collision in the full keys.
// * Collision detected between two reduced keys - either
//   * The reduction has dropped some structured uniqueness info (from one of
//     session counter or file number; file offsets are never materialized
//     here). This can only artificially inflate the observed and extrapolated
//     collision probabilities. We only have to worry about this in designing
//     the reduction.
//   * The reduction has preserved all the structured uniqueness in the cache
//     key, which means either
//     * REJECTED: We have a uniqueness bug in generating cache keys, where
//       structured uniqueness info should have been different but isn't. In
//       such a case, increasing by 1 the number of bits kept after reduction
//       would not reduce observed probabilities by half. (In our
//       observations, the probabilities are reduced approximately by half.)
//     * ACCEPTED: The lost unstructured uniqueness in the key determines the
//       probability that an observed collision would imply an overlap in
//       ranges. In short, dropping n bits from the key would increase
//       collision probability by 2**n, assuming those n bits have full
//       entropy in unstructured uniqueness.
//
// But we also have to account for the key ranges based on file size. If file
// sizes are roughly 2**b offsets, using XOR in 128-bit cache keys for
// "ranges", we know from other simulations (see
// https://github.com/pdillinger/unique_id/) that that's roughly equivalent to
// (less than 2x higher collision probability) using a cache key of size
// 128 - b bits for the whole file. (This is the only place we make an
// "optimistic" assumption, which is more than offset by the real
// implementation stripping off 2 lower bits from block byte offsets for cache
// keys. The simulation assumes byte offsets, which is net pessimistic.)
//
// So to accept the extrapolation as valid, we need to be confident that all
// "lost" bits, excluding those covered by file offset, are full entropy.
// Recall that we have assumed (verifiably, safely) that other structured data
// (file number and session counter) are kept, not lost. Based on the
// implementation comments for OffsetableCacheKey, the only potential hole here
// is that we only have ~103 bits of entropy in "all new" session IDs, and in
// extreme cases, there might be only 1 DB ID. However, because the upper ~39
// bits of session ID are hashed, the combination of file number and file
// offset only has to add to 25 bits (or more) to ensure full entropy in
// unstructured uniqueness lost in the reduction. Typical file size of 32MB
// suffices (at least for simulation purposes where we assume each file offset
// occupies a cache key).
//
// Example results in comments on OffsetableCacheKey.
class StressCacheKey {
 public:
  void Run() {
    if (FLAGS_sck_footer_unique_id) {
      // Proposed footer unique IDs are DB-independent and session-independent
      // (but process-dependent) which is most easily simulated here by
      // assuming 1 DB and (later below) no session resets without process
      // reset.
      FLAGS_sck_db_count = 1;
    }

    // Describe the simulated workload
    uint64_t mb_per_day =
        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_file_size_mb;
    printf("Total cache or DBs size: %gTiB Writing %g MiB/s or %gTiB/day\n",
           FLAGS_sck_file_size_mb / 1024.0 / 1024.0 *
               std::pow(2.0, FLAGS_sck_table_bits),
           mb_per_day / 86400.0, mb_per_day / 1024.0 / 1024.0);
    // For extrapolating probability of any collisions from a number of
    // observed collisions
    multiplier_ = std::pow(2.0, 128 - FLAGS_sck_keep_bits) /
                  (FLAGS_sck_file_size_mb * 1024.0 * 1024.0);
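    // (The reduced-key simulation observes collisions roughly multiplier_
    // times more often than full 128-bit keys covering file-sized ranges
    // would, so observed days-between-collisions are scaled up by multiplier_
    // to produce the "corrected" figures printed below.)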
    printf(
        "Multiply by %g to correct for simulation losses (but still assume "
        "whole file cached)\n",
        multiplier_);

    restart_nfiles_ = FLAGS_sck_files_per_day / FLAGS_sck_restarts_per_day;

    double without_ejection =
        std::pow(1.414214, FLAGS_sck_keep_bits) / FLAGS_sck_files_per_day;
    // This should be a lower bound for -sck_randomize, usually a terribly
    // rough lower bound.
    // If observation is worse than this, then something has gone wrong.
    printf(
        "Without ejection, expect random collision after %g days (%g "
        "corrected)\n",
        without_ejection, without_ejection * multiplier_);

    double with_full_table =
        std::pow(2.0, FLAGS_sck_keep_bits - FLAGS_sck_table_bits) /
        FLAGS_sck_files_per_day;
    // This is an alternate lower bound for -sck_randomize, usually pretty
    // accurate. Our cache keys should usually perform "better than random"
    // but always no worse. (If observation is substantially worse than this,
    // then something has gone wrong.)
    printf(
        "With ejection and full table, expect random collision after %g "
        "days (%g corrected)\n",
        with_full_table, with_full_table * multiplier_);

    collisions_ = 0;
    // Run until sufficient number of observed collisions.
    for (int i = 1; collisions_ < FLAGS_sck_min_collision; i++) {
      RunOnce();
      if (collisions_ == 0) {
        printf(
            "No collisions after %d x %u days "
            " \n",
            i, FLAGS_sck_days_per_run);
      } else {
        double est = 1.0 * i * FLAGS_sck_days_per_run / collisions_;
        printf("%" PRIu64
               " collisions after %d x %u days, est %g days between (%g "
               "corrected) \n",
               collisions_, i, FLAGS_sck_days_per_run, est, est * multiplier_);
      }
    }
  }

  void RunOnce() {
    // Re-initialize simulated state
    const size_t db_count = std::max(size_t{FLAGS_sck_db_count}, size_t{1});
    dbs_.reset(new TableProperties[db_count]{});
    const size_t table_mask = (size_t{1} << FLAGS_sck_table_bits) - 1;
    table_.reset(new uint64_t[table_mask + 1]{});
    if (FLAGS_sck_keep_bits > 64) {
      FLAGS_sck_keep_bits = 64;
    }

    // Details of which bits are dropped in reduction
    uint32_t shift_away = 64 - FLAGS_sck_keep_bits;
    // Shift away fewer potential file number bits (b) than potential
    // session counter bits (a).
    uint32_t shift_away_b = shift_away / 3;
    uint32_t shift_away_a = shift_away - shift_away_b;

    process_count_ = 0;
    session_count_ = 0;
    newdb_count_ = 0;
    ResetProcess(/*newdbs*/ true);

    Random64 r{std::random_device{}()};

    uint64_t max_file_count =
        uint64_t{FLAGS_sck_files_per_day} * FLAGS_sck_days_per_run;
    uint32_t report_count = 0;
    uint32_t collisions_this_run = 0;
    size_t db_i = 0;

    for (uint64_t file_count = 1; file_count <= max_file_count;
         ++file_count, ++db_i) {
      // Round-robin through DBs (this is faster than %)
      if (db_i >= db_count) {
        db_i = 0;
      }
      // Any other periodic actions before simulating next file
      if (!FLAGS_sck_footer_unique_id && r.OneIn(FLAGS_sck_reopen_nfiles)) {
        ResetSession(db_i, /*newdb*/ r.OneIn(FLAGS_sck_newdb_nreopen));
      } else if (r.OneIn(restart_nfiles_)) {
        ResetProcess(/*newdbs*/ false);
      }
      // Simulate next file
      OffsetableCacheKey ock;
      dbs_[db_i].orig_file_number += 1;
      // Skip some file numbers for other file kinds, except with the footer
      // unique ID, where orig_file_number here tracks the process-wide
      // generated SST file count.
      if (!FLAGS_sck_footer_unique_id) {
        dbs_[db_i].orig_file_number += (r.Next() & 3);
      }
      bool is_stable;
      BlockBasedTable::SetupBaseCacheKey(&dbs_[db_i], /* ignored */ "",
                                         /* ignored */ 42, &ock, &is_stable);
      assert(is_stable);
      // Get a representative cache key, which later we analytically generalize
      // to a range.
      CacheKey ck = ock.WithOffset(0);
      uint64_t reduced_key;
      if (FLAGS_sck_randomize) {
        reduced_key = GetSliceHash64(ck.AsSlice()) >> shift_away;
      } else if (FLAGS_sck_footer_unique_id) {
        // Special case: keep only file number, not session counter
        reduced_key = DecodeFixed64(ck.AsSlice().data()) >> shift_away;
      } else {
        // Try to keep file number and session counter (shift away other bits)
        uint32_t a = DecodeFixed32(ck.AsSlice().data()) << shift_away_a;
        uint32_t b = DecodeFixed32(ck.AsSlice().data() + 4) >> shift_away_b;
        reduced_key = (uint64_t{a} << 32) + b;
      }
      if (reduced_key == 0) {
        // Unlikely, but we need to exclude tracking this value because we
        // use it to mean "empty" in table. This case is OK as long as we
        // don't hit it often.
        printf("Hit Zero! \n");
        file_count--;
        continue;
      }
      uint64_t h =
          NPHash64(reinterpret_cast<char*>(&reduced_key), sizeof(reduced_key));
      // Skew expected lifetimes, for high (super-Poisson) variance in actual
      // lifetimes.
      size_t pos =
          std::min(Lower32of64(h) & table_mask, Upper32of64(h) & table_mask);
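      // (Taking the min of two hash-derived positions makes low table slots
      // much more likely targets than high ones, so entries in rarely-chosen
      // slots survive far longer than a uniform replacement policy would
      // allow.)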
      if (table_[pos] == reduced_key) {
        collisions_this_run++;
        // Our goal is to predict probability of no collisions, not expected
        // number of collisions. To make the distinction, we have to get rid
        // of observing correlated collisions, which this takes care of:
        ResetProcess(/*newdbs*/ false);
      } else {
        // Replace (end of lifetime for file that was in this slot)
        table_[pos] = reduced_key;
      }

      if (++report_count == FLAGS_sck_files_per_day) {
        report_count = 0;
        // Estimate fill %
        size_t incr = table_mask / 1000;
        size_t sampled_count = 0;
        for (size_t i = 0; i <= table_mask; i += incr) {
          if (table_[i] != 0) {
            sampled_count++;
          }
        }
        // Report
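        // (occ % is the sampled table occupancy; ejected % estimates the
        // share of files generated so far whose entries have already been
        // overwritten, i.e. everything no longer live in the table.)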
        printf(
            "%" PRIu64 " days, %" PRIu64 " proc, %" PRIu64 " sess, %" PRIu64
            " newdb, %u coll, occ %g%%, ejected %g%% \r",
            file_count / FLAGS_sck_files_per_day, process_count_,
            session_count_, newdb_count_ - FLAGS_sck_db_count,
            collisions_this_run, 100.0 * sampled_count / 1000.0,
            100.0 * (1.0 - sampled_count / 1000.0 * table_mask / file_count));
        fflush(stdout);
      }
    }
    collisions_ += collisions_this_run;
  }

  void ResetSession(size_t i, bool newdb) {
    dbs_[i].db_session_id = DBImpl::GenerateDbSessionId(nullptr);
    if (newdb) {
      ++newdb_count_;
      if (FLAGS_sck_footer_unique_id) {
        // Simulate how footer id would behave
        dbs_[i].db_id = "none";
      } else {
        // db_id might be ignored, depending on the implementation details
        dbs_[i].db_id = std::to_string(newdb_count_);
        dbs_[i].orig_file_number = 0;
      }
    }
    session_count_++;
  }

  void ResetProcess(bool newdbs) {
    process_count_++;
    DBImpl::TEST_ResetDbSessionIdGen();
    for (size_t i = 0; i < FLAGS_sck_db_count; ++i) {
      ResetSession(i, newdbs);
    }
    if (FLAGS_sck_footer_unique_id) {
      // For footer unique ID, this tracks process-wide generated SST file
      // count.
      dbs_[0].orig_file_number = 0;
    }
  }

 private:
  // Use db_session_id and orig_file_number from TableProperties
  std::unique_ptr<TableProperties[]> dbs_;
  std::unique_ptr<uint64_t[]> table_;
  uint64_t process_count_ = 0;
  uint64_t session_count_ = 0;
  uint64_t newdb_count_ = 0;
  uint64_t collisions_ = 0;
  uint32_t restart_nfiles_ = 0;
  double multiplier_ = 0.0;
};

// cache_bench -stress_cache_instances is a partially independent embedded tool
// for evaluating the time and space required to create and destroy many cache
// instances, as this is considered important for a default cache
// implementation which could see many throw-away instances in handling of
// Options, or created in large numbers for many very small DBs with many CFs.
// Prefix command line with /usr/bin/time to see max RSS memory.
class StressCacheInstances {
 public:
  void Run() {
    const int kNumIterations = 10;
    const auto clock = SystemClock::Default().get();
    caches_.reserve(FLAGS_stress_cache_instances);

    uint64_t total_create_time_us = 0;
    uint64_t total_destroy_time_us = 0;

    for (int iter = 0; iter < kNumIterations; ++iter) {
      // Create many cache instances
      uint64_t start_create = clock->NowMicros();
      for (uint32_t i = 0; i < FLAGS_stress_cache_instances; ++i) {
        caches_.emplace_back(CacheBench::MakeCache());
      }
      uint64_t end_create = clock->NowMicros();
      uint64_t create_time = end_create - start_create;
      total_create_time_us += create_time;

      // Destroy them
      uint64_t start_destroy = clock->NowMicros();
      caches_.clear();
      uint64_t end_destroy = clock->NowMicros();
      uint64_t destroy_time = end_destroy - start_destroy;
      total_destroy_time_us += destroy_time;

      printf(
          "Iteration %d: Created %u caches in %.3f ms, destroyed in %.3f ms\n",
          iter + 1, FLAGS_stress_cache_instances, create_time / 1000.0,
          destroy_time / 1000.0);
    }

    printf("Average creation time: %.3f ms (%.1f us per cache)\n",
           static_cast<double>(total_create_time_us) / kNumIterations / 1000.0,
           static_cast<double>(total_create_time_us) / kNumIterations /
               FLAGS_stress_cache_instances);
    printf("Average destruction time: %.3f ms (%.1f us per cache)\n",
           static_cast<double>(total_destroy_time_us) / kNumIterations / 1000.0,
           static_cast<double>(total_destroy_time_us) / kNumIterations /
               FLAGS_stress_cache_instances);
  }

 private:
  std::vector<std::shared_ptr<Cache>> caches_;
};

int cache_bench_tool(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ParseCommandLineFlags(&argc, &argv, true);

  if (FLAGS_stress_cache_key) {
    // Alternate tool
    StressCacheKey().Run();
    return 0;
  }
  if (FLAGS_stress_cache_instances > 0) {
    StressCacheInstances().Run();
    return 0;
  }

  if (FLAGS_threads <= 0) {
    fprintf(stderr, "threads number <= 0\n");
    exit(1);
  }

  if (FLAGS_seed == 0) {
    FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
    printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
  }

  ROCKSDB_NAMESPACE::CacheBench bench;
  if (FLAGS_populate_cache) {
    bench.PopulateCache();
  }
  if (bench.Run()) {
    return 0;
  } else {
    return 1;
  }
}
}  // namespace ROCKSDB_NAMESPACE

#endif  // GFLAGS