// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include <algorithm>
#include <array>
#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "port/stack_trace.h"
#include "rocksdb/system_clock.h"
#include "test_util/testharness.h"
#include "util/bloom_impl.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/ribbon_config.h"
#include "util/ribbon_impl.h"
#include "util/stop_watch.h"
#include "util/string_util.h"

#ifndef GFLAGS
uint32_t FLAGS_thoroughness = 5;
uint32_t FLAGS_max_add = 0;
uint32_t FLAGS_min_check = 4000;
uint32_t FLAGS_max_check = 100000;
bool FLAGS_verbose = false;
bool FLAGS_find_occ = false;
bool FLAGS_find_slot_occ = false;
double FLAGS_find_next_factor = 1.618;
uint32_t FLAGS_find_iters = 10000;
uint32_t FLAGS_find_min_slots = 128;
uint32_t FLAGS_find_max_slots = 1000000;
bool FLAGS_optimize_homog = false;
uint32_t FLAGS_optimize_homog_slots = 30000000;
uint32_t FLAGS_optimize_homog_check = 200000;
double FLAGS_optimize_homog_granularity = 0.002;
#else
#include "util/gflags_compat.h"
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
// Using 500 is a good test when you have time to be thorough.
// Default is for general RocksDB regression test runs.
DEFINE_uint32(thoroughness, 5, "iterations per configuration");
DEFINE_uint32(max_add, 0,
              "Add up to this number of entries to a single filter in "
              "CompactnessAndBacktrackAndFpRate; 0 == reasonable default");
DEFINE_uint32(min_check, 4000,
              "Minimum number of novel entries for testing FP rate");
DEFINE_uint32(max_check, 100000,
              "Maximum number of novel entries for testing FP rate");
DEFINE_bool(verbose, false, "Print extra details");

// Options for FindOccupancy, which is more of a tool than a test.
DEFINE_bool(find_occ, false, "whether to run the FindOccupancy tool");
DEFINE_bool(find_slot_occ, false,
            "whether to show individual slot occupancies with "
            "FindOccupancy tool");
DEFINE_double(find_next_factor, 1.618,
              "factor to next num_slots for FindOccupancy");
DEFINE_uint32(find_iters, 10000, "number of samples for FindOccupancy");
DEFINE_uint32(find_min_slots, 128, "number of slots for FindOccupancy");
DEFINE_uint32(find_max_slots, 1000000, "number of slots for FindOccupancy");

// Options for OptimizeHomogAtScale, which is more of a tool than a test.
DEFINE_bool(optimize_homog, false,
            "whether to run the OptimizeHomogAtScale tool");
DEFINE_uint32(optimize_homog_slots, 30000000,
              "number of slots for OptimizeHomogAtScale");
DEFINE_uint32(optimize_homog_check, 200000,
              "number of queries for checking FP rate in OptimizeHomogAtScale");
DEFINE_double(
    optimize_homog_granularity, 0.002,
    "overhead change between FP rate checking in OptimizeHomogAtScale");
#endif  // GFLAGS

template <typename TypesAndSettings>
class RibbonTypeParamTest : public ::testing::Test {};

class RibbonTest : public ::testing::Test {};

namespace {

// Different ways of generating keys for testing

// Generate semi-sequential keys
struct StandardKeyGen {
  StandardKeyGen(const std::string& prefix, uint64_t id)
      : id_(id), str_(prefix) {
    ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0);
  }

  // Prefix (only one required)
  StandardKeyGen& operator++() {
    ++id_;
    return *this;
  }

  StandardKeyGen& operator+=(uint64_t i) {
    id_ += i;
    return *this;
  }

  const std::string& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    return str_;
  }

  bool operator==(const StandardKeyGen& other) const {
    // Same prefix is assumed
    return id_ == other.id_;
  }
  bool operator!=(const StandardKeyGen& other) const {
    // Same prefix is assumed
    return id_ != other.id_;
  }

  uint64_t id_;
  std::string str_;
};
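
// Sketch of how these generators are used below: a pair of generators acts
// as an iterator range over generated keys, e.g.
//   StandardKeyGen cur("prefix", 0), end("prefix", 100);
//   while (cur != end) { Use(*cur); ++cur; }
// ("Use" is just a stand-in for whatever consumes the 100 distinct keys.)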

// Generate small sequential keys, that can misbehave with sequential seeds
// as in https://github.com/Cyan4973/xxHash/issues/469.
// These keys are only heuristically unique, but that's OK with 64 bits,
// for testing purposes.
struct SmallKeyGen {
  SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) {
    // Hash the prefix for a heuristically unique offset
    id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix);
    ROCKSDB_NAMESPACE::PutFixed64(&str_, id_);
  }

  // Prefix (only one required)
  SmallKeyGen& operator++() {
    ++id_;
    return *this;
  }

  SmallKeyGen& operator+=(uint64_t i) {
    id_ += i;
    return *this;
  }

  const std::string& operator*() {
    ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_);
    return str_;
  }

  bool operator==(const SmallKeyGen& other) const { return id_ == other.id_; }
  bool operator!=(const SmallKeyGen& other) const { return id_ != other.id_; }

  uint64_t id_;
  std::string str_;
};

template <typename KeyGen>
struct Hash32KeyGenWrapper : public KeyGen {
  Hash32KeyGenWrapper(const std::string& prefix, uint64_t id)
      : KeyGen(prefix, id) {}
  uint32_t operator*() {
    auto& key = *static_cast<KeyGen&>(*this);
    // unseeded
    return ROCKSDB_NAMESPACE::GetSliceHash(key);
  }
};

template <typename KeyGen>
struct Hash64KeyGenWrapper : public KeyGen {
  Hash64KeyGenWrapper(const std::string& prefix, uint64_t id)
      : KeyGen(prefix, id) {}
  uint64_t operator*() {
    auto& key = *static_cast<KeyGen&>(*this);
    // unseeded
    return ROCKSDB_NAMESPACE::GetSliceHash64(key);
  }
};

using ROCKSDB_NAMESPACE::ribbon::ConstructionFailureChance;

const std::vector<ConstructionFailureChance> kFailureOnly50Pct = {
    ROCKSDB_NAMESPACE::ribbon::kOneIn2};

const std::vector<ConstructionFailureChance> kFailureOnlyRare = {
    ROCKSDB_NAMESPACE::ribbon::kOneIn1000};

const std::vector<ConstructionFailureChance> kFailureAll = {
    ROCKSDB_NAMESPACE::ribbon::kOneIn2, ROCKSDB_NAMESPACE::ribbon::kOneIn20,
    ROCKSDB_NAMESPACE::ribbon::kOneIn1000};

}  // namespace

using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate;
using ROCKSDB_NAMESPACE::ribbon::StandardHasher;
using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter;

struct DefaultTypesAndSettings {
  using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128;
  using ResultRow = uint8_t;
  using Index = uint32_t;
  using Hash = uint64_t;
  using Seed = uint32_t;
  using Key = ROCKSDB_NAMESPACE::Slice;
  static constexpr bool kIsFilter = true;
  static constexpr bool kHomogeneous = false;
  static constexpr bool kFirstCoeffAlwaysOne = true;
  static constexpr bool kUseSmash = false;
  static constexpr bool kAllowZeroStarts = false;
  static Hash HashFn(const Key& key, uint64_t raw_seed) {
    // This version 0.7.2 preview of XXH3 (a.k.a. XXPH3) function does
    // not pass SmallKeyGen tests below without some seed premixing from
    // StandardHasher. See https://github.com/Cyan4973/xxHash/issues/469
    return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed);
  }
  // For testing
  using KeyGen = StandardKeyGen;
  static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
    return kFailureAll;
  }
};
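
// The tests below consume a TypesAndSettings struct through the IMPORT
// macros from ribbon_impl.h, which bring the configured types into scope:
//   IMPORT_RIBBON_TYPES_AND_SETTINGS(DefaultTypesAndSettings);
//   IMPORT_RIBBON_IMPL_TYPES(DefaultTypesAndSettings);
// after which names like Hasher, Banding, SimpleSoln, and InterleavedSoln
// refer to the implementation types for that configuration.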

using TypesAndSettings_Coeff128 = DefaultTypesAndSettings;
struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings {
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_Coeff64Smash : public TypesAndSettings_Coeff64 {
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash {
  static constexpr bool kFirstCoeffAlwaysOne = false;
};

// Homogeneous Ribbon configurations
struct TypesAndSettings_Coeff128_Homog : public DefaultTypesAndSettings {
  static constexpr bool kHomogeneous = true;
  // Since our best construction success setting still has 1/1000 failure
  // rate, the best FP rate we test is 1/256
  using ResultRow = uint8_t;
  // Homogeneous only makes sense with sufficient slots for equivalent of
  // almost sure construction success
  static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
    return kFailureOnlyRare;
  }
};
struct TypesAndSettings_Coeff128Smash_Homog
    : public TypesAndSettings_Coeff128_Homog {
  // Smash (extra time to save space) + Homog (extra space to save time)
  // doesn't make much sense in practice, but we minimally test it
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff64_Homog : public TypesAndSettings_Coeff128_Homog {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_Coeff64Smash_Homog
    : public TypesAndSettings_Coeff64_Homog {
  // Smash (extra time to save space) + Homog (extra space to save time)
  // doesn't make much sense in practice, but we minimally test it
  static constexpr bool kUseSmash = true;
};

// Less exhaustive mix of coverage, but still covering the most stressful case
// (only 50% construction success)
struct AbridgedTypesAndSettings : public DefaultTypesAndSettings {
  static const std::vector<ConstructionFailureChance>& FailureChanceToTest() {
    return kFailureOnly50Pct;
  }
};
struct TypesAndSettings_Result16 : public AbridgedTypesAndSettings {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_Result32 : public AbridgedTypesAndSettings {
  using ResultRow = uint32_t;
};
struct TypesAndSettings_IndexSizeT : public AbridgedTypesAndSettings {
  using Index = size_t;
};
struct TypesAndSettings_Hash32 : public AbridgedTypesAndSettings {
  using Hash = uint32_t;
  static Hash HashFn(const Key& key, Hash raw_seed) {
    // This MurmurHash1 function does not pass tests below without the
    // seed premixing from StandardHasher. In fact, it needs more than
    // just a multiplication mixer on the ordinal seed.
    return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed);
  }
};
struct TypesAndSettings_Hash32_Result16 : public TypesAndSettings_Hash32 {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_KeyString : public AbridgedTypesAndSettings {
  using Key = std::string;
};
struct TypesAndSettings_Seed8 : public AbridgedTypesAndSettings {
  // This is not a generally recommended configuration. With the configured
  // hash function, it would fail with SmallKeyGen due to insufficient
  // independence among the seeds.
  using Seed = uint8_t;
};
struct TypesAndSettings_NoAlwaysOne : public AbridgedTypesAndSettings {
  static constexpr bool kFirstCoeffAlwaysOne = false;
};
struct TypesAndSettings_AllowZeroStarts : public AbridgedTypesAndSettings {
  static constexpr bool kAllowZeroStarts = true;
};
struct TypesAndSettings_Seed64 : public AbridgedTypesAndSettings {
  using Seed = uint64_t;
};
struct TypesAndSettings_Rehasher
    : public StandardRehasherAdapter<AbridgedTypesAndSettings> {
  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher {
  using ResultRow = uint16_t;
};
struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher {
  using ResultRow = uint32_t;
};
struct TypesAndSettings_Rehasher_Seed64
    : public StandardRehasherAdapter<TypesAndSettings_Seed64> {
  using KeyGen = Hash64KeyGenWrapper<StandardKeyGen>;
  // Note: 64-bit seed with Rehasher gives slightly better average reseeds
};
struct TypesAndSettings_Rehasher32
    : public StandardRehasherAdapter<TypesAndSettings_Hash32> {
  using KeyGen = Hash32KeyGenWrapper<StandardKeyGen>;
};
struct TypesAndSettings_Rehasher32_Coeff64
    : public TypesAndSettings_Rehasher32 {
  using CoeffRow = uint64_t;
};
struct TypesAndSettings_SmallKeyGen : public AbridgedTypesAndSettings {
  // SmallKeyGen stresses the independence of different hash seeds
  using KeyGen = SmallKeyGen;
};
struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 {
  // SmallKeyGen stresses the independence of different hash seeds
  using KeyGen = SmallKeyGen;
};
struct TypesAndSettings_Coeff32 : public DefaultTypesAndSettings {
  using CoeffRow = uint32_t;
};
struct TypesAndSettings_Coeff32Smash : public TypesAndSettings_Coeff32 {
  static constexpr bool kUseSmash = true;
};
struct TypesAndSettings_Coeff16 : public DefaultTypesAndSettings {
  using CoeffRow = uint16_t;
};
struct TypesAndSettings_Coeff16Smash : public TypesAndSettings_Coeff16 {
  static constexpr bool kUseSmash = true;
};

using TestTypesAndSettings = ::testing::Types<
    TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash,
    TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash,
    TypesAndSettings_Coeff64Smash0, TypesAndSettings_Coeff128_Homog,
    TypesAndSettings_Coeff128Smash_Homog, TypesAndSettings_Coeff64_Homog,
    TypesAndSettings_Coeff64Smash_Homog, TypesAndSettings_Result16,
    TypesAndSettings_Result32, TypesAndSettings_IndexSizeT,
    TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16,
    TypesAndSettings_KeyString, TypesAndSettings_Seed8,
    TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts,
    TypesAndSettings_Seed64, TypesAndSettings_Rehasher,
    TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32,
    TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32,
    TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen,
    TypesAndSettings_Hash32_SmallKeyGen, TypesAndSettings_Coeff32,
    TypesAndSettings_Coeff32Smash, TypesAndSettings_Coeff16,
    TypesAndSettings_Coeff16Smash>;
TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings);

namespace {

// For testing Poisson-distributed (or similar) statistics, get value for
// `stddevs_allowed` standard deviations above expected mean
// `expected_count`.
// (Poisson approximates Binomial only if probability of a trial being
// in the count is low.)
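// For example, expected_count = 100 has stddev sqrt(100) = 10, so
// PoissonUpperBound(100, 5.0) == 100 + 5 * 10 + 1 == 151.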
uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) {
  return static_cast<uint64_t>(
      expected_count + stddevs_allowed * std::sqrt(expected_count) + 1.0);
}

uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) {
  return static_cast<uint64_t>(std::max(
      0.0, expected_count - stddevs_allowed * std::sqrt(expected_count)));
}

uint64_t FrequentPoissonUpperBound(double expected_count) {
  // Allow up to 5.0 standard deviations for frequently checked statistics
  return PoissonUpperBound(expected_count, 5.0);
}

uint64_t FrequentPoissonLowerBound(double expected_count) {
  return PoissonLowerBound(expected_count, 5.0);
}

uint64_t InfrequentPoissonUpperBound(double expected_count) {
  // Allow up to 3 standard deviations for infrequently checked statistics
  return PoissonUpperBound(expected_count, 3.0);
}

uint64_t InfrequentPoissonLowerBound(double expected_count) {
  return PoissonLowerBound(expected_count, 3.0);
}

}  // namespace

TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;
  using ConfigHelper =
      ROCKSDB_NAMESPACE::ribbon::BandingConfigHelper<TypeParam>;

  if (sizeof(CoeffRow) < 8) {
    ROCKSDB_GTEST_BYPASS("Not fully supported");
    return;
  }

  const auto log2_thoroughness =
      static_cast<uint32_t>(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness));

  // We are going to choose num_to_add using an exponential distribution,
  // so that we have good representation of small-to-medium filters.
  // Here we just pick some reasonable, practical upper bound based on
  // kCoeffBits or option.
  const double log_max_add = std::log(
      FLAGS_max_add > 0 ? FLAGS_max_add
                        : static_cast<uint32_t>(kCoeffBits * kCoeffBits) *
                              std::max(FLAGS_thoroughness, uint32_t{32}));

  // This needs to be enough below the minimum number of slots to get a
  // reasonable number of samples with the minimum number of slots.
  const double log_min_add = std::log(0.66 * SimpleSoln::RoundUpNumSlots(1));

  ASSERT_GT(log_max_add, log_min_add);

  const double diff_log_add = log_max_add - log_min_add;

  for (ConstructionFailureChance cs : TypeParam::FailureChanceToTest()) {
    double expected_reseeds;
    switch (cs) {
      default:
        assert(false);
        FALLTHROUGH_INTENDED;
      case ROCKSDB_NAMESPACE::ribbon::kOneIn2:
        fprintf(stderr, "== Failure: 50 percent\n");
        expected_reseeds = 1.0;
        break;
      case ROCKSDB_NAMESPACE::ribbon::kOneIn20:
        fprintf(stderr, "== Failure: 5 percent\n");
        expected_reseeds = 0.053;
        break;
      case ROCKSDB_NAMESPACE::ribbon::kOneIn1000:
        fprintf(stderr, "== Failure: 1/1000\n");
        expected_reseeds = 0.001;
        break;
    }

    uint64_t total_reseeds = 0;
    uint64_t total_singles = 0;
    uint64_t total_single_failures = 0;
    uint64_t total_batch = 0;
    uint64_t total_batch_successes = 0;
    uint64_t total_fp_count = 0;
    uint64_t total_added = 0;
    uint64_t total_expand_trials = 0;
    uint64_t total_expand_failures = 0;
    double total_expand_overhead = 0.0;

    uint64_t soln_query_nanos = 0;
    uint64_t soln_query_count = 0;
    uint64_t bloom_query_nanos = 0;
    uint64_t isoln_query_nanos = 0;
    uint64_t isoln_query_count = 0;

    // Take different samples if you change thoroughness
    ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness);

    for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) {
      // We are going to choose num_to_add using an exponential distribution
      // as noted above, but instead of randomly choosing them, we generate
      // samples linearly using the golden ratio, which ensures a nice spread
      // even for a small number of samples, and starting with the minimum
      // number of slots to ensure it is tested.
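      // (Stepping by the fractional golden ratio is a classic low-discrepancy
      // sequence: successive multiples of ~0.618 mod 1 spread about as evenly
      // as possible over the unit interval, here scaled to the log range.)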
      double log_add =
          std::fmod(0.6180339887498948482 * diff_log_add * i, diff_log_add) +
          log_min_add;
      uint32_t num_to_add = static_cast<uint32_t>(std::exp(log_add));

      // Most of the time, test the Interleaved solution storage, but when
      // we do we have to make num_slots a multiple of kCoeffBits. So
      // sometimes we want to test without that limitation.
      bool test_interleaved = (i % 7) != 6;

      // Compute num_slots, and re-adjust num_to_add to get as close as
      // possible to next num_slots, to stress that num_slots in terms of
      // construction success. Ensure at least one iteration:
      Index num_slots = Index{0} - 1;
      --num_to_add;
      for (;;) {
        Index next_num_slots = SimpleSoln::RoundUpNumSlots(
            ConfigHelper::GetNumSlots(num_to_add + 1, cs));
        if (test_interleaved) {
          next_num_slots = InterleavedSoln::RoundUpNumSlots(next_num_slots);
          // assert idempotent
          EXPECT_EQ(next_num_slots,
                    InterleavedSoln::RoundUpNumSlots(next_num_slots));
        }
        // assert idempotent with InterleavedSoln::RoundUpNumSlots
        EXPECT_EQ(next_num_slots, SimpleSoln::RoundUpNumSlots(next_num_slots));

        if (next_num_slots > num_slots) {
          break;
        }
        num_slots = next_num_slots;
        ++num_to_add;
      }
      assert(num_slots < Index{0} - 1);
      total_added += num_to_add;

      std::string prefix;
      ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next());

      // Batch that must be added
      std::string added_str = prefix + "added";
      KeyGen keys_begin(added_str, 0);
      KeyGen keys_end(added_str, num_to_add);

      // A couple more that will probably be added
      KeyGen one_more(prefix + "more", 1);
      KeyGen two_more(prefix + "more", 2);

      // Batch that may or may not be added
      uint32_t batch_size =
          static_cast<uint32_t>(2.0 * std::sqrt(num_slots - num_to_add));
      if (batch_size < 10U) {
        batch_size = 0;
      }
      std::string batch_str = prefix + "batch";
      KeyGen batch_begin(batch_str, 0);
      KeyGen batch_end(batch_str, batch_size);

      // Batch never (successfully) added, but used for querying FP rate
      std::string not_str = prefix + "not";
      KeyGen other_keys_begin(not_str, 0);
      KeyGen other_keys_end(not_str, FLAGS_max_check);

      double overhead_ratio = 1.0 * num_slots / num_to_add;
      if (FLAGS_verbose) {
        fprintf(stderr, "Adding(%s) %u / %u Overhead: %g Batch size: %u\n",
                test_interleaved ? "i" : "s", (unsigned)num_to_add,
                (unsigned)num_slots, overhead_ratio, (unsigned)batch_size);
      }

      // Vary bytes for InterleavedSoln to use number of solution columns
      // from 0 to max allowed by ResultRow type (and used by SimpleSoln).
      // Specifically include 0 and max, and otherwise skew toward max.
      uint32_t max_ibytes =
          static_cast<uint32_t>(sizeof(ResultRow) * num_slots);
      size_t ibytes;
      if (i == 0) {
        ibytes = 0;
      } else if (i == 1) {
        ibytes = max_ibytes;
      } else {
        // Skewed toward max: taking the max of two uniform draws makes
        // values near max_ibytes more likely.
        ibytes =
            std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes));
      }
      std::unique_ptr<char[]> idata(new char[ibytes]);
      InterleavedSoln isoln(idata.get(), ibytes);

      SimpleSoln soln;
      Hasher hasher;
      bool first_single;
      bool second_single;
      bool batch_success;
      {
        Banding banding;
        // Traditional solve for a fixed set.
        ASSERT_TRUE(
            banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end));

        Index occupied_count = banding.GetOccupiedCount();
        Index more_added = 0;

        if (TypeParam::kHomogeneous || overhead_ratio < 1.01 ||
            batch_size == 0) {
          // Homogeneous not compatible with backtracking because add
          // doesn't fail. Small overhead ratio too packed to expect more
          first_single = false;
          second_single = false;
          batch_success = false;
        } else {
          // Now to test backtracking, starting with guaranteed fail. By using
          // the keys that will be used to test FP rate, we are then doing an
          // extra check that after backtracking there are no remnants (e.g. in
          // result side of banding) of these entries.
          KeyGen other_keys_too_big_end = other_keys_begin;
          other_keys_too_big_end += num_to_add;
          banding.EnsureBacktrackSize(std::max(num_to_add, batch_size));
          EXPECT_FALSE(banding.AddRangeOrRollBack(other_keys_begin,
                                                  other_keys_too_big_end));
          EXPECT_EQ(occupied_count, banding.GetOccupiedCount());

          // Check that we still have a good chance of adding a couple more
          // individually
          first_single = banding.Add(*one_more);
          second_single = banding.Add(*two_more);
          more_added += (first_single ? 1 : 0) + (second_single ? 1 : 0);
          total_singles += 2U;
          total_single_failures += 2U - more_added;

          // Or as a batch
          batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end);
          ++total_batch;
          if (batch_success) {
            more_added += batch_size;
            ++total_batch_successes;
          }
          EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);
        }

        // Also verify that redundant adds are OK (no effect)
        ASSERT_TRUE(
            banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8)));
        EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added);

        // Now back-substitution
        soln.BackSubstFrom(banding);
        if (test_interleaved) {
          isoln.BackSubstFrom(banding);
        }

        Seed reseeds = banding.GetOrdinalSeed();
        total_reseeds += reseeds;
        EXPECT_LE(reseeds, 8 + log2_thoroughness);
        if (reseeds > log2_thoroughness + 1) {
          fprintf(
              stderr, "%s high reseeds at %u, %u/%u: %u\n",
              reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat",
              static_cast<unsigned>(i), static_cast<unsigned>(num_to_add),
              static_cast<unsigned>(num_slots), static_cast<unsigned>(reseeds));
        }

        if (reseeds > 0) {
          // "Expand" test: given a failed construction, how likely is it to
          // pass with same seed and more slots. At each step, we increase
          // enough to ensure there is at least one shift within each coeff
          // block.
          ++total_expand_trials;
          Index expand_count = 0;
          Index ex_slots = num_slots;
          banding.SetOrdinalSeed(0);
          for (;; ++expand_count) {
            ASSERT_LE(expand_count, log2_thoroughness);
            ex_slots += ex_slots / kCoeffBits;
            if (test_interleaved) {
              ex_slots = InterleavedSoln::RoundUpNumSlots(ex_slots);
            }
            banding.Reset(ex_slots);
            bool success = banding.AddRange(keys_begin, keys_end);
            if (success) {
              break;
            }
          }
          total_expand_failures += expand_count;
          total_expand_overhead += 1.0 * (ex_slots - num_slots) / num_slots;
        }

        hasher.SetOrdinalSeed(reseeds);
      }
      // soln and hasher now independent of Banding object

      // Verify keys added
      KeyGen cur = keys_begin;
      while (cur != keys_end) {
        ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
        ++cur;
      }
      // We (maybe) snuck these in!
      if (first_single) {
        ASSERT_TRUE(soln.FilterQuery(*one_more, hasher));
        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher));
      }
      if (second_single) {
        ASSERT_TRUE(soln.FilterQuery(*two_more, hasher));
        ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher));
      }
      if (batch_success) {
        cur = batch_begin;
        while (cur != batch_end) {
          ASSERT_TRUE(soln.FilterQuery(*cur, hasher));
          ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher));
          ++cur;
        }
      }

      // Check FP rate (depends only on number of result bits == solution
      // columns)
      Index fp_count = 0;
      cur = other_keys_begin;
      {
        ROCKSDB_NAMESPACE::StopWatchNano timer(
            ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
        while (cur != other_keys_end) {
          bool fp = soln.FilterQuery(*cur, hasher);
          fp_count += fp ? 1 : 0;
          ++cur;
        }
        soln_query_nanos += timer.ElapsedNanos();
        soln_query_count += FLAGS_max_check;
      }
      {
        double expected_fp_count = soln.ExpectedFpRate() * FLAGS_max_check;
        // For expected FP rate, also include false positives due to collisions
        // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
        double correction =
            FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add);

        // NOTE: rare violations expected with kHomogeneous
        EXPECT_LE(fp_count,
                  FrequentPoissonUpperBound(expected_fp_count + correction));
        EXPECT_GE(fp_count,
                  FrequentPoissonLowerBound(expected_fp_count + correction));
      }
      total_fp_count += fp_count;

      // And also check FP rate for isoln
      if (test_interleaved) {
        Index ifp_count = 0;
        cur = other_keys_begin;
        ROCKSDB_NAMESPACE::StopWatchNano timer(
            ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
        while (cur != other_keys_end) {
          ifp_count += isoln.FilterQuery(*cur, hasher) ? 1 : 0;
          ++cur;
        }
        isoln_query_nanos += timer.ElapsedNanos();
        isoln_query_count += FLAGS_max_check;
        {
          double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check;
          // For expected FP rate, also include false positives due to
          // collisions in Hash value. (Negligible for 64-bit, can matter for
          // 32-bit.)
          double correction =
              FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add);

          // NOTE: rare violations expected with kHomogeneous
          EXPECT_LE(ifp_count,
                    FrequentPoissonUpperBound(expected_fp_count + correction));

          // FIXME: why sometimes can we slightly "beat the odds"?
          // (0.95 factor should not be needed)
          EXPECT_GE(ifp_count, FrequentPoissonLowerBound(
                                   0.95 * expected_fp_count + correction));
        }
        // Since the bits used in isoln are a subset of the bits used in soln,
        // it cannot have fewer FPs
        EXPECT_GE(ifp_count, fp_count);
      }

      // And compare to Bloom time, for fun
      if (ibytes >= /* minimum Bloom impl bytes*/ 64) {
        Index bfp_count = 0;
        cur = other_keys_begin;
        ROCKSDB_NAMESPACE::StopWatchNano timer(
            ROCKSDB_NAMESPACE::SystemClock::Default().get(), true);
        while (cur != other_keys_end) {
          uint64_t h = hasher.GetHash(*cur);
          uint32_t h1 = ROCKSDB_NAMESPACE::Lower32of64(h);
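          // FastLocalBloomImpl takes two 32-bit hashes; when Hash is only
          // 32 bits there are no upper bits to take, so synthesize h2 from
          // h1 with a golden-ratio multiplicative mix (0x9e3779b9).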
          uint32_t h2 = sizeof(Hash) >= 8 ? ROCKSDB_NAMESPACE::Upper32of64(h)
                                          : h1 * 0x9e3779b9;
          bfp_count +=
              ROCKSDB_NAMESPACE::FastLocalBloomImpl::HashMayMatch(
                  h1, h2, static_cast<uint32_t>(ibytes), 6, idata.get())
                  ? 1
                  : 0;
          ++cur;
        }
        bloom_query_nanos += timer.ElapsedNanos();
        // ensure bfp_count is used
        ASSERT_LT(bfp_count, FLAGS_max_check);
      }
    }

    // "outside" == key not in original set so either negative or false
    // positive
    fprintf(stderr, "Simple outside query, hot, incl hashing, ns/key: %g\n",
            1.0 * soln_query_nanos / soln_query_count);
    fprintf(stderr,
            "Interleaved outside query, hot, incl hashing, ns/key: %g\n",
            1.0 * isoln_query_nanos / isoln_query_count);
    fprintf(stderr, "Bloom outside query, hot, incl hashing, ns/key: %g\n",
            1.0 * bloom_query_nanos / soln_query_count);

    if (TypeParam::kHomogeneous) {
      EXPECT_EQ(total_reseeds, 0U);
    } else {
      double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness;
      fprintf(stderr, "Average re-seeds: %g\n", average_reseeds);
      // Values above were chosen to target around 50% chance of encoding
      // success rate (average of 1.0 re-seeds) or slightly better. But 1.15
      // is also close enough.
      EXPECT_LE(total_reseeds,
                InfrequentPoissonUpperBound(1.15 * expected_reseeds *
                                            FLAGS_thoroughness));
      // Would use 0.85 here instead of 0.75, but
      // TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of
      // sequential keys with a small, cheap hash function. We accept that
      // there are surely inputs that are somewhat bad for this setup, but
      // these somewhat good inputs are probably more likely.
      EXPECT_GE(total_reseeds,
                InfrequentPoissonLowerBound(0.75 * expected_reseeds *
                                            FLAGS_thoroughness));
    }

    if (total_expand_trials > 0) {
      double average_expand_failures =
          1.0 * total_expand_failures / total_expand_trials;
      fprintf(stderr, "Average expand failures, and overhead: %g, %g\n",
              average_expand_failures,
              total_expand_overhead / total_expand_trials);
      // Seems to be a generous allowance
      EXPECT_LE(total_expand_failures,
                InfrequentPoissonUpperBound(1.0 * total_expand_trials));
    } else {
      fprintf(stderr, "Average expand failures: N/A\n");
    }

    if (total_singles > 0) {
      double single_failure_rate = 1.0 * total_single_failures / total_singles;
      fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate);
      // A rough bound (one sided) based on nothing in particular
      double expected_single_failures = 1.0 * total_singles /
                                        (sizeof(CoeffRow) == 16 ? 128
                                         : TypeParam::kUseSmash ? 64
                                                                : 32);
      EXPECT_LE(total_single_failures,
                InfrequentPoissonUpperBound(expected_single_failures));
    }

    if (total_batch > 0) {
      // Counting successes here for Poisson to approximate the Binomial
      // distribution.
      // A rough bound (one sided) based on nothing in particular.
      double expected_batch_successes = 1.0 * total_batch / 2;
      uint64_t lower_bound =
          InfrequentPoissonLowerBound(expected_batch_successes);
      fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n",
              1.0 * total_batch_successes / total_batch,
              1.0 * lower_bound / total_batch);
      EXPECT_GE(total_batch_successes, lower_bound);
    }

    {
      uint64_t total_checked = uint64_t{FLAGS_max_check} * FLAGS_thoroughness;
      double expected_total_fp_count =
          total_checked * std::pow(0.5, 8U * sizeof(ResultRow));
      // For expected FP rate, also include false positives due to collisions
      // in Hash value. (Negligible for 64-bit, can matter for 32-bit.)
      double average_added = 1.0 * total_added / FLAGS_thoroughness;
      expected_total_fp_count +=
          total_checked * ExpectedCollisionFpRate(Hasher(), average_added);

      uint64_t upper_bound =
          InfrequentPoissonUpperBound(expected_total_fp_count);
      uint64_t lower_bound =
          InfrequentPoissonLowerBound(expected_total_fp_count);
      fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n",
              1.0 * total_fp_count / total_checked,
              expected_total_fp_count / total_checked,
              1.0 * upper_bound / total_checked,
              1.0 * lower_bound / total_checked);
      EXPECT_LE(total_fp_count, upper_bound);
      EXPECT_GE(total_fp_count, lower_bound);
    }
  }
}

TYPED_TEST(RibbonTypeParamTest, Extremes) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  size_t bytes = 128 * 1024;
  std::unique_ptr<char[]> buf(new char[bytes]);
  InterleavedSoln isoln(buf.get(), bytes);
  SimpleSoln soln;
  Hasher hasher;
  Banding banding;

  // ########################################
  // Add zero keys to minimal number of slots
  KeyGen begin_and_end("foo", 123);
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
      /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0,
      /* seed mask*/ 0));

  soln.BackSubstFrom(banding);
  isoln.BackSubstFrom(banding);

  // Because there's plenty of memory, we expect the interleaved solution to
  // use maximum supported columns (same as simple solution)
  ASSERT_EQ(isoln.GetUpperNumColumns(), 8U * sizeof(ResultRow));
  ASSERT_EQ(isoln.GetUpperStartBlock(), 0U);

  // Somewhat oddly, we expect same FP rate as if we had essentially filled
  // up the slots.
  KeyGen other_keys_begin("not", 0);
  KeyGen other_keys_end("not", FLAGS_max_check);

  Index fp_count = 0;
  KeyGen cur = other_keys_begin;
  while (cur != other_keys_end) {
    bool isoln_query_result = isoln.FilterQuery(*cur, hasher);
    bool soln_query_result = soln.FilterQuery(*cur, hasher);
    // Solutions are equivalent
    ASSERT_EQ(isoln_query_result, soln_query_result);
    if (!TypeParam::kHomogeneous) {
      // And in fact we only expect an FP when ResultRow is 0
      // (except Homogeneous)
      ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash(
                                       hasher.GetHash(*cur)) == ResultRow{0});
    }
    fp_count += soln_query_result ? 1 : 0;
    ++cur;
  }
  {
    ASSERT_EQ(isoln.ExpectedFpRate(), soln.ExpectedFpRate());
    double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check;
    EXPECT_LE(fp_count, InfrequentPoissonUpperBound(expected_fp_count));
    if (TypeParam::kHomogeneous) {
      // Pseudorandom garbage in Homogeneous filter can "beat the odds" if
      // nothing added
    } else {
      EXPECT_GE(fp_count, InfrequentPoissonLowerBound(expected_fp_count));
    }
  }

  // ######################################################
  // Use zero bytes for interleaved solution (key(s) added)

  // Add one key
  KeyGen key_begin("added", 0);
  KeyGen key_end("added", 1);
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(
      /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0,
      /* seed mask*/ 0));

  InterleavedSoln isoln2(nullptr, /*bytes*/ 0);

  isoln2.BackSubstFrom(banding);

  ASSERT_EQ(isoln2.GetUpperNumColumns(), 0U);
  ASSERT_EQ(isoln2.GetUpperStartBlock(), 0U);

  // All queries return true
  ASSERT_TRUE(isoln2.FilterQuery(*other_keys_begin, hasher));
  ASSERT_EQ(isoln2.ExpectedFpRate(), 1.0);
}

TEST(RibbonTest, AllowZeroStarts) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts);
  IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts);
  using KeyGen = StandardKeyGen;

  InterleavedSoln isoln(nullptr, /*bytes*/ 0);
  SimpleSoln soln;
  Hasher hasher;
  Banding banding;

  KeyGen begin("foo", 0);
  KeyGen end("foo", 1);
  // Can't add 1 entry
  ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end));

  KeyGen begin_and_end("foo", 123);
  // Can add 0 entries
  ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end,
                                              begin_and_end));

  Seed reseeds = banding.GetOrdinalSeed();
  ASSERT_EQ(reseeds, 0U);
  hasher.SetOrdinalSeed(reseeds);

  // Can construct 0-slot solutions
  isoln.BackSubstFrom(banding);
  soln.BackSubstFrom(banding);

  // Should always return false
  ASSERT_FALSE(isoln.FilterQuery(*begin, hasher));
  ASSERT_FALSE(soln.FilterQuery(*begin, hasher));

  // And report that in FP rate
  ASSERT_EQ(isoln.ExpectedFpRate(), 0.0);
  ASSERT_EQ(soln.ExpectedFpRate(), 0.0);
}

TEST(RibbonTest, RawAndOrdinalSeeds) {
  StandardHasher<TypesAndSettings_Seed64> hasher64;
  StandardHasher<DefaultTypesAndSettings> hasher64_32;
  StandardHasher<TypesAndSettings_Hash32> hasher32;
  StandardHasher<TypesAndSettings_Seed8> hasher8;

  for (uint32_t limit : {0xffU, 0xffffU}) {
    std::vector<bool> seen(limit + 1);
    for (uint32_t i = 0; i < limit; ++i) {
      hasher64.SetOrdinalSeed(i);
      auto raw64 = hasher64.GetRawSeed();
      hasher32.SetOrdinalSeed(i);
      auto raw32 = hasher32.GetRawSeed();
      hasher8.SetOrdinalSeed(static_cast<uint8_t>(i));
      auto raw8 = hasher8.GetRawSeed();
      {
        hasher64_32.SetOrdinalSeed(i);
        auto raw64_32 = hasher64_32.GetRawSeed();
        ASSERT_EQ(raw64_32, raw32);  // Same size seed
      }
      if (i == 0) {
        // Documented that ordinal seed 0 == raw seed 0
        ASSERT_EQ(raw64, 0U);
        ASSERT_EQ(raw32, 0U);
        ASSERT_EQ(raw8, 0U);
      } else {
        // Extremely likely that upper bits are set
        ASSERT_GT(raw64, raw32);
        ASSERT_GT(raw32, raw8);
      }
      // Hashers agree on lower bits
      ASSERT_EQ(static_cast<uint32_t>(raw64), raw32);
      ASSERT_EQ(static_cast<uint8_t>(raw32), raw8);

      // The translation is one-to-one for this size prefix
      uint32_t v = static_cast<uint32_t>(raw32 & limit);
      ASSERT_EQ(raw64 & limit, v);
      ASSERT_FALSE(seen[v]);
      seen[v] = true;
    }
  }
}

namespace {

struct PhsfInputGen {
  PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) {
    val_.first = prefix;
    ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0);
  }

  // Prefix (only one required)
  PhsfInputGen& operator++() {
    ++id_;
    return *this;
  }

  const std::pair<std::string, uint8_t>& operator*() {
    // Use multiplication to mix things up a little in the key
    ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8],
                                     id_ * uint64_t{0x1500000001});
    // Occasionally repeat values etc.
    val_.second = static_cast<uint8_t>(id_ * 7 / 8);
    return val_;
  }

  const std::pair<std::string, uint8_t>* operator->() { return &**this; }

  bool operator==(const PhsfInputGen& other) const {
    // Same prefix is assumed
    return id_ == other.id_;
  }
  bool operator!=(const PhsfInputGen& other) const {
    // Same prefix is assumed
    return id_ != other.id_;
  }

  uint64_t id_;
  std::pair<std::string, uint8_t> val_;
};

struct PhsfTypesAndSettings : public DefaultTypesAndSettings {
  static constexpr bool kIsFilter = false;
};

}  // namespace

TEST(RibbonTest, PhsfBasic) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings);
  IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings);

  Index num_slots = 12800;
  Index num_to_add = static_cast<Index>(num_slots / 1.02);

  PhsfInputGen begin("in", 0);
  PhsfInputGen end("in", num_to_add);

  std::unique_ptr<char[]> idata(new char[/*bytes*/ num_slots]);
  InterleavedSoln isoln(idata.get(), /*bytes*/ num_slots);
  SimpleSoln soln;
  Hasher hasher;
  {
    Banding banding;
    ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end));

    soln.BackSubstFrom(banding);
    isoln.BackSubstFrom(banding);

    hasher.SetOrdinalSeed(banding.GetOrdinalSeed());
  }
  for (PhsfInputGen cur = begin; cur != end; ++cur) {
    ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher));
    ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher));
  }
}

// Not a real test, but a tool used to build APIs in ribbon_config.h
TYPED_TEST(RibbonTypeParamTest, FindOccupancy) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  if (!FLAGS_find_occ) {
    ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs");
    return;
  }

  KeyGen cur(std::to_string(testing::UnitTest::GetInstance()->random_seed()),
             0);

  Banding banding;
  Index num_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_min_slots);
  Index max_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_max_slots);
  while (num_slots <= max_slots) {
    std::map<int32_t, uint32_t> rem_histogram;
    std::map<Index, uint32_t> slot_histogram;
    if (FLAGS_find_slot_occ) {
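      // Track occupancy of the first, middle, and last kCoeffBits slots,
      // where behavior near the ends of the slot range can differ from the
      // middle.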
      for (Index i = 0; i < kCoeffBits; ++i) {
        slot_histogram[i] = 0;
        slot_histogram[num_slots - 1 - i] = 0;
        slot_histogram[num_slots / 2 - kCoeffBits / 2 + i] = 0;
      }
    }
    uint64_t total_added = 0;
    for (uint32_t i = 0; i < FLAGS_find_iters; ++i) {
      banding.Reset(num_slots);
      uint32_t j = 0;
      KeyGen end = cur;
      end += num_slots + num_slots / 10;
      for (; cur != end; ++cur) {
        if (banding.Add(*cur)) {
          ++j;
        } else {
          break;
        }
      }
      total_added += j;
      for (auto& slot : slot_histogram) {
        slot.second += banding.IsOccupied(slot.first);
      }

      int32_t bucket =
          static_cast<int32_t>(num_slots) - static_cast<int32_t>(j);
      rem_histogram[bucket]++;
      if (FLAGS_verbose) {
        fprintf(stderr, "num_slots: %u i: %u / %u avg_overhead: %g\r",
                static_cast<unsigned>(num_slots), static_cast<unsigned>(i),
                static_cast<unsigned>(FLAGS_find_iters),
                1.0 * (i + 1) * num_slots / total_added);
      }
    }
    if (FLAGS_verbose) {
      fprintf(stderr, "\n");
    }

    uint32_t cumulative = 0;

    double p50_rem = 0;
    double p95_rem = 0;
    double p99_9_rem = 0;

    for (auto& h : rem_histogram) {
      double before = 1.0 * cumulative / FLAGS_find_iters;
      double not_after = 1.0 * (cumulative + h.second) / FLAGS_find_iters;
      if (FLAGS_verbose) {
        fprintf(stderr, "overhead: %g before: %g not_after: %g\n",
                1.0 * num_slots / (num_slots - h.first), before, not_after);
      }
      cumulative += h.second;
      if (before < 0.5 && 0.5 <= not_after) {
        // fake it with linear interpolation
        double portion = (0.5 - before) / (not_after - before);
        p50_rem = h.first + portion;
      } else if (before < 0.95 && 0.95 <= not_after) {
        // fake it with linear interpolation
        double portion = (0.95 - before) / (not_after - before);
        p95_rem = h.first + portion;
      } else if (before < 0.999 && 0.999 <= not_after) {
        // fake it with linear interpolation
        double portion = (0.999 - before) / (not_after - before);
        p99_9_rem = h.first + portion;
      }
    }
    for (auto& slot : slot_histogram) {
      fprintf(stderr, "slot[%u] occupied: %g\n", (unsigned)slot.first,
              1.0 * slot.second / FLAGS_find_iters);
    }

    double mean_rem =
        (1.0 * FLAGS_find_iters * num_slots - total_added) / FLAGS_find_iters;
    fprintf(
        stderr,
        "num_slots: %u iters: %u mean_ovr: %g p50_ovr: %g p95_ovr: %g "
        "p99.9_ovr: %g mean_rem: %g p50_rem: %g p95_rem: %g p99.9_rem: %g\n",
        static_cast<unsigned>(num_slots),
        static_cast<unsigned>(FLAGS_find_iters),
        1.0 * num_slots / (num_slots - mean_rem),
        1.0 * num_slots / (num_slots - p50_rem),
        1.0 * num_slots / (num_slots - p95_rem),
        1.0 * num_slots / (num_slots - p99_9_rem), mean_rem, p50_rem, p95_rem,
        p99_9_rem);
    num_slots = std::max(
        num_slots + 1, static_cast<Index>(num_slots * FLAGS_find_next_factor));
    num_slots = InterleavedSoln::RoundUpNumSlots(num_slots);
  }
}

// Not a real test, but a tool to understand Homogeneous Ribbon
// behavior (TODO: configuration APIs & tests)
TYPED_TEST(RibbonTypeParamTest, OptimizeHomogAtScale) {
  IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam);
  IMPORT_RIBBON_IMPL_TYPES(TypeParam);
  using KeyGen = typename TypeParam::KeyGen;

  if (!FLAGS_optimize_homog) {
    ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs");
    return;
  }

  if (!TypeParam::kHomogeneous) {
    ROCKSDB_GTEST_BYPASS("Only for Homogeneous Ribbon");
    return;
  }

  KeyGen cur(std::to_string(testing::UnitTest::GetInstance()->random_seed()),
             0);

  Banding banding;
  Index num_slots = SimpleSoln::RoundUpNumSlots(FLAGS_optimize_homog_slots);
  banding.Reset(num_slots);

  // This and "band_ovr" is the "allocated overhead", or slots over added.
  // It does not take into account FP rates.
  double target_overhead = 1.20;
  uint32_t num_added = 0;

  do {
    do {
      (void)banding.Add(*cur);
      ++cur;
      ++num_added;
    } while (1.0 * num_slots / num_added > target_overhead);

    SimpleSoln soln;
    soln.BackSubstFrom(banding);

    std::array<uint32_t, 8U * sizeof(ResultRow)> fp_counts_by_cols;
    fp_counts_by_cols.fill(0U);
    for (uint32_t i = 0; i < FLAGS_optimize_homog_check; ++i) {
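      // Note: the Banding object serves as the Hasher for queries here; it
      // carries the same hashing configuration and seed used during
      // construction.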
      ResultRow r = soln.PhsfQuery(*cur, banding);
      ++cur;
      for (size_t j = 0; j < fp_counts_by_cols.size(); ++j) {
        if ((r & 1) == 1) {
          break;
        }
        fp_counts_by_cols[j]++;
        r /= 2;
      }
    }
    fprintf(stderr, "band_ovr: %g ", 1.0 * num_slots / num_added);
    for (unsigned j = 0; j < fp_counts_by_cols.size(); ++j) {
      double inv_fp_rate =
          1.0 * FLAGS_optimize_homog_check / fp_counts_by_cols[j];
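      // Convert observed inverse FP rate to an equivalent number of solution
      // columns (bits of filtering): 1.4426950409 ~= 1/ln(2), so this is
      // log2(inv_fp_rate).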
      double equiv_cols = std::log(inv_fp_rate) * 1.4426950409;
      // Overhead vs. information-theoretic minimum based on observed
      // FP rate (subject to sampling error, especially for low FP rates)
      double actual_overhead =
          1.0 * (j + 1) * num_slots / (equiv_cols * num_added);
      fprintf(stderr, "ovr_%u: %g ", j + 1, actual_overhead);
    }
    fprintf(stderr, "\n");
    target_overhead -= FLAGS_optimize_homog_granularity;
  } while (target_overhead > 1.0);
}

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
#ifdef GFLAGS
  ParseCommandLineFlags(&argc, &argv, true);
#endif  // GFLAGS
  return RUN_ALL_TESTS();
}