bloom_test.cc 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912
  1. // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. //
  6. // Copyright (c) 2012 The LevelDB Authors. All rights reserved.
  7. // Use of this source code is governed by a BSD-style license that can be
  8. // found in the LICENSE file. See the AUTHORS file for names of contributors.
  9. #ifndef GFLAGS
  10. #include <cstdio>
  11. int main() {
  12. fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
  13. return 0;
  14. }
  15. #else
  16. #include <array>
  17. #include <cmath>
  18. #include <vector>
  19. #include "logging/logging.h"
  20. #include "memory/arena.h"
  21. #include "rocksdb/filter_policy.h"
  22. #include "table/block_based/filter_policy_internal.h"
  23. #include "test_util/testharness.h"
  24. #include "test_util/testutil.h"
  25. #include "util/gflags_compat.h"
  26. #include "util/hash.h"
  27. using GFLAGS_NAMESPACE::ParseCommandLineFlags;
  28. DEFINE_int32(bits_per_key, 10, "");
  29. namespace ROCKSDB_NAMESPACE {
  30. static const int kVerbose = 1;
  31. static Slice Key(int i, char* buffer) {
  32. std::string s;
  33. PutFixed32(&s, static_cast<uint32_t>(i));
  34. memcpy(buffer, s.c_str(), sizeof(i));
  35. return Slice(buffer, sizeof(i));
  36. }
  37. static int NextLength(int length) {
  38. if (length < 10) {
  39. length += 1;
  40. } else if (length < 100) {
  41. length += 10;
  42. } else if (length < 1000) {
  43. length += 100;
  44. } else {
  45. length += 1000;
  46. }
  47. return length;
  48. }
  49. class BlockBasedBloomTest : public testing::Test {
  50. private:
  51. std::unique_ptr<const FilterPolicy> policy_;
  52. std::string filter_;
  53. std::vector<std::string> keys_;
  54. public:
  55. BlockBasedBloomTest() { ResetPolicy(); }
  56. void Reset() {
  57. keys_.clear();
  58. filter_.clear();
  59. }
  60. void ResetPolicy(double bits_per_key) {
  61. policy_.reset(new BloomFilterPolicy(bits_per_key,
  62. BloomFilterPolicy::kDeprecatedBlock));
  63. Reset();
  64. }
  65. void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); }
  66. void Add(const Slice& s) {
  67. keys_.push_back(s.ToString());
  68. }
  69. void Build() {
  70. std::vector<Slice> key_slices;
  71. for (size_t i = 0; i < keys_.size(); i++) {
  72. key_slices.push_back(Slice(keys_[i]));
  73. }
  74. filter_.clear();
  75. policy_->CreateFilter(&key_slices[0], static_cast<int>(key_slices.size()),
  76. &filter_);
  77. keys_.clear();
  78. if (kVerbose >= 2) DumpFilter();
  79. }
  80. size_t FilterSize() const {
  81. return filter_.size();
  82. }
  83. Slice FilterData() const { return Slice(filter_); }
  84. void DumpFilter() {
  85. fprintf(stderr, "F(");
  86. for (size_t i = 0; i+1 < filter_.size(); i++) {
  87. const unsigned int c = static_cast<unsigned int>(filter_[i]);
  88. for (int j = 0; j < 8; j++) {
  89. fprintf(stderr, "%c", (c & (1 <<j)) ? '1' : '.');
  90. }
  91. }
  92. fprintf(stderr, ")\n");
  93. }
  94. bool Matches(const Slice& s) {
  95. if (!keys_.empty()) {
  96. Build();
  97. }
  98. return policy_->KeyMayMatch(s, filter_);
  99. }
  100. double FalsePositiveRate() {
  101. char buffer[sizeof(int)];
  102. int result = 0;
  103. for (int i = 0; i < 10000; i++) {
  104. if (Matches(Key(i + 1000000000, buffer))) {
  105. result++;
  106. }
  107. }
  108. return result / 10000.0;
  109. }
  110. };
  111. TEST_F(BlockBasedBloomTest, EmptyFilter) {
  112. ASSERT_TRUE(! Matches("hello"));
  113. ASSERT_TRUE(! Matches("world"));
  114. }
  115. TEST_F(BlockBasedBloomTest, Small) {
  116. Add("hello");
  117. Add("world");
  118. ASSERT_TRUE(Matches("hello"));
  119. ASSERT_TRUE(Matches("world"));
  120. ASSERT_TRUE(! Matches("x"));
  121. ASSERT_TRUE(! Matches("foo"));
  122. }
  123. TEST_F(BlockBasedBloomTest, VaryingLengths) {
  124. char buffer[sizeof(int)];
  125. // Count number of filters that significantly exceed the false positive rate
  126. int mediocre_filters = 0;
  127. int good_filters = 0;
  128. for (int length = 1; length <= 10000; length = NextLength(length)) {
  129. Reset();
  130. for (int i = 0; i < length; i++) {
  131. Add(Key(i, buffer));
  132. }
  133. Build();
  134. ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length;
  135. // All added keys must match
  136. for (int i = 0; i < length; i++) {
  137. ASSERT_TRUE(Matches(Key(i, buffer)))
  138. << "Length " << length << "; key " << i;
  139. }
  140. // Check false positive rate
  141. double rate = FalsePositiveRate();
  142. if (kVerbose >= 1) {
  143. fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
  144. rate*100.0, length, static_cast<int>(FilterSize()));
  145. }
  146. ASSERT_LE(rate, 0.02); // Must not be over 2%
  147. if (rate > 0.0125) mediocre_filters++; // Allowed, but not too often
  148. else good_filters++;
  149. }
  150. if (kVerbose >= 1) {
  151. fprintf(stderr, "Filters: %d good, %d mediocre\n",
  152. good_filters, mediocre_filters);
  153. }
  154. ASSERT_LE(mediocre_filters, good_filters/5);
  155. }
  156. // Ensure the implementation doesn't accidentally change in an
  157. // incompatible way
  158. TEST_F(BlockBasedBloomTest, Schema) {
  159. char buffer[sizeof(int)];
  160. ResetPolicy(8); // num_probes = 5
  161. for (int key = 0; key < 87; key++) {
  162. Add(Key(key, buffer));
  163. }
  164. Build();
  165. ASSERT_EQ(BloomHash(FilterData()), 3589896109U);
  166. ResetPolicy(9); // num_probes = 6
  167. for (int key = 0; key < 87; key++) {
  168. Add(Key(key, buffer));
  169. }
  170. Build();
  171. ASSERT_EQ(BloomHash(FilterData()), 969445585);
  172. ResetPolicy(11); // num_probes = 7
  173. for (int key = 0; key < 87; key++) {
  174. Add(Key(key, buffer));
  175. }
  176. Build();
  177. ASSERT_EQ(BloomHash(FilterData()), 1694458207);
  178. ResetPolicy(10); // num_probes = 6
  179. for (int key = 0; key < 87; key++) {
  180. Add(Key(key, buffer));
  181. }
  182. Build();
  183. ASSERT_EQ(BloomHash(FilterData()), 2373646410U);
  184. ResetPolicy(10);
  185. for (int key = /*CHANGED*/ 1; key < 87; key++) {
  186. Add(Key(key, buffer));
  187. }
  188. Build();
  189. ASSERT_EQ(BloomHash(FilterData()), 1908442116);
  190. ResetPolicy(10);
  191. for (int key = 1; key < /*CHANGED*/ 88; key++) {
  192. Add(Key(key, buffer));
  193. }
  194. Build();
  195. ASSERT_EQ(BloomHash(FilterData()), 3057004015U);
  196. // With new fractional bits_per_key, check that we are rounding to
  197. // whole bits per key for old Bloom filters.
  198. ResetPolicy(9.5); // Treated as 10
  199. for (int key = 1; key < 88; key++) {
  200. Add(Key(key, buffer));
  201. }
  202. Build();
  203. ASSERT_EQ(BloomHash(FilterData()), /*SAME*/ 3057004015U);
  204. ResetPolicy(10.499); // Treated as 10
  205. for (int key = 1; key < 88; key++) {
  206. Add(Key(key, buffer));
  207. }
  208. Build();
  209. ASSERT_EQ(BloomHash(FilterData()), /*SAME*/ 3057004015U);
  210. ResetPolicy();
  211. }
  212. // Different bits-per-byte
  213. class FullBloomTest : public testing::TestWithParam<BloomFilterPolicy::Mode> {
  214. private:
  215. BlockBasedTableOptions table_options_;
  216. std::shared_ptr<const FilterPolicy>& policy_;
  217. std::unique_ptr<FilterBitsBuilder> bits_builder_;
  218. std::unique_ptr<FilterBitsReader> bits_reader_;
  219. std::unique_ptr<const char[]> buf_;
  220. size_t filter_size_;
  221. public:
  222. FullBloomTest() : policy_(table_options_.filter_policy), filter_size_(0) {
  223. ResetPolicy();
  224. }
  225. BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() {
  226. // Throws on bad cast
  227. return &dynamic_cast<BuiltinFilterBitsBuilder&>(*bits_builder_);
  228. }
  229. const BloomFilterPolicy* GetBloomFilterPolicy() {
  230. // Throws on bad cast
  231. return &dynamic_cast<const BloomFilterPolicy&>(*policy_);
  232. }
  233. void Reset() {
  234. bits_builder_.reset(BloomFilterPolicy::GetBuilderFromContext(
  235. FilterBuildingContext(table_options_)));
  236. bits_reader_.reset(nullptr);
  237. buf_.reset(nullptr);
  238. filter_size_ = 0;
  239. }
  240. void ResetPolicy(double bits_per_key) {
  241. policy_.reset(new BloomFilterPolicy(bits_per_key, GetParam()));
  242. Reset();
  243. }
  244. void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); }
  245. void Add(const Slice& s) {
  246. bits_builder_->AddKey(s);
  247. }
  248. void OpenRaw(const Slice& s) {
  249. bits_reader_.reset(policy_->GetFilterBitsReader(s));
  250. }
  251. void Build() {
  252. Slice filter = bits_builder_->Finish(&buf_);
  253. bits_reader_.reset(policy_->GetFilterBitsReader(filter));
  254. filter_size_ = filter.size();
  255. }
  256. size_t FilterSize() const {
  257. return filter_size_;
  258. }
  259. Slice FilterData() { return Slice(buf_.get(), filter_size_); }
  260. int GetNumProbesFromFilterData() {
  261. assert(filter_size_ >= 5);
  262. int8_t raw_num_probes = static_cast<int8_t>(buf_.get()[filter_size_ - 5]);
  263. if (raw_num_probes == -1) { // New bloom filter marker
  264. return static_cast<uint8_t>(buf_.get()[filter_size_ - 3]);
  265. } else {
  266. return raw_num_probes;
  267. }
  268. }
  269. bool Matches(const Slice& s) {
  270. if (bits_reader_ == nullptr) {
  271. Build();
  272. }
  273. return bits_reader_->MayMatch(s);
  274. }
  275. // Provides a kind of fingerprint on the Bloom filter's
  276. // behavior, for reasonbly high FP rates.
  277. uint64_t PackedMatches() {
  278. char buffer[sizeof(int)];
  279. uint64_t result = 0;
  280. for (int i = 0; i < 64; i++) {
  281. if (Matches(Key(i + 12345, buffer))) {
  282. result |= uint64_t{1} << i;
  283. }
  284. }
  285. return result;
  286. }
  287. // Provides a kind of fingerprint on the Bloom filter's
  288. // behavior, for lower FP rates.
  289. std::string FirstFPs(int count) {
  290. char buffer[sizeof(int)];
  291. std::string rv;
  292. int fp_count = 0;
  293. for (int i = 0; i < 1000000; i++) {
  294. // Pack four match booleans into each hexadecimal digit
  295. if (Matches(Key(i + 1000000, buffer))) {
  296. ++fp_count;
  297. rv += std::to_string(i);
  298. if (fp_count == count) {
  299. break;
  300. }
  301. rv += ',';
  302. }
  303. }
  304. return rv;
  305. }
  306. double FalsePositiveRate() {
  307. char buffer[sizeof(int)];
  308. int result = 0;
  309. for (int i = 0; i < 10000; i++) {
  310. if (Matches(Key(i + 1000000000, buffer))) {
  311. result++;
  312. }
  313. }
  314. return result / 10000.0;
  315. }
  316. uint32_t SelectByImpl(uint32_t for_legacy_bloom,
  317. uint32_t for_fast_local_bloom) {
  318. switch (GetParam()) {
  319. case BloomFilterPolicy::kLegacyBloom:
  320. return for_legacy_bloom;
  321. case BloomFilterPolicy::kFastLocalBloom:
  322. return for_fast_local_bloom;
  323. case BloomFilterPolicy::kDeprecatedBlock:
  324. case BloomFilterPolicy::kAuto:
  325. /* N/A */;
  326. }
  327. // otherwise
  328. assert(false);
  329. return 0;
  330. }
  331. };
  332. TEST_P(FullBloomTest, FilterSize) {
  333. // In addition to checking the consistency of space computation, we are
  334. // checking that denoted and computed doubles are interpreted as expected
  335. // as bits_per_key values.
  336. bool some_computed_less_than_denoted = false;
  337. // Note: enforced minimum is 1 bit per key (1000 millibits), and enforced
  338. // maximum is 100 bits per key (100000 millibits).
  339. for (auto bpk :
  340. std::vector<std::pair<double, int> >{{-HUGE_VAL, 1000},
  341. {-INFINITY, 1000},
  342. {0.0, 1000},
  343. {1.234, 1234},
  344. {3.456, 3456},
  345. {9.5, 9500},
  346. {10.0, 10000},
  347. {10.499, 10499},
  348. {21.345, 21345},
  349. {99.999, 99999},
  350. {1234.0, 100000},
  351. {HUGE_VAL, 100000},
  352. {INFINITY, 100000},
  353. {NAN, 100000}}) {
  354. ResetPolicy(bpk.first);
  355. auto bfp = GetBloomFilterPolicy();
  356. EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey());
  357. EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey());
  358. double computed = bpk.first;
  359. // This transforms e.g. 9.5 -> 9.499999999999998, which we still
  360. // round to 10 for whole bits per key.
  361. computed += 0.5;
  362. computed /= 1234567.0;
  363. computed *= 1234567.0;
  364. computed -= 0.5;
  365. some_computed_less_than_denoted |= (computed < bpk.first);
  366. ResetPolicy(computed);
  367. bfp = GetBloomFilterPolicy();
  368. EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey());
  369. EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey());
  370. auto bits_builder = GetBuiltinFilterBitsBuilder();
  371. for (int n = 1; n < 100; n++) {
  372. auto space = bits_builder->CalculateSpace(n);
  373. auto n2 = bits_builder->CalculateNumEntry(space);
  374. EXPECT_GE(n2, n);
  375. auto space2 = bits_builder->CalculateSpace(n2);
  376. EXPECT_EQ(space, space2);
  377. }
  378. }
  379. // Check that the compiler hasn't optimized our computation into nothing
  380. EXPECT_TRUE(some_computed_less_than_denoted);
  381. ResetPolicy();
  382. }
  383. TEST_P(FullBloomTest, FullEmptyFilter) {
  384. // Empty filter is not match, at this level
  385. ASSERT_TRUE(!Matches("hello"));
  386. ASSERT_TRUE(!Matches("world"));
  387. }
  388. TEST_P(FullBloomTest, FullSmall) {
  389. Add("hello");
  390. Add("world");
  391. ASSERT_TRUE(Matches("hello"));
  392. ASSERT_TRUE(Matches("world"));
  393. ASSERT_TRUE(!Matches("x"));
  394. ASSERT_TRUE(!Matches("foo"));
  395. }
  396. TEST_P(FullBloomTest, FullVaryingLengths) {
  397. char buffer[sizeof(int)];
  398. // Count number of filters that significantly exceed the false positive rate
  399. int mediocre_filters = 0;
  400. int good_filters = 0;
  401. for (int length = 1; length <= 10000; length = NextLength(length)) {
  402. Reset();
  403. for (int i = 0; i < length; i++) {
  404. Add(Key(i, buffer));
  405. }
  406. Build();
  407. ASSERT_LE(FilterSize(),
  408. (size_t)((length * 10 / 8) + CACHE_LINE_SIZE * 2 + 5));
  409. // All added keys must match
  410. for (int i = 0; i < length; i++) {
  411. ASSERT_TRUE(Matches(Key(i, buffer)))
  412. << "Length " << length << "; key " << i;
  413. }
  414. // Check false positive rate
  415. double rate = FalsePositiveRate();
  416. if (kVerbose >= 1) {
  417. fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
  418. rate*100.0, length, static_cast<int>(FilterSize()));
  419. }
  420. ASSERT_LE(rate, 0.02); // Must not be over 2%
  421. if (rate > 0.0125)
  422. mediocre_filters++; // Allowed, but not too often
  423. else
  424. good_filters++;
  425. }
  426. if (kVerbose >= 1) {
  427. fprintf(stderr, "Filters: %d good, %d mediocre\n",
  428. good_filters, mediocre_filters);
  429. }
  430. ASSERT_LE(mediocre_filters, good_filters/5);
  431. }
  432. namespace {
  433. inline uint32_t SelectByCacheLineSize(uint32_t for64, uint32_t for128,
  434. uint32_t for256) {
  435. (void)for64;
  436. (void)for128;
  437. (void)for256;
  438. #if CACHE_LINE_SIZE == 64
  439. return for64;
  440. #elif CACHE_LINE_SIZE == 128
  441. return for128;
  442. #elif CACHE_LINE_SIZE == 256
  443. return for256;
  444. #else
  445. #error "CACHE_LINE_SIZE unknown or unrecognized"
  446. #endif
  447. }
  448. } // namespace
  449. // Ensure the implementation doesn't accidentally change in an
  450. // incompatible way. This test doesn't check the reading side
  451. // (FirstFPs/PackedMatches) for LegacyBloom because it requires the
  452. // ability to read filters generated using other cache line sizes.
  453. // See RawSchema.
  454. TEST_P(FullBloomTest, Schema) {
  455. char buffer[sizeof(int)];
  456. // Use enough keys so that changing bits / key by 1 is guaranteed to
  457. // change number of allocated cache lines. So keys > max cache line bits.
  458. ResetPolicy(2); // num_probes = 1
  459. for (int key = 0; key < 2087; key++) {
  460. Add(Key(key, buffer));
  461. }
  462. Build();
  463. EXPECT_EQ(GetNumProbesFromFilterData(), 1);
  464. EXPECT_EQ(
  465. BloomHash(FilterData()),
  466. SelectByImpl(SelectByCacheLineSize(1567096579, 1964771444, 2659542661U),
  467. 3817481309U));
  468. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  469. EXPECT_EQ("11,13,17,25,29,30,35,37,45,53", FirstFPs(10));
  470. }
  471. ResetPolicy(3); // num_probes = 2
  472. for (int key = 0; key < 2087; key++) {
  473. Add(Key(key, buffer));
  474. }
  475. Build();
  476. EXPECT_EQ(GetNumProbesFromFilterData(), 2);
  477. EXPECT_EQ(
  478. BloomHash(FilterData()),
  479. SelectByImpl(SelectByCacheLineSize(2707206547U, 2571983456U, 218344685),
  480. 2807269961U));
  481. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  482. EXPECT_EQ("4,15,17,24,27,28,29,53,63,70", FirstFPs(10));
  483. }
  484. ResetPolicy(5); // num_probes = 3
  485. for (int key = 0; key < 2087; key++) {
  486. Add(Key(key, buffer));
  487. }
  488. Build();
  489. EXPECT_EQ(GetNumProbesFromFilterData(), 3);
  490. EXPECT_EQ(
  491. BloomHash(FilterData()),
  492. SelectByImpl(SelectByCacheLineSize(515748486, 94611728, 2436112214U),
  493. 204628445));
  494. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  495. EXPECT_EQ("15,24,29,39,53,87,89,100,103,104", FirstFPs(10));
  496. }
  497. ResetPolicy(8); // num_probes = 5
  498. for (int key = 0; key < 2087; key++) {
  499. Add(Key(key, buffer));
  500. }
  501. Build();
  502. EXPECT_EQ(GetNumProbesFromFilterData(), 5);
  503. EXPECT_EQ(
  504. BloomHash(FilterData()),
  505. SelectByImpl(SelectByCacheLineSize(1302145999, 2811644657U, 756553699),
  506. 355564975));
  507. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  508. EXPECT_EQ("16,60,66,126,220,238,244,256,265,287", FirstFPs(10));
  509. }
  510. ResetPolicy(9); // num_probes = 6
  511. for (int key = 0; key < 2087; key++) {
  512. Add(Key(key, buffer));
  513. }
  514. Build();
  515. EXPECT_EQ(GetNumProbesFromFilterData(), 6);
  516. EXPECT_EQ(
  517. BloomHash(FilterData()),
  518. SelectByImpl(SelectByCacheLineSize(2092755149, 661139132, 1182970461),
  519. 2137566013U));
  520. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  521. EXPECT_EQ("156,367,791,872,945,1015,1139,1159,1265,1435", FirstFPs(10));
  522. }
  523. ResetPolicy(11); // num_probes = 7
  524. for (int key = 0; key < 2087; key++) {
  525. Add(Key(key, buffer));
  526. }
  527. Build();
  528. EXPECT_EQ(GetNumProbesFromFilterData(), 7);
  529. EXPECT_EQ(
  530. BloomHash(FilterData()),
  531. SelectByImpl(SelectByCacheLineSize(3755609649U, 1812694762, 1449142939),
  532. 2561502687U));
  533. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  534. EXPECT_EQ("34,74,130,236,643,882,962,1015,1035,1110", FirstFPs(10));
  535. }
  536. // This used to be 9 probes, but 8 is a better choice for speed,
  537. // especially with SIMD groups of 8 probes, with essentially no
  538. // change in FP rate.
  539. // FP rate @ 9 probes, old Bloom: 0.4321%
  540. // FP rate @ 9 probes, new Bloom: 0.1846%
  541. // FP rate @ 8 probes, new Bloom: 0.1843%
  542. ResetPolicy(14); // num_probes = 8 (new), 9 (old)
  543. for (int key = 0; key < 2087; key++) {
  544. Add(Key(key, buffer));
  545. }
  546. Build();
  547. EXPECT_EQ(GetNumProbesFromFilterData(), SelectByImpl(9, 8));
  548. EXPECT_EQ(
  549. BloomHash(FilterData()),
  550. SelectByImpl(SelectByCacheLineSize(178861123, 379087593, 2574136516U),
  551. 3709876890U));
  552. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  553. EXPECT_EQ("130,240,522,565,989,2002,2526,3147,3543", FirstFPs(9));
  554. }
  555. // This used to be 11 probes, but 9 is a better choice for speed
  556. // AND accuracy.
  557. // FP rate @ 11 probes, old Bloom: 0.3571%
  558. // FP rate @ 11 probes, new Bloom: 0.0884%
  559. // FP rate @ 9 probes, new Bloom: 0.0843%
  560. ResetPolicy(16); // num_probes = 9 (new), 11 (old)
  561. for (int key = 0; key < 2087; key++) {
  562. Add(Key(key, buffer));
  563. }
  564. Build();
  565. EXPECT_EQ(GetNumProbesFromFilterData(), SelectByImpl(11, 9));
  566. EXPECT_EQ(
  567. BloomHash(FilterData()),
  568. SelectByImpl(SelectByCacheLineSize(1129406313, 3049154394U, 1727750964),
  569. 1087138490));
  570. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  571. EXPECT_EQ("3299,3611,3916,6620,7822,8079,8482,8942,10167", FirstFPs(9));
  572. }
  573. ResetPolicy(10); // num_probes = 6, but different memory ratio vs. 9
  574. for (int key = 0; key < 2087; key++) {
  575. Add(Key(key, buffer));
  576. }
  577. Build();
  578. EXPECT_EQ(GetNumProbesFromFilterData(), 6);
  579. EXPECT_EQ(
  580. BloomHash(FilterData()),
  581. SelectByImpl(SelectByCacheLineSize(1478976371, 2910591341U, 1182970461),
  582. 2498541272U));
  583. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  584. EXPECT_EQ("16,126,133,422,466,472,813,1002,1035,1159", FirstFPs(10));
  585. }
  586. ResetPolicy(10);
  587. for (int key = /*CHANGED*/ 1; key < 2087; key++) {
  588. Add(Key(key, buffer));
  589. }
  590. Build();
  591. EXPECT_EQ(GetNumProbesFromFilterData(), 6);
  592. EXPECT_EQ(
  593. BloomHash(FilterData()),
  594. SelectByImpl(SelectByCacheLineSize(4205696321U, 1132081253U, 2385981855U),
  595. 2058382345U));
  596. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  597. EXPECT_EQ("16,126,133,422,466,472,813,1002,1035,1159", FirstFPs(10));
  598. }
  599. ResetPolicy(10);
  600. for (int key = 1; key < /*CHANGED*/ 2088; key++) {
  601. Add(Key(key, buffer));
  602. }
  603. Build();
  604. EXPECT_EQ(GetNumProbesFromFilterData(), 6);
  605. EXPECT_EQ(
  606. BloomHash(FilterData()),
  607. SelectByImpl(SelectByCacheLineSize(2885052954U, 769447944, 4175124908U),
  608. 23699164));
  609. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  610. EXPECT_EQ("16,126,133,422,466,472,813,1002,1035,1159", FirstFPs(10));
  611. }
  612. // With new fractional bits_per_key, check that we are rounding to
  613. // whole bits per key for old Bloom filters but fractional for
  614. // new Bloom filter.
  615. ResetPolicy(9.5);
  616. for (int key = 1; key < 2088; key++) {
  617. Add(Key(key, buffer));
  618. }
  619. Build();
  620. EXPECT_EQ(GetNumProbesFromFilterData(), 6);
  621. EXPECT_EQ(BloomHash(FilterData()),
  622. SelectByImpl(/*SAME*/ SelectByCacheLineSize(2885052954U, 769447944,
  623. 4175124908U),
  624. /*CHANGED*/ 3166884174U));
  625. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  626. EXPECT_EQ(/*CHANGED*/ "126,156,367,444,458,791,813,976,1015,1035",
  627. FirstFPs(10));
  628. }
  629. ResetPolicy(10.499);
  630. for (int key = 1; key < 2088; key++) {
  631. Add(Key(key, buffer));
  632. }
  633. Build();
  634. EXPECT_EQ(GetNumProbesFromFilterData(), SelectByImpl(6, 7));
  635. EXPECT_EQ(BloomHash(FilterData()),
  636. SelectByImpl(/*SAME*/ SelectByCacheLineSize(2885052954U, 769447944,
  637. 4175124908U),
  638. /*CHANGED*/ 4098502778U));
  639. if (GetParam() == BloomFilterPolicy::kFastLocalBloom) {
  640. EXPECT_EQ(/*CHANGED*/ "16,236,240,472,1015,1045,1111,1409,1465,1612",
  641. FirstFPs(10));
  642. }
  643. ResetPolicy();
  644. }
  645. // A helper class for testing custom or corrupt filter bits as read by
  646. // built-in FilterBitsReaders.
  647. struct RawFilterTester {
  648. // Buffer, from which we always return a tail Slice, so the
  649. // last five bytes are always the metadata bytes.
  650. std::array<char, 3000> data_;
  651. // Points five bytes from the end
  652. char* metadata_ptr_;
  653. RawFilterTester() : metadata_ptr_(&*(data_.end() - 5)) {}
  654. Slice ResetNoFill(uint32_t len_without_metadata, uint32_t num_lines,
  655. uint32_t num_probes) {
  656. metadata_ptr_[0] = static_cast<char>(num_probes);
  657. EncodeFixed32(metadata_ptr_ + 1, num_lines);
  658. uint32_t len = len_without_metadata + /*metadata*/ 5;
  659. assert(len <= data_.size());
  660. return Slice(metadata_ptr_ - len_without_metadata, len);
  661. }
  662. Slice Reset(uint32_t len_without_metadata, uint32_t num_lines,
  663. uint32_t num_probes, bool fill_ones) {
  664. data_.fill(fill_ones ? 0xff : 0);
  665. return ResetNoFill(len_without_metadata, num_lines, num_probes);
  666. }
  667. Slice ResetWeirdFill(uint32_t len_without_metadata, uint32_t num_lines,
  668. uint32_t num_probes) {
  669. for (uint32_t i = 0; i < data_.size(); ++i) {
  670. data_[i] = static_cast<char>(0x7b7b >> (i % 7));
  671. }
  672. return ResetNoFill(len_without_metadata, num_lines, num_probes);
  673. }
  674. };
  675. TEST_P(FullBloomTest, RawSchema) {
  676. RawFilterTester cft;
  677. // Two probes, about 3/4 bits set: ~50% "FP" rate
  678. // One 256-byte cache line.
  679. OpenRaw(cft.ResetWeirdFill(256, 1, 2));
  680. EXPECT_EQ(uint64_t{11384799501900898790U}, PackedMatches());
  681. // Two 128-byte cache lines.
  682. OpenRaw(cft.ResetWeirdFill(256, 2, 2));
  683. EXPECT_EQ(uint64_t{10157853359773492589U}, PackedMatches());
  684. // Four 64-byte cache lines.
  685. OpenRaw(cft.ResetWeirdFill(256, 4, 2));
  686. EXPECT_EQ(uint64_t{7123594913907464682U}, PackedMatches());
  687. }
  688. TEST_P(FullBloomTest, CorruptFilters) {
  689. RawFilterTester cft;
  690. for (bool fill : {false, true}) {
  691. // Good filter bits - returns same as fill
  692. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 6, fill));
  693. ASSERT_EQ(fill, Matches("hello"));
  694. ASSERT_EQ(fill, Matches("world"));
  695. // Good filter bits - returns same as fill
  696. OpenRaw(cft.Reset(CACHE_LINE_SIZE * 3, 3, 6, fill));
  697. ASSERT_EQ(fill, Matches("hello"));
  698. ASSERT_EQ(fill, Matches("world"));
  699. // Good filter bits - returns same as fill
  700. // 256 is unusual but legal cache line size
  701. OpenRaw(cft.Reset(256 * 3, 3, 6, fill));
  702. ASSERT_EQ(fill, Matches("hello"));
  703. ASSERT_EQ(fill, Matches("world"));
  704. // Good filter bits - returns same as fill
  705. // 30 should be max num_probes
  706. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 30, fill));
  707. ASSERT_EQ(fill, Matches("hello"));
  708. ASSERT_EQ(fill, Matches("world"));
  709. // Good filter bits - returns same as fill
  710. // 1 should be min num_probes
  711. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 1, fill));
  712. ASSERT_EQ(fill, Matches("hello"));
  713. ASSERT_EQ(fill, Matches("world"));
  714. // Type 1 trivial filter bits - returns true as if FP by zero probes
  715. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 0, fill));
  716. ASSERT_TRUE(Matches("hello"));
  717. ASSERT_TRUE(Matches("world"));
  718. // Type 2 trivial filter bits - returns false as if built from zero keys
  719. OpenRaw(cft.Reset(0, 0, 6, fill));
  720. ASSERT_FALSE(Matches("hello"));
  721. ASSERT_FALSE(Matches("world"));
  722. // Type 2 trivial filter bits - returns false as if built from zero keys
  723. OpenRaw(cft.Reset(0, 37, 6, fill));
  724. ASSERT_FALSE(Matches("hello"));
  725. ASSERT_FALSE(Matches("world"));
  726. // Type 2 trivial filter bits - returns false as 0 size trumps 0 probes
  727. OpenRaw(cft.Reset(0, 0, 0, fill));
  728. ASSERT_FALSE(Matches("hello"));
  729. ASSERT_FALSE(Matches("world"));
  730. // Bad filter bits - returns true for safety
  731. // No solution to 0 * x == CACHE_LINE_SIZE
  732. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 0, 6, fill));
  733. ASSERT_TRUE(Matches("hello"));
  734. ASSERT_TRUE(Matches("world"));
  735. // Bad filter bits - returns true for safety
  736. // Can't have 3 * x == 4 for integer x
  737. OpenRaw(cft.Reset(4, 3, 6, fill));
  738. ASSERT_TRUE(Matches("hello"));
  739. ASSERT_TRUE(Matches("world"));
  740. // Bad filter bits - returns true for safety
  741. // 97 bytes is not a power of two, so not a legal cache line size
  742. OpenRaw(cft.Reset(97 * 3, 3, 6, fill));
  743. ASSERT_TRUE(Matches("hello"));
  744. ASSERT_TRUE(Matches("world"));
  745. // Bad filter bits - returns true for safety
  746. // 65 bytes is not a power of two, so not a legal cache line size
  747. OpenRaw(cft.Reset(65 * 3, 3, 6, fill));
  748. ASSERT_TRUE(Matches("hello"));
  749. ASSERT_TRUE(Matches("world"));
  750. // Bad filter bits - returns false as if built from zero keys
  751. // < 5 bytes overall means missing even metadata
  752. OpenRaw(cft.Reset(-1, 3, 6, fill));
  753. ASSERT_FALSE(Matches("hello"));
  754. ASSERT_FALSE(Matches("world"));
  755. OpenRaw(cft.Reset(-5, 3, 6, fill));
  756. ASSERT_FALSE(Matches("hello"));
  757. ASSERT_FALSE(Matches("world"));
  758. // Dubious filter bits - returns same as fill (for now)
  759. // 31 is not a useful num_probes, nor generated by RocksDB unless directly
  760. // using filter bits API without BloomFilterPolicy.
  761. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 31, fill));
  762. ASSERT_EQ(fill, Matches("hello"));
  763. ASSERT_EQ(fill, Matches("world"));
  764. // Dubious filter bits - returns same as fill (for now)
  765. // Similar, with 127, largest positive char
  766. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 127, fill));
  767. ASSERT_EQ(fill, Matches("hello"));
  768. ASSERT_EQ(fill, Matches("world"));
  769. // Dubious filter bits - returns true (for now)
  770. // num_probes set to 128 / -128, lowest negative char
  771. // NB: Bug in implementation interprets this as negative and has same
  772. // effect as zero probes, but effectively reserves negative char values
  773. // for future use.
  774. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 128, fill));
  775. ASSERT_TRUE(Matches("hello"));
  776. ASSERT_TRUE(Matches("world"));
  777. // Dubious filter bits - returns true (for now)
  778. // Similar, with 255 / -1
  779. OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 255, fill));
  780. ASSERT_TRUE(Matches("hello"));
  781. ASSERT_TRUE(Matches("world"));
  782. }
  783. }
  784. INSTANTIATE_TEST_CASE_P(Full, FullBloomTest,
  785. testing::Values(BloomFilterPolicy::kLegacyBloom,
  786. BloomFilterPolicy::kFastLocalBloom));
  787. } // namespace ROCKSDB_NAMESPACE
  788. int main(int argc, char** argv) {
  789. ::testing::InitGoogleTest(&argc, argv);
  790. ParseCommandLineFlags(&argc, &argv, true);
  791. return RUN_ALL_TESTS();
  792. }
  793. #endif // GFLAGS