// Copyright (c) 2019-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Implementation details of various Bloom filter implementations used in
// RocksDB. (DynamicBloom is in a separate file for now because it
// supports concurrent write.)
#pragma once

#include <stddef.h>
#include <stdint.h>

#include <cassert>
#include <cmath>

#include "port/port.h"  // for PREFETCH
#include "rocksdb/slice.h"
#include "util/hash.h"

#ifdef __AVX2__
#include <immintrin.h>
#endif
namespace ROCKSDB_NAMESPACE {

class BloomMath {
 public:
  // False positive rate of a standard Bloom filter, for given ratio of
  // filter memory bits to added keys, and number of probes per operation.
  // (The false positive rate is effectively independent of scale, assuming
  // the implementation scales OK.)
  static double StandardFpRate(double bits_per_key, int num_probes) {
    // Standard very-good-estimate formula. See
    // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
    return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
  }
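
  // For example (a rough sanity check, not a tested value): at 10 bits/key
  // and 6 probes this evaluates to (1 - e^(-6/10))^6 =~ 0.0084, i.e. about
  // a 0.84% false positive rate for an unblocked Bloom filter.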

  // False positive rate of a "blocked"/"sharded"/"cache-local" Bloom filter,
  // for given ratio of filter memory bits to added keys, number of probes per
  // operation (all within the given block or cache line size), and block or
  // cache line size.
  static double CacheLocalFpRate(double bits_per_key, int num_probes,
                                 int cache_line_bits) {
    if (bits_per_key <= 0.0) {
      // Fix a discontinuity
      return 1.0;
    }
    double keys_per_cache_line = cache_line_bits / bits_per_key;
    // A reasonable estimate is the average of the FP rates for one standard
    // deviation above and below the mean bucket occupancy. See
    // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math
    double keys_stddev = std::sqrt(keys_per_cache_line);
    double crowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes);
    double uncrowded_fp = StandardFpRate(
        cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes);
    return (crowded_fp + uncrowded_fp) / 2;
  }
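
  // For example (a rough sanity check, not a tested value): at 10 bits/key,
  // 6 probes, and 512-bit cache lines, keys_per_cache_line =~ 51.2 with
  // keys_stddev =~ 7.2, so this averages StandardFpRate at roughly 8.8 and
  // 11.6 effective bits/key: about (1.5% + 0.4%) / 2 =~ 0.95%, in line with
  // the 0.9535% theoretical figure quoted for FastLocalBloomImpl below.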

  // False positive rate of querying a new item against `num_keys` items, all
  // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes
  // themselves are stored losslessly. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) {
    double inv_fingerprint_space = std::pow(0.5, fingerprint_bits);
    // Base estimate assumes each key maps to a unique fingerprint.
    // Could be > 1 in extreme cases.
    double base_estimate = num_keys * inv_fingerprint_space;
    // To account for potential overlap, we choose between two formulas
    if (base_estimate > 0.0001) {
      // A very good formula assuming we don't construct a floating point
      // number extremely close to 1. Always produces a probability < 1.
      return 1.0 - std::exp(-base_estimate);
    } else {
      // A very good formula when base_estimate is far below 1. (Subtract
      // away the integral-approximated sum that some key has same hash as
      // one coming before it in a list.)
      return base_estimate - (base_estimate * base_estimate * 0.5);
    }
  }
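
  // For example (a rough sanity check, not a tested value): querying against
  // 1 million keys hashed to 32 bits gives base_estimate =~ 1e6 / 2^32
  // =~ 0.00023, so roughly a 0.02% additional FP rate; at 40 million keys it
  // grows to roughly 0.9%, which is why the comment on FastLocalBloomImpl
  // below warns against a 32-bit hash at that scale.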

  // Returns the probability of either of two independent(-ish) events
  // happening, given their probabilities. (This is useful for combining
  // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate
  // for a hash-efficient Bloom filter's FP rate. See Section 4 of
  // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf)
  static double IndependentProbabilitySum(double rate1, double rate2) {
    // Use formula that avoids floating point extremely close to 1 if
    // rates are extremely small.
    return rate1 + rate2 - (rate1 * rate2);
  }
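
  // For example (a rough sanity check, not a tested value): combining a
  // ~0.95% cache-local filter rate with a ~0.02% fingerprint rate gives
  // 0.0095 + 0.0002 - (0.0095 * 0.0002) =~ 0.97% overall.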
};

// A fast, flexible, and accurate cache-local Bloom implementation with
// SIMD-optimized query performance (currently using AVX2 on Intel). Write
// performance and non-SIMD read are very good, benefiting from FastRange32
// used in place of % and single-cycle multiplication on recent processors.
//
// Most other SIMD Bloom implementations sacrifice flexibility and/or
// accuracy by requiring num_probes to be a power of two and restricting
// where each probe can occur in a cache line. This implementation sacrifices
// SIMD-optimization for add (might still be possible, especially with AVX512)
// in favor of allowing any num_probes, not crossing cache line boundary,
// and accuracy close to theoretical best accuracy for a cache-local Bloom.
// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket
// (Intel cache line size) is 0.9535% FP rate. This implementation yields
// about 0.957%. (Compare to LegacyLocalityBloomImpl<false> at 1.138%, or
// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.)
//
// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or
// a 64-bit hash (split into two uint32s). With many millions of keys, the
// false positive rate associated with using a 32-bit hash can dominate the
// false positive rate of the underlying filter. At 10 bits/key setting, the
// inflection point is about 40 million keys, so 32-bit hash is a bad idea
// with 10s of millions of keys or more.
//
// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange
// to pick a cache line, which can be faster than 64-bit in some cases.
// This only hurts accuracy as you get into 10s of GB for a single filter,
// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to
// 64-bit fastrange if you need filters so big. ;)
//
// Using only a 32-bit input hash within each cache line has negligible
// impact for any reasonable cache line / bucket size, for arbitrary filter
// size, and potentially saves intermediate data size in some cases vs.
// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic
// to generate indices, I might do the same, as a single multiplication
// suffices to generate a sufficiently mixed 64 bits from 32 bits.)
//
// This implementation is currently tied to Intel cache line size, 64 bytes ==
// 512 bits. If there's sufficient demand for other cache line sizes, this is
// a pretty good implementation to extend, but slight performance enhancements
// are possible with an alternate implementation (probably not very compatible
// with SIMD):
// (1) Use rotation in addition to multiplication for remixing
// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy
// because lower bits never depend on original upper bits.)
// (2) Extract more than one bit index from each re-mix. (Only if rotation
// or similar is part of remix, because otherwise you're making the
// multiplication-only problem worse.)
// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per
// re-mix.
//
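// As an illustrative sketch (not part of this header's API; `h` is a
// hypothetical caller-supplied 64-bit hash), the two 32-bit inputs can be
// derived as follows:
//
//   uint32_t h1 = static_cast<uint32_t>(h);        // lower 32 bits
//   uint32_t h2 = static_cast<uint32_t>(h >> 32);  // upper 32 bits
//   // Or, with only a 32-bit hash h1 (see the FP-rate caveat above):
//   // uint32_t h2 = h1 * uint32_t{0x9e3779b9};
//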
class FastLocalBloomImpl {
 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes,
                                int hash_bits) {
    return BloomMath::IndependentProbabilitySum(
        BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes,
                                    /*cache line bits*/ 512),
        BloomMath::FingerprintFpRate(keys, hash_bits));
  }
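
  // For example (a rough sanity check, not a tested value): 1 million keys,
  // 1.25 MB of filter (10 bits/key), 6 probes, and a 64-bit hash is
  // dominated by the cache-local term, giving roughly 0.95%.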

  static inline int ChooseNumProbes(int millibits_per_key) {
    // Since this implementation can (with AVX2) make up to 8 probes
    // for the same cost, we pick the most accurate num_probes, based
    // on actual tests of the implementation. Note that for higher
    // bits/key, the best choice for cache-local Bloom can be notably
    // smaller than standard bloom, e.g. 9 instead of 11 @ 16 b/k.
    if (millibits_per_key <= 2080) {
      return 1;
    } else if (millibits_per_key <= 3580) {
      return 2;
    } else if (millibits_per_key <= 5100) {
      return 3;
    } else if (millibits_per_key <= 6640) {
      return 4;
    } else if (millibits_per_key <= 8300) {
      return 5;
    } else if (millibits_per_key <= 10070) {
      return 6;
    } else if (millibits_per_key <= 11720) {
      return 7;
    } else if (millibits_per_key <= 14001) {
      // Would be something like <= 13800 but sacrificing *slightly* for
      // more settings using <= 8 probes.
      return 8;
    } else if (millibits_per_key <= 16050) {
      return 9;
    } else if (millibits_per_key <= 18300) {
      return 10;
    } else if (millibits_per_key <= 22001) {
      return 11;
    } else if (millibits_per_key <= 25501) {
      return 12;
    } else if (millibits_per_key > 50000) {
      // Top out at 24 probes (three sets of 8)
      return 24;
    } else {
      // Roughly optimal choices for remaining range
      // e.g.
      // 28000 -> 12, 28001 -> 13
      // 50000 -> 23, 50001 -> 24
      return (millibits_per_key - 1) / 2000 - 1;
    }
  }
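
  // For example, reading off the thresholds above: 10000 millibits/key
  // (10 bits/key) -> 6 probes, 16000 (16 bits/key) -> 9 probes, and
  // 28001 -> 13 probes via the formula in the final branch.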

  static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                             int num_probes, char *data) {
    uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
    AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline void AddHashPrepared(uint32_t h2, int num_probes,
                                     char *data_at_cache_line) {
    uint32_t h = h2;
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7));
    }
  }

  static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
                                 const char *data,
                                 uint32_t /*out*/ *byte_offset) {
    uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
    PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
    *byte_offset = bytes_to_cache_line;
  }
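
  // As an illustrative sketch (not part of this header's API; h1, h2, data,
  // len_bytes, and num_probes come from the caller), the prepared variants
  // let the cache-line fetch overlap with other work:
  //
  //   uint32_t byte_offset;
  //   FastLocalBloomImpl::PrepareHash(h1, len_bytes, data, &byte_offset);
  //   // ... other work while the prefetched cache line arrives ...
  //   bool may_match = FastLocalBloomImpl::HashMayMatchPrepared(
  //       h2, num_probes, data + byte_offset);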

  static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
                                  int num_probes, const char *data) {
    uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
    return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
  }

  static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
                                          const char *data_at_cache_line) {
    uint32_t h = h2;
#ifdef __AVX2__
    int rem_probes = num_probes;

    // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18,
    // etc.} one can insert specialized code for rem_probes <= 2, bypassing
    // the SIMD code in those cases. There is a detectable but minor overhead
    // applied to other values of num_probes (when not statically determined),
    // but smoother performance curve vs. num_probes. But for now, when
    // in doubt, don't add unnecessary code.

    // Powers of 32-bit golden ratio, mod 2**32.
    const __m256i multipliers =
        _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9,
                          0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749);

    for (;;) {
      // Eight copies of hash
      __m256i hash_vector = _mm256_set1_epi32(h);

      // Same effect as repeated multiplication by 0x9e3779b9 thanks to
      // associativity of multiplication.
      hash_vector = _mm256_mullo_epi32(hash_vector, multipliers);

      // Now the top 9 bits of each of the eight 32-bit values in
      // hash_vector are bit addresses for probes within the cache line.
      // While the platform-independent code uses byte addressing (6 bits
      // to pick a byte + 3 bits to pick a bit within a byte), here we work
      // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit
      // within a word) because that works well with AVX2 and is equivalent
      // under little-endian.

      // Shift each right by 28 bits to get 4-bit word addresses.
      const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28);

      // Gather 32-bit values spread over 512 bits by 4-bit address. In
      // essence, we are dereferencing eight pointers within the cache
      // line.
      //
      // Option 1: AVX2 gather (seems to be a little slow - understandable)
      // const __m256i value_vector =
      //     _mm256_i32gather_epi32(static_cast<const int *>(data_at_cache_line),
      //                            word_addresses,
      //                            /*bytes / i32*/ 4);
      // END Option 1

      // Potentially unaligned as we're not *always* cache-aligned -> loadu
      const __m256i *mm_data =
          reinterpret_cast<const __m256i *>(data_at_cache_line);
      __m256i lower = _mm256_loadu_si256(mm_data);
      __m256i upper = _mm256_loadu_si256(mm_data + 1);

      // Option 2: AVX512VL permute hack
      // Only negligibly faster than Option 3, so not yet worth supporting
      // const __m256i value_vector =
      //     _mm256_permutex2var_epi32(lower, word_addresses, upper);
      // END Option 2

      // Option 3: AVX2 permute+blend hack
      // Use lowest three bits to order probing values, as if all from same
      // 256 bit piece.
      lower = _mm256_permutevar8x32_epi32(lower, word_addresses);
      upper = _mm256_permutevar8x32_epi32(upper, word_addresses);
      // Just top 1 bit of address, to select between lower and upper.
      const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31);
      // Finally: the next 8 probed 32-bit values, in probing sequence order.
      const __m256i value_vector =
          _mm256_blendv_epi8(lower, upper, upper_lower_selector);
      // END Option 3

      // We might not need to probe all 8, so build a mask for selecting only
      // what we need. (The k_selector(s) could be pre-computed but that
      // doesn't seem to make a noticeable performance difference.)
      const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
      // Subtract rem_probes from each of those constants
      __m256i k_selector =
          _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes));
      // Negative after subtract -> use/select
      // Keep only high bit (logical shift right each by 31).
      k_selector = _mm256_srli_epi32(k_selector, 31);

      // Strip off the 4 bit word address (shift left)
      __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4);
      // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses.
      bit_addresses = _mm256_srli_epi32(bit_addresses, 27);
      // Build a bit mask
      const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses);

      // Like ((~value_vector) & bit_mask) == 0
      bool match = _mm256_testc_si256(value_vector, bit_mask) != 0;

      // This check first so that it's easy for branch predictor to optimize
      // num_probes <= 8 case, making it free of unpredictable branches.
      if (rem_probes <= 8) {
        return match;
      } else if (!match) {
        return false;
      }
      // otherwise
      // Need another iteration. 0xab25f4c1 == golden ratio to the 8th power
      h *= 0xab25f4c1;
      rem_probes -= 8;
    }
#else
    for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
      // 9-bit address within 512 bit cache line
      int bitpos = h >> (32 - 9);
      if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) {
        return false;
      }
    }
    return true;
#endif
  }
};
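
// As an illustrative sketch (not from this header; the filter size, the
// 10000 millibits/key setting, and the hash inputs h1/h2 are hypothetical),
// building and querying a filter with FastLocalBloomImpl might look like:
//
//   uint32_t len_bytes = ...;  // filter size, a multiple of 64 (cache line)
//   std::unique_ptr<char[]> data(new char[len_bytes]());  // zero-initialized
//   int num_probes =
//       FastLocalBloomImpl::ChooseNumProbes(/*millibits_per_key*/ 10000);
//   // For each key, with h1 picking the cache line and h2 driving probes:
//   FastLocalBloomImpl::AddHash(h1, h2, len_bytes, num_probes, data.get());
//   // Point query:
//   bool may_match = FastLocalBloomImpl::HashMayMatch(h1, h2, len_bytes,
//                                                     num_probes, data.get());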

// A legacy Bloom filter implementation with no locality of probes (slow).
// It uses double hashing to generate a sequence of hash values.
// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have
// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004].
//
// DO NOT REUSE
//
class LegacyNoLocalityBloomImpl {
 public:
  static inline int ChooseNumProbes(int bits_per_key) {
    // We intentionally round down to reduce probing cost a little bit
    int num_probes = static_cast<int>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
    if (num_probes < 1) num_probes = 1;
    if (num_probes > 30) num_probes = 30;
    return num_probes;
  }
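
  // For example: 10 bits/key -> static_cast<int>(10 * 0.69) = 6 probes, and
  // 16 bits/key -> 11 probes (rounding down from 11.04), vs. 9 probes for
  // FastLocalBloomImpl at the same 16 bits/key.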

  static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
                             char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      data[bitpos / 8] |= (1 << (bitpos % 8));
      h += delta;
    }
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
                                  int num_probes, const char *data) {
    const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
    for (int i = 0; i < num_probes; i++) {
      const uint32_t bitpos = h % total_bits;
      if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      h += delta;
    }
    return true;
  }
};

// A legacy Bloom filter implementation with probes local to a single
// cache line (fast). Because SST files might be transported between
// platforms, the cache line size is a parameter rather than hard coded.
// (But if specified as a constant parameter, an optimizing compiler
// should take advantage of that.)
//
// When ExtraRotates is false, this implementation is notably deficient in
// accuracy. Specifically, it uses double hashing with a 1/512 chance of the
// increment being zero (when cache line size is 512 bits). Thus, there's a
// 1/512 chance of probing only one index, which we'd expect to incur about
// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at
// https://github.com/facebook/rocksdb/issues/4120
//
// DO NOT REUSE
//
template <bool ExtraRotates>
class LegacyLocalityBloomImpl {
 private:
  static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) {
    uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h;
    return offset_h % num_lines;
  }

 public:
  // NOTE: this has only been validated to enough accuracy for producing
  // reasonable warnings / user feedback, not for making functional decisions.
  static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) {
    double bits_per_key = 8.0 * bytes / keys;
    double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes,
                                                     /*cache line bits*/ 512);
    if (!ExtraRotates) {
      // Good estimate of impact of flaw in index computation.
      // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key.
      // The + 22 shifts it nicely to fit for lower bits/key.
      filter_rate += 0.1 / (bits_per_key * 0.75 + 22);
    } else {
      // Not yet validated
      assert(false);
    }
    // Always uses 32-bit hash
    double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32);
    return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate);
  }

  static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
                             char *data, int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    char *data_at_offset =
        data + (GetLine(h, num_lines) << log2_cache_line_bytes);
    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      data_at_offset[bitpos / 8] |= (1 << (bitpos % 8));
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
  }

  static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
                                         const char *data,
                                         uint32_t /*out*/ *byte_offset,
                                         int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
    PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */,
             1 /* locality */);
    *byte_offset = b;
  }

  static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
                                  int num_probes, const char *data,
                                  int log2_cache_line_bytes) {
    uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
    return HashMayMatchPrepared(h, num_probes, data + b,
                                log2_cache_line_bytes);
  }

  static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
                                          const char *data_at_offset,
                                          int log2_cache_line_bytes) {
    const int log2_cache_line_bits = log2_cache_line_bytes + 3;

    const uint32_t delta = (h >> 17) | (h << 15);
    for (int i = 0; i < num_probes; ++i) {
      // Mask to bit-within-cache-line address
      const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1);
      if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) {
        return false;
      }
      if (ExtraRotates) {
        h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits));
      }
      h += delta;
    }
    return true;
  }
};

}  // namespace ROCKSDB_NAMESPACE