ribbon_alg.h 53 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225
  1. // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
  2. // This source code is licensed under both the GPLv2 (found in the
  3. // COPYING file in the root directory) and Apache 2.0 License
  4. // (found in the LICENSE.Apache file in the root directory).
  5. #pragma once
  6. #include <array>
  7. #include <memory>
  8. #include "rocksdb/rocksdb_namespace.h"
  9. #include "util/math128.h"
  10. namespace ROCKSDB_NAMESPACE {
  11. namespace ribbon {
  12. // RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly)
  13. //
  14. // ribbon_alg.h: generic versions of core algorithms.
  15. //
  16. // Ribbon is a Perfect Hash Static Function construction useful as a compact
  17. // static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear
  18. // system construction that approximates a Band Matrix with hashing,
  19. // (b) an incremental, on-the-fly Gaussian Elimination algorithm that is
  20. // remarkably efficient and adaptable at constructing an upper-triangular
  21. // band matrix from a set of band-approximating inputs from (a), and
  22. // (c) a storage layout that is fast and adaptable as a filter.
  23. //
  24. // Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices
  25. // with One Short Random Block per Row, with Applications" by Stefan
  26. // Walzer and Martin Dietzfelbinger ("DW paper")
  27. // (b) developed by Peter C. Dillinger, though not the first on-the-fly
  28. // GE algorithm. See "On the fly Gaussian Elimination for LT codes" by
  29. // Bioglio, Grangetto, Gaeta, and Sereno.
  30. // (c) see "interleaved" solution storage below.
  31. //
  32. // See ribbon_impl.h for high-level behavioral summary. This file focuses
  33. // on the core design details.
  34. //
  35. // ######################################################################
  36. // ################# PHSF -> static filter reduction ####################
  37. //
  38. // A Perfect Hash Static Function is a data structure representing a
  39. // map from anything hashable (a "key") to values of some fixed size.
  40. // Crucially, it is allowed to return garbage values for anything not in
  41. // the original set of map keys, and it is a "static" structure: entries
  42. // cannot be added or deleted after construction. PHSFs representing n
  43. // mappings to b-bit values (assume uniformly distributed) require at least
  44. // n * b bits to represent, or at least b bits per entry. We typically
  45. // describe the compactness of a PHSF by typical bits per entry as some
  46. // function of b. For example, the MWHC construction (k=3 "peeling")
  47. // requires about 1.0222*b and a variant called Xor+ requires about
  48. // 1.08*b + 0.5 bits per entry.
  49. //
  50. // With more hashing, a PHSF can over-approximate a set as a Bloom filter
  51. // does, with no FN queries and predictable false positive (FP) query
  52. // rate. Instead of the user providing a value to map each input key to,
  53. // a hash function provides the value. Keys in the original set will
  54. // return a positive membership query because the underlying PHSF returns
  55. // the same value as hashing the key. When a key is not in the original set,
  56. // the PHSF returns a "garbage" value, which is only equal to the key's
  57. // hash with (false positive) probability 1 in 2^b.
  58. //
  59. // For a matching false positive rate, standard Bloom filters require
  60. // 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h)
  61. // require a bit more, around 1.5*b bits per entry. Thus, a Bloom
  62. // alternative could save up to or nearly 1/3rd of memory and storage
  63. // that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter
  64. // is dynamic.)
  65. //
  66. // Recommended reading:
  67. // "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters"
  68. // by Graf and Lemire
  69. // First three sections of "Fast Scalable Construction of (Minimal
  70. // Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna
  71. //
  72. // ######################################################################
  73. // ################## PHSF vs. hash table vs. Bloom #####################
  74. //
  75. // You can think of traditional hash tables and related filter variants
  76. // such as Cuckoo filters as utilizing an "OR" construction: a hash
  77. // function associates a key with some slots and the data is returned if
  78. // the data is found in any one of those slots. The collision resolution
  79. // is visible in the final data structure and requires extra information.
  80. // For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and
  81. // Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data
  82. // structure associates each input key with data in one slot, the
  83. // structure implicitly constructs a (near-)minimal (near-)perfect hash
  84. // (MPH) of the keys, which requires at least 1.44 bits per key to
  85. // represent. This is why approaches with visible collision resolution
  86. // have a fixed + 1.5 or more in storage overhead per entry, often in
  87. // addition to an overhead multiplier on b.
  88. //
  89. // By contrast Bloom filters utilize an "AND" construction: a query only
  90. // returns true if all bit positions associated with a key are set to 1.
  91. // There is no collision resolution, so Bloom filters do not suffer a
  92. // fixed bits per entry overhead like the above structures.
  93. //
  94. // PHSFs typically use a bitwise XOR construction: the data you want is
  95. // not in a single slot, but in a linear combination of several slots.
  96. // For static data, this gives the best of "AND" and "OR" constructions:
  97. // avoids the +1.44 or more fixed overhead by not approximating a MPH and
  98. // can do much better than Bloom's 1.44 factor on b with collision
  99. // resolution, which here is done ahead of time and invisible at query
  100. // time.
  101. //
  102. // ######################################################################
  103. // ######################## PHSF construction ###########################
  104. //
  105. // For a typical PHSF, construction is solving a linear system of
  106. // equations, typically in GF(2), which is to say that values are boolean
  107. // and XOR serves both as addition and subtraction. We can use matrices to
  108. // represent the problem:
  109. //
  110. // C * S = R
  111. // (n x m) (m x b) (n x b)
  112. // where C = coefficients, S = solution, R = results
  113. // and solving for S given C and R.
  114. //
  115. // Note that C and R each have n rows, one for each input entry for the
  116. // PHSF. A row in C is given by a hash function on the PHSF input key,
  117. // and the corresponding row in R is the b-bit value to associate with
  118. // that input key. (In a filter, rows of R are given by another hash
  119. // function on the input key.)
  120. //
  121. // On solving, the matrix S (solution) is the final PHSF data, as it
  122. // maps any row from the original C to its corresponding desired result
  123. // in R. We just have to hash our query inputs and compute a linear
  124. // combination of rows in S.
  125. //
  126. // In theory, we could chose m = n and let a hash function associate
  127. // each input key with random rows in C. A solution exists with high
  128. // probability, and uses essentially minimum space, b bits per entry
  129. // (because we set m = n) but this has terrible scaling, something
  130. // like O(n^2) space and O(n^3) time during construction (Gaussian
  131. // elimination) and O(n) query time. But computational efficiency is
  132. // key, and the core of this is avoiding scanning all of S to answer
  133. // each query.
  134. //
  135. // The traditional approach (MWHC, aka Xor filter) starts with setting
  136. // only some small fixed number of columns (typically k=3) to 1 for each
  137. // row of C, with remaining entries implicitly 0. This is implemented as
  138. // three hash functions over [0,m), and S can be implemented as a vector
  139. // of b-bit values. Now, a query only involves looking up k rows
  140. // (values) in S and computing their bitwise XOR. Additionally, this
  141. // construction can use a linear time algorithm called "peeling" for
  142. // finding a solution in many cases of one existing, but peeling
  143. // generally requires a larger space overhead factor in the solution
  144. // (m/n) than is required with Gaussian elimination.
  145. //
  146. // Recommended reading:
  147. // "Peeling Close to the Orientability Threshold - Spatial Coupling in
  148. // Hashing-Based Data Structures" by Stefan Walzer
  149. //
  150. // ######################################################################
  151. // ##################### Ribbon PHSF construction #######################
  152. //
  153. // Ribbon constructs coefficient rows essentially the same as in the
  154. // Walzer/Dietzfelbinger paper cited above: for some chosen fixed width
  155. // r (kCoeffBits in code), each key is hashed to a starting column in
  156. // [0, m - r] (GetStart() in code) and an r-bit sequence of boolean
  157. // coefficients (GetCoeffRow() in code). If you sort the rows by start,
  158. // the C matrix would look something like this:
  159. //
  160. // [####00000000000000000000]
  161. // [####00000000000000000000]
  162. // [000####00000000000000000]
  163. // [0000####0000000000000000]
  164. // [0000000####0000000000000]
  165. // [000000000####00000000000]
  166. // [000000000####00000000000]
  167. // [0000000000000####0000000]
  168. // [0000000000000000####0000]
  169. // [00000000000000000####000]
  170. // [00000000000000000000####]
  171. //
  172. // where each # could be a 0 or 1, chosen uniformly by a hash function.
  173. // (Except we typically set the start column value to 1.) This scheme
  174. // uses hashing to approximate a band matrix, and it has a solution iff
  175. // it reduces to an upper-triangular boolean r-band matrix, like this:
  176. //
  177. // [1###00000000000000000000]
  178. // [01##00000000000000000000]
  179. // [000000000000000000000000]
  180. // [0001###00000000000000000]
  181. // [000000000000000000000000]
  182. // [000001##0000000000000000]
  183. // [000000000000000000000000]
  184. // [00000001###0000000000000]
  185. // [000000001###000000000000]
  186. // [0000000001##000000000000]
  187. // ...
  188. // [00000000000000000000001#]
  189. // [000000000000000000000001]
  190. //
  191. // where we have expanded to an m x m matrix by filling with rows of
  192. // all zeros as needed. As in Gaussian elimination, this form is ready for
  193. // generating a solution through back-substitution.
  194. //
  195. // The awesome thing about the Ribbon construction (from the DW paper) is
  196. // how row reductions keep each row representable as a start column and
  197. // r coefficients, because row reductions are only needed when two rows
  198. // have the same number of leading zero columns. Thus, the combination
  199. // of those rows, the bitwise XOR of the r-bit coefficient rows, cancels
  200. // out the leading 1s, so starts (at least) one column later and only
  201. // needs (at most) r - 1 coefficients.
  202. //
  203. // ######################################################################
  204. // ###################### Ribbon PHSF scalability #######################
  205. //
  206. // Although more practical detail is in ribbon_impl.h, it's worth
  207. // understanding some of the overall benefits and limitations of the
  208. // Ribbon PHSFs.
  209. //
  210. // High-end scalability is a primary issue for Ribbon PHSFs, because in
  211. // a single Ribbon linear system with fixed r and fixed m/n ratio, the
  212. // solution probability approaches zero as n approaches infinity.
  213. // For a given n, solution probability improves with larger r and larger
  214. // m/n.
  215. //
  216. // By contrast, peeling-based PHSFs have somewhat worse storage ratio
  217. // or solution probability for small n (less than ~1000). This is
  218. // especially true with spatial-coupling, where benefits are only
  219. // notable for n on the order of 100k or 1m or more.
  220. //
  221. // To make best use of current hardware, r=128 seems to be closest to
  222. // a "generally good" choice for Ribbon, at least in RocksDB where SST
  223. // Bloom filters typically hold around 10-100k keys, and almost always
  224. // less than 10m keys. r=128 ribbon has a high chance of encoding success
  225. // (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05)
  226. // for roughly 10k - 10m keys in a single linear system. r=64 only scales
  227. // up to about 10k keys with the same storage overhead. Construction and
  228. // access times for r=128 are similar to r=64. r=128 tracks nearly
  229. // twice as much data during construction, but in most cases we expect
  230. // the scalability benefits of r=128 vs. r=64 to make it preferred.
  231. //
  232. // A natural approach to scaling Ribbon beyond ~10m keys is splitting
  233. // (or "sharding") the inputs into multiple linear systems with their
  234. // own hash seeds. This can also help to control peak memory consumption.
  235. // TODO: much more to come
  236. //
  237. // ######################################################################
  238. // #################### Ribbon on-the-fly banding #######################
  239. //
  240. // "Banding" is what we call the process of reducing the inputs to an
  241. // upper-triangular r-band matrix ready for finishing a solution with
  242. // back-substitution. Although the DW paper presents an algorithm for
  243. // this ("SGauss"), the awesome properties of their construction enable
  244. // an even simpler, faster, and more backtrackable algorithm. In simplest
  245. // terms, the SGauss algorithm requires sorting the inputs by start
  246. // columns, but it's possible to make Gaussian elimination resemble hash
  247. // table insertion!
  248. //
  249. // The enhanced algorithm is based on these observations:
  250. // - When processing a coefficient row with first 1 in column j,
  251. // - If it's the first at column j to be processed, it can be part of
  252. // the banding at row j. (And that decision never overwritten, with
  253. // no loss of generality!)
  254. // - Else, it can be combined with existing row j and re-processed,
  255. // which will look for a later "empty" row or reach "no solution".
  256. //
  257. // We call our banding algorithm "incremental" and "on-the-fly" because
  258. // (like hash table insertion) we are "finished" after each input
  259. // processed, with respect to all inputs processed so far. Although the
  260. // band matrix is an intermediate step to the solution structure, we have
  261. // eliminated intermediate steps and unnecessary data tracking for
  262. // banding.
  263. //
  264. // Building on "incremental" and "on-the-fly", the banding algorithm is
  265. // easily backtrackable because no (non-empty) rows are overwritten in
  266. // the banding. Thus, if we want to "try" adding an additional set of
  267. // inputs to the banding, we only have to record which rows were written
  268. // in order to efficiently backtrack to our state before considering
  269. // the additional set. (TODO: how this can mitigate scalability and
  270. // reach sub-1% overheads)
  271. //
  272. // Like in a linear-probed hash table, as the occupancy approaches and
  273. // surpasses 90-95%, collision resolution dominates the construction
  274. // time. (Ribbon doesn't usually pay at query time; see solution
  275. // storage below.) This means that we can speed up construction time
  276. // by using a higher m/n ratio, up to negative returns around 1.2.
  277. // At m/n ~= 1.2, which still saves memory substantially vs. Bloom
  278. // filter's 1.5, construction speed (including back-substitution) is not
  279. // far from sorting speed, but still a few times slower than cache-local
  280. // Bloom construction speed.
  281. //
  282. // Back-substitution from an upper-triangular boolean band matrix is
  283. // especially fast and easy. All the memory accesses are sequential or at
  284. // least local, no random. If the number of result bits (b) is a
  285. // compile-time constant, the back-substitution state can even be tracked
  286. // in CPU registers. Regardless of the solution representation, we prefer
  287. // column-major representation for tracking back-substitution state, as
  288. // r (the band width) will typically be much larger than b (result bits
  289. // or columns), so better to handle r-bit values b times (per solution
  290. // row) than b-bit values r times.
  291. //
  292. // ######################################################################
  293. // ##################### Ribbon solution storage ########################
  294. //
  295. // Row-major layout is typical for boolean (bit) matrices, including for
  296. // MWHC (Xor) filters where a query combines k b-bit values, and k is
  297. // typically smaller than b. Even for k=4 and b=2, at least k=4 random
  298. // look-ups are required regardless of layout.
  299. //
  300. // Ribbon PHSFs are quite different, however, because
  301. // (a) all of the solution rows relevant to a query are within a single
  302. // range of r rows, and
  303. // (b) the number of solution rows involved (r/2 on average, or r if
  304. // avoiding conditional accesses) is typically much greater than
  305. // b, the number of solution columns.
  306. //
  307. // Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead
  308. // by processing (up to) r entries of b bits each, where b is typically
  309. // less than 10 for filter applications.
  310. //
  311. // Column-major layout has poor locality because of accessing up to b
  312. // memory locations in different pages (and obviously cache lines). Note
  313. // that negative filter queries do not typically need to access all
  314. // solution columns, as they can return when a mismatch is found in any
  315. // result/solution column. This optimization doesn't always pay off on
  316. // recent hardware, where the penalty for unpredictable conditional
  317. // branching can exceed the penalty for unnecessary work, but the
  318. // optimization is essentially unavailable with row-major layout.
  319. //
  320. // The best compromise seems to be interleaving column-major on the small
  321. // scale with row-major on the large scale. For example, let a solution
  322. // "block" be r rows column-major encoded as b r-bit values in sequence.
  323. // Each query accesses (up to) 2 adjacent blocks, which will typically
  324. // span 1-3 cache lines in adjacent memory. We get very close to the same
  325. // locality as row-major, but with much faster reconstruction of each
  326. // result column, at least for filter applications where b is relatively
  327. // small and negative queries can return early.
  328. //
  329. // ######################################################################
  330. // ###################### Fractional result bits ########################
  331. //
  332. // Bloom filters have great flexibility that alternatives mostly do not
  333. // have. One of those flexibilities is in utilizing any ratio of data
  334. // structure bits per key. With a typical memory allocator like jemalloc,
  335. // this flexibility can save roughly 10% of the filters' footprint in
  336. // DRAM by rounding up and down filter sizes to minimize memory internal
  337. // fragmentation (see optimize_filters_for_memory RocksDB option).
  338. //
  339. // At first glance, PHSFs only offer a whole number of bits per "slot"
  340. // (m rather than number of keys n), but coefficient locality in the
  341. // Ribbon construction makes fractional bits/key quite possible and
  342. // attractive for filter applications. This works by a prefix of the
  343. // structure using b-1 solution columns and the rest using b solution
  344. // columns. See InterleavedSolutionStorage below for more detail.
  345. //
  346. // Because false positive rates are non-linear in bits/key, this approach
  347. // is not quite optimal in terms of information theory. In common cases,
  348. // we see additional space overhead up to about 1.5% vs. theoretical
  349. // optimal to achieve the same FP rate. We consider this a quite acceptable
  350. // overhead for very efficiently utilizing space that might otherwise be
  351. // wasted.
  352. //
  353. // This property of Ribbon even makes it "elastic." A Ribbon filter and
  354. // its small metadata for answering queries can be adapted into another
  355. // Ribbon filter filling any smaller multiple of r bits (plus small
  356. // metadata), with a correspondingly higher FP rate. None of the data
  357. // thrown away during construction needs to be recalled for this reduction.
  358. // Similarly a single Ribbon construction can be separated (by solution
  359. // column) into two or more structures (or "layers" or "levels") with
  360. // independent filtering ability (no FP correlation, just as solution or
  361. // result columns in a single structure) despite being constructed as part
  362. // of a single linear system. (TODO: implement)
  363. // See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards
  364. // Efficient Read for LSM-tree-based KV Stores."
  365. //
  366. // ######################################################################
  367. // ################### CODE: Ribbon core algorithms #####################
  368. // ######################################################################
  369. //
  370. // These algorithms are templatized for genericity but near-maximum
  371. // performance in a given application. The template parameters
  372. // adhere to informal class/struct type concepts outlined below. (This
  373. // code is written for C++11 so does not use formal C++ concepts.)
  374. // Rough architecture for these algorithms:
  375. //
  376. // +-----------+ +---+ +-----------------+
  377. // | AddInputs | --> | H | --> | BandingStorage |
  378. // +-----------+ | a | +-----------------+
  379. // | s | |
  380. // | h | Back substitution
  381. // | e | V
  382. // +-----------+ | r | +-----------------+
  383. // | Query Key | --> | | >+< | SolutionStorage |
  384. // +-----------+ +---+ | +-----------------+
  385. // V
  386. // Query result
  387. // Common to other concepts
  388. // concept RibbonTypes {
  389. // // An unsigned integer type for an r-bit subsequence of coefficients.
  390. // // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would
  391. // // generally only hurt scalability to leave bits of CoeffRow unused.
  392. // typename CoeffRow;
  393. // // An unsigned integer type big enough to hold a result row (b bits,
  394. // // or number of solution/result columns).
  395. // // In many applications, especially filters, the number of result
  396. // // columns is decided at run time, so ResultRow simply needs to be
  397. // // big enough for the largest number of columns allowed.
  398. // typename ResultRow;
  399. // // An unsigned integer type sufficient for representing the number of
  400. // // rows in the solution structure, and at least the arithmetic
  401. // // promotion size (usually 32 bits). uint32_t recommended because a
  402. // // single Ribbon construction doesn't really scale to billions of
  403. // // entries.
  404. // typename Index;
  405. // };
  406. // ######################################################################
  407. // ######################## Hashers and Banding #########################
  408. // Hasher concepts abstract out hashing details.
  409. // concept PhsfQueryHasher extends RibbonTypes {
  410. // // Type for a lookup key, which is hashable.
  411. // typename Key;
  412. //
  413. // // Type for hashed summary of a Key. uint64_t is recommended.
  414. // typename Hash;
  415. //
  416. // // Compute a hash value summarizing a Key
  417. // Hash GetHash(const Key &) const;
  418. //
  419. // // Given a hash value and a number of columns that can start an
  420. // // r-sequence of coefficients (== m - r + 1), return the start
  421. // // column to associate with that hash value. (Starts can be chosen
  422. // // uniformly or "smash" extra entries into the beginning and end for
  423. // // better utilization at those extremes of the structure. Details in
  424. // // ribbon.impl.h)
  425. // Index GetStart(Hash, Index num_starts) const;
  426. //
  427. // // Given a hash value, return the r-bit sequence of coefficients to
  428. // // associate with it. It's generally OK if
  429. // // sizeof(CoeffRow) > sizeof(Hash)
  430. // // as long as the hash itself is not too prone to collisions for the
  431. // // applications and the CoeffRow is generated uniformly from
  432. // // available hash data, but relatively independent of the start.
  433. // //
  434. // // Must be non-zero, because that's required for a solution to exist
  435. // // when mapping to non-zero result row. (Note: BandingAdd could be
  436. // // modified to allow 0 coeff row if that only occurs with 0 result
  437. // // row, which really only makes sense for filter implementation,
  438. // // where both values are hash-derived. Or BandingAdd could reject 0
  439. // // coeff row, forcing next seed, but that has potential problems with
  440. // // generality/scalability.)
  441. // CoeffRow GetCoeffRow(Hash) const;
  442. // };
  443. // concept FilterQueryHasher extends PhsfQueryHasher {
  444. // // For building or querying a filter, this returns the expected
  445. // // result row associated with a hashed input. For general PHSF,
  446. // // this must return 0.
  447. // //
  448. // // Although not strictly required, there's a slightly better chance of
  449. // // solver success if result row is masked down here to only the bits
  450. // // actually needed.
  451. // ResultRow GetResultRowFromHash(Hash) const;
  452. // }
  453. // concept BandingHasher extends FilterQueryHasher {
  454. // // For a filter, this will generally be the same as Key.
  455. // // For a general PHSF, it must either
  456. // // (a) include a key and a result it maps to (e.g. in a std::pair), or
  457. // // (b) GetResultRowFromInput looks up the result somewhere rather than
  458. // // extracting it.
  459. // typename AddInput;
  460. //
  461. // // Instead of requiring a way to extract a Key from an
  462. // // AddInput, we require getting the hash of the Key part
  463. // // of an AddInput, which is trivial if AddInput == Key.
  464. // Hash GetHash(const AddInput &) const;
  465. //
  466. // // For building a non-filter PHSF, this extracts or looks up the result
  467. // // row to associate with an input. For filter PHSF, this must return 0.
  468. // ResultRow GetResultRowFromInput(const AddInput &) const;
  469. //
  470. // // Whether the solver can assume the lowest bit of GetCoeffRow is
  471. // // always 1. When true, it should improve solver efficiency slightly.
  472. // static bool kFirstCoeffAlwaysOne;
  473. // }
  474. // Abstract storage for the the result of "banding" the inputs (Gaussian
  475. // elimination to an upper-triangular boolean band matrix). Because the
  476. // banding is an incremental / on-the-fly algorithm, this also represents
  477. // all the intermediate state between input entries.
  478. //
  479. // concept BandingStorage extends RibbonTypes {
  480. // // Tells the banding algorithm to prefetch memory associated with
  481. // // the next input before processing the current input. Generally
  482. // // recommended iff the BandingStorage doesn't easily fit in CPU
  483. // // cache.
  484. // bool UsePrefetch() const;
  485. //
  486. // // Prefetches (e.g. __builtin_prefetch) memory associated with a
  487. // // slot index i.
  488. // void Prefetch(Index i) const;
  489. //
  490. // // Load or store CoeffRow and ResultRow for slot index i.
  491. // // (Gaussian row operations involve both sides of the equation.)
  492. // // Bool `for_back_subst` indicates that customizing values for
  493. // // unconstrained solution rows (cr == 0) is allowed.
  494. // void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst)
  495. // const;
  496. // void StoreRow(Index i, CoeffRow cr, ResultRow rr);
  497. //
  498. // // Returns the number of columns that can start an r-sequence of
  499. // // coefficients, which is the number of slots minus r (kCoeffBits)
  500. // // plus one. (m - r + 1)
  501. // Index GetNumStarts() const;
  502. // };
  503. // Optional storage for backtracking data in banding a set of input
  504. // entries. It exposes an array structure which will generally be
  505. // used as a stack. It must be able to accommodate as many entries
  506. // as are passed in as inputs to `BandingAddRange`.
  507. //
  508. // concept BacktrackStorage extends RibbonTypes {
  509. // // If false, backtracking support will be disabled in the algorithm.
  510. // // This should preferably be an inline compile-time constant function.
  511. // bool UseBacktrack() const;
  512. //
  513. // // Records `to_save` as the `i`th backtrack entry
  514. // void BacktrackPut(Index i, Index to_save);
  515. //
  516. // // Recalls the `i`th backtrack entry
  517. // Index BacktrackGet(Index i) const;
  518. // }
  519. // Adds a single entry to BandingStorage (and optionally, BacktrackStorage),
  520. // returning true if successful or false if solution is impossible with
  521. // current hasher (and presumably its seed) and number of "slots" (solution
  522. // or banding rows). (A solution is impossible when there is a linear
  523. // dependence among the inputs that doesn't "cancel out".)
  524. //
  525. // Pre- and post-condition: the BandingStorage represents a band matrix
  526. // ready for back substitution (row echelon form except for zero rows),
  527. // augmented with result values such that back substitution would give a
  528. // solution satisfying all the cr@start -> rr entries added.
  529. template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
  530. typename BacktrackStorage>
  531. bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
  532. typename BandingStorage::ResultRow rr,
  533. typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
  534. typename BandingStorage::Index *backtrack_pos) {
  535. using CoeffRow = typename BandingStorage::CoeffRow;
  536. using ResultRow = typename BandingStorage::ResultRow;
  537. using Index = typename BandingStorage::Index;
  538. Index i = start;
  539. if (!kFirstCoeffAlwaysOne) {
  540. // Requires/asserts that cr != 0
  541. int tz = CountTrailingZeroBits(cr);
  542. i += static_cast<Index>(tz);
  543. cr >>= tz;
  544. }
  545. for (;;) {
  546. assert((cr & 1) == 1);
  547. CoeffRow cr_at_i;
  548. ResultRow rr_at_i;
  549. bs->LoadRow(i, &cr_at_i, &rr_at_i, /* for_back_subst */ false);
  550. if (cr_at_i == 0) {
  551. bs->StoreRow(i, cr, rr);
  552. bts->BacktrackPut(*backtrack_pos, i);
  553. ++*backtrack_pos;
  554. return true;
  555. }
  556. assert((cr_at_i & 1) == 1);
  557. // Gaussian row reduction
  558. cr ^= cr_at_i;
  559. rr ^= rr_at_i;
  560. if (cr == 0) {
  561. // Inconsistency or (less likely) redundancy
  562. break;
  563. }
  564. // Find relative offset of next non-zero coefficient.
  565. int tz = CountTrailingZeroBits(cr);
  566. i += static_cast<Index>(tz);
  567. cr >>= tz;
  568. }
  569. // Failed, unless result row == 0 because e.g. a duplicate input or a
  570. // stock hash collision, with same result row. (For filter, stock hash
  571. // collision implies same result row.) Or we could have a full equation
  572. // equal to sum of other equations, which is very possible with
  573. // small range of values for result row.
  574. return rr == 0;
  575. }
  576. // Adds a range of entries to BandingStorage returning true if successful
  577. // or false if solution is impossible with current hasher (and presumably
  578. // its seed) and number of "slots" (solution or banding rows). (A solution
  579. // is impossible when there is a linear dependence among the inputs that
  580. // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
  581. //
  582. // If UseBacktrack in the BacktrackStorage, this function call rolls back
  583. // to prior state on failure. If !UseBacktrack, some subset of the entries
  584. // will have been added to the BandingStorage, so best considered to be in
  585. // an indeterminate state.
  586. //
  587. template <typename BandingStorage, typename BacktrackStorage,
  588. typename BandingHasher, typename InputIterator>
  589. bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
  590. const BandingHasher &bh, InputIterator begin,
  591. InputIterator end) {
  592. using CoeffRow = typename BandingStorage::CoeffRow;
  593. using Index = typename BandingStorage::Index;
  594. using ResultRow = typename BandingStorage::ResultRow;
  595. using Hash = typename BandingHasher::Hash;
  596. static_assert(IsUnsignedUpTo128<CoeffRow>::value, "must be unsigned");
  597. static_assert(IsUnsignedUpTo128<Index>::value, "must be unsigned");
  598. static_assert(IsUnsignedUpTo128<ResultRow>::value, "must be unsigned");
  599. constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne;
  600. if (begin == end) {
  601. // trivial
  602. return true;
  603. }
  604. const Index num_starts = bs->GetNumStarts();
  605. InputIterator cur = begin;
  606. Index backtrack_pos = 0;
  607. if (!bs->UsePrefetch()) {
  608. // Simple version, no prefetch
  609. for (;;) {
  610. Hash h = bh.GetHash(*cur);
  611. Index start = bh.GetStart(h, num_starts);
  612. ResultRow rr =
  613. bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h);
  614. CoeffRow cr = bh.GetCoeffRow(h);
  615. if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
  616. break;
  617. }
  618. if ((++cur) == end) {
  619. return true;
  620. }
  621. }
  622. } else {
  623. // Pipelined w/prefetch
  624. // Prime the pipeline
  625. Hash h = bh.GetHash(*cur);
  626. Index start = bh.GetStart(h, num_starts);
  627. ResultRow rr = bh.GetResultRowFromInput(*cur);
  628. bs->Prefetch(start);
  629. // Pipeline
  630. for (;;) {
  631. rr |= bh.GetResultRowFromHash(h);
  632. CoeffRow cr = bh.GetCoeffRow(h);
  633. if ((++cur) == end) {
  634. if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
  635. break;
  636. }
  637. return true;
  638. }
  639. Hash next_h = bh.GetHash(*cur);
  640. Index next_start = bh.GetStart(next_h, num_starts);
  641. ResultRow next_rr = bh.GetResultRowFromInput(*cur);
  642. bs->Prefetch(next_start);
  643. if (!BandingAdd<kFCA1>(bs, start, rr, cr, bts, &backtrack_pos)) {
  644. break;
  645. }
  646. h = next_h;
  647. start = next_start;
  648. rr = next_rr;
  649. }
  650. }
  651. // failed; backtrack (if implemented)
  652. if (bts->UseBacktrack()) {
  653. while (backtrack_pos > 0) {
  654. --backtrack_pos;
  655. Index i = bts->BacktrackGet(backtrack_pos);
  656. // Clearing the ResultRow is not strictly required, but is required
  657. // for good FP rate on inputs that might have been backtracked out.
  658. // (We don't want anything we've backtracked on to leak into final
  659. // result, as that might not be "harmless".)
  660. bs->StoreRow(i, 0, 0);
  661. }
  662. }
  663. return false;
  664. }
  665. // Adds a range of entries to BandingStorage returning true if successful
  666. // or false if solution is impossible with current hasher (and presumably
  667. // its seed) and number of "slots" (solution or banding rows). (A solution
  668. // is impossible when there is a linear dependence among the inputs that
  669. // doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs.
  670. //
  671. // On failure, some subset of the entries will have been added to the
  672. // BandingStorage, so best considered to be in an indeterminate state.
  673. //
  674. template <typename BandingStorage, typename BandingHasher,
  675. typename InputIterator>
  676. bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
  677. InputIterator begin, InputIterator end) {
  678. using Index = typename BandingStorage::Index;
  679. struct NoopBacktrackStorage {
  680. bool UseBacktrack() { return false; }
  681. void BacktrackPut(Index, Index) {}
  682. Index BacktrackGet(Index) {
  683. assert(false);
  684. return 0;
  685. }
  686. } nbts;
  687. return BandingAddRange(bs, &nbts, bh, begin, end);
  688. }
  689. // ######################################################################
  690. // ######################### Solution Storage ###########################
  691. // Back-substitution and query algorithms unfortunately depend on some
  692. // details of data layout in the final data structure ("solution"). Thus,
  693. // there is no common SolutionStorage covering all the reasonable
  694. // possibilities.
  695. // ###################### SimpleSolutionStorage #########################
  696. // SimpleSolutionStorage is for a row-major storage, typically with no
  697. // unused bits in each ResultRow. This is mostly for demonstration
  698. // purposes as the simplest solution storage scheme. It is relatively slow
  699. // for filter queries.
  700. // concept SimpleSolutionStorage extends RibbonTypes {
  701. // // This is called at the beginning of back-substitution for the
  702. // // solution storage to do any remaining configuration before data
  703. // // is stored to it. If configuration is previously finalized, this
  704. // // could be a simple assertion or even no-op. Ribbon algorithms
  705. // // only call this from back-substitution, and only once per call,
  706. // // before other functions here.
  707. // void PrepareForNumStarts(Index num_starts) const;
  708. // // Must return num_starts passed to PrepareForNumStarts, or the most
  709. // // recent call to PrepareForNumStarts if this storage object can be
  710. // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
  711. // // there must be a run of kCoeffBits slots starting from each start.
  712. // Index GetNumStarts() const;
  713. // // Load the solution row (type ResultRow) for a slot
  714. // ResultRow Load(Index slot_num) const;
  715. // // Store the solution row (type ResultRow) for a slot
  716. // void Store(Index slot_num, ResultRow data);
  717. // };
  718. // Back-substitution for generating a solution from BandingStorage to
  719. // SimpleSolutionStorage.
  720. template <typename SimpleSolutionStorage, typename BandingStorage>
  721. void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
  722. using CoeffRow = typename BandingStorage::CoeffRow;
  723. using Index = typename BandingStorage::Index;
  724. using ResultRow = typename BandingStorage::ResultRow;
  725. static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index),
  726. "must be same");
  727. static_assert(
  728. sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow),
  729. "must be same");
  730. static_assert(
  731. sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow),
  732. "must be same");
  733. constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  734. constexpr auto kResultBits = static_cast<Index>(sizeof(ResultRow) * 8U);
  735. // A column-major buffer of the solution matrix, containing enough
  736. // recently-computed solution data to compute the next solution row
  737. // (based also on banding data).
  738. std::array<CoeffRow, kResultBits> state;
  739. state.fill(0);
  740. const Index num_starts = bs.GetNumStarts();
  741. sss->PrepareForNumStarts(num_starts);
  742. const Index num_slots = num_starts + kCoeffBits - 1;
  743. for (Index i = num_slots; i > 0;) {
  744. --i;
  745. CoeffRow cr;
  746. ResultRow rr;
  747. bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
  748. // solution row
  749. ResultRow sr = 0;
  750. for (Index j = 0; j < kResultBits; ++j) {
  751. // Compute next solution bit at row i, column j (see derivation below)
  752. CoeffRow tmp = state[j] << 1;
  753. bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0;
  754. tmp |= bit ? CoeffRow{1} : CoeffRow{0};
  755. // Now tmp is solution at column j from row i for next kCoeffBits
  756. // more rows. Thus, for valid solution, the dot product of the
  757. // solution column with the coefficient row has to equal the result
  758. // at that column,
  759. // BitParity(tmp & cr) == ((rr >> j) & 1)
  760. // Update state.
  761. state[j] = tmp;
  762. // add to solution row
  763. sr |= (bit ? ResultRow{1} : ResultRow{0}) << j;
  764. }
  765. sss->Store(i, sr);
  766. }
  767. }
  768. // Common functionality for querying a key (already hashed) in
  769. // SimpleSolutionStorage.
  770. template <typename SimpleSolutionStorage>
  771. typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
  772. typename SimpleSolutionStorage::Index start_slot,
  773. typename SimpleSolutionStorage::CoeffRow cr,
  774. const SimpleSolutionStorage &sss) {
  775. using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
  776. using ResultRow = typename SimpleSolutionStorage::ResultRow;
  777. constexpr unsigned kCoeffBits = static_cast<unsigned>(sizeof(CoeffRow) * 8U);
  778. ResultRow result = 0;
  779. for (unsigned i = 0; i < kCoeffBits; ++i) {
  780. // Bit masking whole value is generally faster here than 'if'
  781. result ^= sss.Load(start_slot + i) &
  782. (ResultRow{0} - (static_cast<ResultRow>(cr >> i) & ResultRow{1}));
  783. }
  784. return result;
  785. }
  786. // General PHSF query a key from SimpleSolutionStorage.
  787. template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
  788. typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
  789. const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
  790. const SimpleSolutionStorage &sss) {
  791. const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
  792. static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
  793. sizeof(typename PhsfQueryHasher::Index),
  794. "must be same");
  795. static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
  796. sizeof(typename PhsfQueryHasher::CoeffRow),
  797. "must be same");
  798. return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
  799. hasher.GetCoeffRow(hash), sss);
  800. }
  801. // Filter query a key from SimpleSolutionStorage.
  802. template <typename SimpleSolutionStorage, typename FilterQueryHasher>
  803. bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
  804. const FilterQueryHasher &hasher,
  805. const SimpleSolutionStorage &sss) {
  806. const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
  807. const typename SimpleSolutionStorage::ResultRow expected =
  808. hasher.GetResultRowFromHash(hash);
  809. static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
  810. sizeof(typename FilterQueryHasher::Index),
  811. "must be same");
  812. static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) ==
  813. sizeof(typename FilterQueryHasher::CoeffRow),
  814. "must be same");
  815. static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) ==
  816. sizeof(typename FilterQueryHasher::ResultRow),
  817. "must be same");
  818. return expected ==
  819. SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()),
  820. hasher.GetCoeffRow(hash), sss);
  821. }
  822. // #################### InterleavedSolutionStorage ######################
  823. // InterleavedSolutionStorage is row-major at a high level, for good
  824. // locality, and column-major at a low level, for CPU efficiency
  825. // especially in filter queries or relatively small number of result bits
  826. // (== solution columns). The storage is a sequence of "blocks" where a
  827. // block has one CoeffRow-sized segment for each solution column. Each
  828. // query spans at most two blocks; the starting solution row is typically
  829. // in the row-logical middle of a block and spans to the middle of the
  830. // next block. (See diagram below.)
  831. //
  832. // InterleavedSolutionStorage supports choosing b (number of result or
  833. // solution columns) at run time, and even supports mixing b and b-1 solution
  834. // columns in a single linear system solution, for filters that can
  835. // effectively utilize any size space (multiple of CoeffRow) for minimizing
  836. // FP rate for any number of added keys. To simplify query implementation
  837. // (with lower-index columns first), the b-bit portion comes after the b-1
  838. // portion of the structure.
  839. //
  840. // Diagram (=== marks logical block boundary; b=4; ### is data used by a
  841. // query crossing the b-1 to b boundary, each Segment has type CoeffRow):
  842. // ...
  843. // +======================+
  844. // | S e g m e n t col=0 |
  845. // +----------------------+
  846. // | S e g m e n t col=1 |
  847. // +----------------------+
  848. // | S e g m e n t col=2 |
  849. // +======================+
  850. // | S e g m e n #########|
  851. // +----------------------+
  852. // | S e g m e n #########|
  853. // +----------------------+
  854. // | S e g m e n #########|
  855. // +======================+ Result/solution columns: above = 3, below = 4
  856. // |#############t col=0 |
  857. // +----------------------+
  858. // |#############t col=1 |
  859. // +----------------------+
  860. // |#############t col=2 |
  861. // +----------------------+
  862. // | S e g m e n t col=3 |
  863. // +======================+
  864. // | S e g m e n t col=0 |
  865. // +----------------------+
  866. // | S e g m e n t col=1 |
  867. // +----------------------+
  868. // | S e g m e n t col=2 |
  869. // +----------------------+
  870. // | S e g m e n t col=3 |
  871. // +======================+
  872. // ...
  873. //
  874. // InterleavedSolutionStorage will be adapted by the algorithms from
  875. // simple array-like segment storage. That array-like storage is templatized
  876. // in part so that an implementation may choose to handle byte ordering
  877. // at access time.
  878. //
  879. // concept InterleavedSolutionStorage extends RibbonTypes {
  880. // // This is called at the beginning of back-substitution for the
  881. // // solution storage to do any remaining configuration before data
  882. // // is stored to it. If configuration is previously finalized, this
  883. // // could be a simple assertion or even no-op. Ribbon algorithms
  884. // // only call this from back-substitution, and only once per call,
  885. // // before other functions here.
  886. // void PrepareForNumStarts(Index num_starts) const;
  887. // // Must return num_starts passed to PrepareForNumStarts, or the most
  888. // // recent call to PrepareForNumStarts if this storage object can be
  889. // // reused. Note that num_starts == num_slots - kCoeffBits + 1 because
  890. // // there must be a run of kCoeffBits slots starting from each start.
  891. // Index GetNumStarts() const;
  892. // // The larger number of solution columns used (called "b" above).
  893. // Index GetUpperNumColumns() const;
  894. // // If returns > 0, then block numbers below that use
  895. // // GetUpperNumColumns() - 1 columns per solution row, and the rest
  896. // // use GetUpperNumColumns(). A block represents kCoeffBits "slots",
  897. // // where all but the last kCoeffBits - 1 slots are also starts. And
  898. // // a block contains a segment for each solution column.
  899. // // An implementation may only support uniform columns per solution
  900. // // row and return constant 0 here.
  901. // Index GetUpperStartBlock() const;
  902. //
  903. // // ### "Array of segments" portion of API ###
  904. // // The number of values of type CoeffRow used in this solution
  905. // // representation. (This value can be inferred from the previous
  906. // // three functions, but is expected at least for sanity / assertion
  907. // // checking.)
  908. // Index GetNumSegments() const;
  909. // // Load an entry from the logical array of segments
  910. // CoeffRow LoadSegment(Index segment_num) const;
  911. // // Store an entry to the logical array of segments
  912. // void StoreSegment(Index segment_num, CoeffRow data);
  913. // };
  914. // A helper for InterleavedBackSubst.
  915. template <typename BandingStorage>
  916. inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
  917. typename BandingStorage::Index num_columns,
  918. const BandingStorage &bs,
  919. typename BandingStorage::Index start_slot) {
  920. using CoeffRow = typename BandingStorage::CoeffRow;
  921. using Index = typename BandingStorage::Index;
  922. using ResultRow = typename BandingStorage::ResultRow;
  923. constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  924. for (Index i = start_slot + kCoeffBits; i > start_slot;) {
  925. --i;
  926. CoeffRow cr;
  927. ResultRow rr;
  928. bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true);
  929. for (Index j = 0; j < num_columns; ++j) {
  930. // Compute next solution bit at row i, column j (see derivation below)
  931. CoeffRow tmp = state[j] << 1;
  932. int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1);
  933. tmp |= static_cast<CoeffRow>(bit);
  934. // Now tmp is solution at column j from row i for next kCoeffBits
  935. // more rows. Thus, for valid solution, the dot product of the
  936. // solution column with the coefficient row has to equal the result
  937. // at that column,
  938. // BitParity(tmp & cr) == ((rr >> j) & 1)
  939. // Update state.
  940. state[j] = tmp;
  941. }
  942. }
  943. }
  944. // Back-substitution for generating a solution from BandingStorage to
  945. // InterleavedSolutionStorage.
  946. template <typename InterleavedSolutionStorage, typename BandingStorage>
  947. void InterleavedBackSubst(InterleavedSolutionStorage *iss,
  948. const BandingStorage &bs) {
  949. using CoeffRow = typename BandingStorage::CoeffRow;
  950. using Index = typename BandingStorage::Index;
  951. static_assert(
  952. sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index),
  953. "must be same");
  954. static_assert(
  955. sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow),
  956. "must be same");
  957. constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  958. const Index num_starts = bs.GetNumStarts();
  959. // Although it might be nice to have a filter that returns "always false"
  960. // when no key is added, we aren't specifically supporting that here
  961. // because it would require another condition branch in the query.
  962. assert(num_starts > 0);
  963. iss->PrepareForNumStarts(num_starts);
  964. const Index num_slots = num_starts + kCoeffBits - 1;
  965. assert(num_slots % kCoeffBits == 0);
  966. const Index num_blocks = num_slots / kCoeffBits;
  967. const Index num_segments = iss->GetNumSegments();
  968. // For now upper, then lower
  969. Index num_columns = iss->GetUpperNumColumns();
  970. const Index upper_start_block = iss->GetUpperStartBlock();
  971. if (num_columns == 0) {
  972. // Nothing to do, presumably because there's not enough space for even
  973. // a single segment.
  974. assert(num_segments == 0);
  975. // When num_columns == 0, a Ribbon filter query will always return true,
  976. // or a PHSF query always 0.
  977. return;
  978. }
  979. // We should be utilizing all available segments
  980. assert(num_segments == (upper_start_block * (num_columns - 1)) +
  981. ((num_blocks - upper_start_block) * num_columns));
  982. // TODO: consider fixed-column specializations with stack-allocated state
  983. // A column-major buffer of the solution matrix, containing enough
  984. // recently-computed solution data to compute the next solution row
  985. // (based also on banding data).
  986. std::unique_ptr<CoeffRow[]> state{new CoeffRow[num_columns]()};
  987. Index block = num_blocks;
  988. Index segment_num = num_segments;
  989. while (block > upper_start_block) {
  990. --block;
  991. BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
  992. segment_num -= num_columns;
  993. for (Index i = 0; i < num_columns; ++i) {
  994. iss->StoreSegment(segment_num + i, state[i]);
  995. }
  996. }
  997. // Now (if applicable), region using lower number of columns
  998. // (This should be optimized away if GetUpperStartBlock() returns
  999. // constant 0.)
  1000. --num_columns;
  1001. while (block > 0) {
  1002. --block;
  1003. BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits);
  1004. segment_num -= num_columns;
  1005. for (Index i = 0; i < num_columns; ++i) {
  1006. iss->StoreSegment(segment_num + i, state[i]);
  1007. }
  1008. }
  1009. // Verify everything processed
  1010. assert(block == 0);
  1011. assert(segment_num == 0);
  1012. }
  1013. // Prefetch memory for a key in InterleavedSolutionStorage.
  1014. template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
  1015. inline void InterleavedPrepareQuery(
  1016. const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
  1017. const InterleavedSolutionStorage &iss,
  1018. typename PhsfQueryHasher::Hash *saved_hash,
  1019. typename InterleavedSolutionStorage::Index *saved_segment_num,
  1020. typename InterleavedSolutionStorage::Index *saved_num_columns,
  1021. typename InterleavedSolutionStorage::Index *saved_start_bit) {
  1022. using Hash = typename PhsfQueryHasher::Hash;
  1023. using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  1024. using Index = typename InterleavedSolutionStorage::Index;
  1025. static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
  1026. "must be same");
  1027. const Hash hash = hasher.GetHash(key);
  1028. const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts());
  1029. constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  1030. const Index upper_start_block = iss.GetUpperStartBlock();
  1031. Index num_columns = iss.GetUpperNumColumns();
  1032. Index start_block_num = start_slot / kCoeffBits;
  1033. Index segment_num = start_block_num * num_columns -
  1034. std::min(start_block_num, upper_start_block);
  1035. // Change to lower num columns if applicable.
  1036. // (This should not compile to a conditional branch.)
  1037. num_columns -= (start_block_num < upper_start_block) ? 1 : 0;
  1038. Index start_bit = start_slot % kCoeffBits;
  1039. Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns);
  1040. iss.PrefetchSegmentRange(segment_num, segment_num + segment_count);
  1041. *saved_hash = hash;
  1042. *saved_segment_num = segment_num;
  1043. *saved_num_columns = num_columns;
  1044. *saved_start_bit = start_bit;
  1045. }
  1046. // General PHSF query from InterleavedSolutionStorage, using data for
  1047. // the query key from InterleavedPrepareQuery
  1048. template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
  1049. inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
  1050. typename PhsfQueryHasher::Hash hash,
  1051. typename InterleavedSolutionStorage::Index segment_num,
  1052. typename InterleavedSolutionStorage::Index num_columns,
  1053. typename InterleavedSolutionStorage::Index start_bit,
  1054. const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
  1055. using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  1056. using Index = typename InterleavedSolutionStorage::Index;
  1057. using ResultRow = typename InterleavedSolutionStorage::ResultRow;
  1058. static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index),
  1059. "must be same");
  1060. static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow),
  1061. "must be same");
  1062. constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  1063. const CoeffRow cr = hasher.GetCoeffRow(hash);
  1064. ResultRow sr = 0;
  1065. const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
  1066. for (Index i = 0; i < num_columns; ++i) {
  1067. sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_left) << i;
  1068. }
  1069. if (start_bit > 0) {
  1070. segment_num += num_columns;
  1071. const CoeffRow cr_right =
  1072. cr >> static_cast<unsigned>(kCoeffBits - start_bit);
  1073. for (Index i = 0; i < num_columns; ++i) {
  1074. sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_right) << i;
  1075. }
  1076. }
  1077. return sr;
  1078. }
  1079. // Filter query a key from InterleavedFilterQuery.
  1080. template <typename InterleavedSolutionStorage, typename FilterQueryHasher>
  1081. inline bool InterleavedFilterQuery(
  1082. typename FilterQueryHasher::Hash hash,
  1083. typename InterleavedSolutionStorage::Index segment_num,
  1084. typename InterleavedSolutionStorage::Index num_columns,
  1085. typename InterleavedSolutionStorage::Index start_bit,
  1086. const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
  1087. using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
  1088. using Index = typename InterleavedSolutionStorage::Index;
  1089. using ResultRow = typename InterleavedSolutionStorage::ResultRow;
  1090. static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index),
  1091. "must be same");
  1092. static_assert(
  1093. sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow),
  1094. "must be same");
  1095. static_assert(
  1096. sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow),
  1097. "must be same");
  1098. constexpr auto kCoeffBits = static_cast<Index>(sizeof(CoeffRow) * 8U);
  1099. const CoeffRow cr = hasher.GetCoeffRow(hash);
  1100. const ResultRow expected = hasher.GetResultRowFromHash(hash);
  1101. // TODO: consider optimizations such as
  1102. // * get rid of start_bit == 0 condition with careful fetching & shifting
  1103. if (start_bit == 0) {
  1104. for (Index i = 0; i < num_columns; ++i) {
  1105. if (BitParity(iss.LoadSegment(segment_num + i) & cr) !=
  1106. (static_cast<int>(expected >> i) & 1)) {
  1107. return false;
  1108. }
  1109. }
  1110. } else {
  1111. const CoeffRow cr_left = cr << static_cast<unsigned>(start_bit);
  1112. const CoeffRow cr_right =
  1113. cr >> static_cast<unsigned>(kCoeffBits - start_bit);
  1114. for (Index i = 0; i < num_columns; ++i) {
  1115. CoeffRow soln_data =
  1116. (iss.LoadSegment(segment_num + i) & cr_left) ^
  1117. (iss.LoadSegment(segment_num + num_columns + i) & cr_right);
  1118. if (BitParity(soln_data) != (static_cast<int>(expected >> i) & 1)) {
  1119. return false;
  1120. }
  1121. }
  1122. }
  1123. // otherwise, all match
  1124. return true;
  1125. }
  1126. // TODO: refactor Interleaved*Query so that queries can be "prepared" by
  1127. // prefetching memory, to hide memory latency for multiple queries in a
  1128. // single thread.
  1129. } // namespace ribbon
  1130. } // namespace ROCKSDB_NAMESPACE