mgpuhost.cuh
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "mgpudevice.cuh"
#include "util/mgpucontext.h"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// kernels/reduce.cuh

// Reduce the input and return the result in device memory, host memory, or
// both. Provide a non-null pointer to retrieve data.
template<typename InputIt, typename T, typename Op>
MGPU_HOST void Reduce(InputIt data_global, int count, T identity, Op op,
    T* reduce_global, T* reduce_host, CudaContext& context);

// T = std::iterator_traits<InputIt>::value_type.
// Reduce with identity = 0 and op = mgpu::plus<T>.
// Returns the result in host memory.
template<typename InputIt>
MGPU_HOST typename std::iterator_traits<InputIt>::value_type
Reduce(InputIt data_global, int count, CudaContext& context);
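
// Example (an illustrative sketch, not part of the original header): sum 100
// ascending integers with the simplified overload. CreateCudaDevice,
// FillAscending, and the other CudaContext helpers used in these sketches are
// assumed from util/mgpucontext.h.
//     ContextPtr context = CreateCudaDevice(0);
//     MGPU_MEM(int) data = context->FillAscending<int>(100, 0, 1);
//     int total = Reduce(data->get(), 100, *context);   // 0 + 1 + ... + 99 = 4950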

////////////////////////////////////////////////////////////////////////////////
// kernels/scan.cuh

// Scan inputs in device memory.
// MgpuScanType may be:
//     MgpuScanTypeExc (exclusive) or
//     MgpuScanTypeInc (inclusive).
// Return the total in device memory, host memory, or both.
template<MgpuScanType Type, typename DataIt, typename T, typename Op,
    typename DestIt>
MGPU_HOST void Scan(DataIt data_global, int count, T identity, Op op,
    T* reduce_global, T* reduce_host, DestIt dest_global,
    CudaContext& context);

// Exclusive scan with identity = 0 and op = mgpu::plus<T>.
// Returns the total in host memory.
template<typename InputIt, typename TotalType>
MGPU_HOST void ScanExc(InputIt data_global, int count, TotalType* total,
    CudaContext& context);

// Like above, but don't return the total.
template<typename InputIt>
MGPU_HOST void ScanExc(InputIt data_global, int count, CudaContext& context);
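
// Example (an illustrative sketch, using the ContextPtr from the Reduce
// example above): in-place exclusive scan of 8 ones. data becomes
// 0, 1, 2, ..., 7 and total is 8.
//     MGPU_MEM(int) data = context->Fill<int>(8, 1);
//     int total;
//     ScanExc(data->get(), 8, &total, *context);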

////////////////////////////////////////////////////////////////////////////////
// kernels/bulkremove.cuh

// Compact the elements in source_global by removing the elements identified by
// indices_global. indices_global must be unique, sorted, and range between 0
// and sourceCount - 1. The number of outputs is sourceCount - indicesCount.

// IndicesIt should resolve to an integer type. Iterators like step_iterator
// are supported.

// If sourceCount = 10, indicesCount = 6, and indices = (1, 3, 4, 5, 7, 8), then
// dest = A0 A2 A6 A9. (All indices between 0 and sourceCount - 1 except those
// in indices_global.)
template<typename InputIt, typename IndicesIt, typename OutputIt>
MGPU_HOST void BulkRemove(InputIt source_global, int sourceCount,
    IndicesIt indices_global, int indicesCount, OutputIt dest_global,
    CudaContext& context);
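
// Example (an illustrative sketch, not part of the original header): remove
// the elements at positions 1, 3, 4, 5, 7, 8 from a 10-element sequence.
//     const int h[6] = { 1, 3, 4, 5, 7, 8 };
//     MGPU_MEM(int) source = context->FillAscending<int>(10, 0, 1);
//     MGPU_MEM(int) indices = context->Malloc(h, 6);
//     MGPU_MEM(int) dest = context->Malloc<int>(4);
//     BulkRemove(source->get(), 10, indices->get(), 6, dest->get(), *context);
//     // dest = 0 2 6 9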

////////////////////////////////////////////////////////////////////////////////
// kernels/bulkinsert.cuh

// Combine aCount elements in a_global with bCount elements in b_global.
// Each element a_global[i] is inserted before position indices_global[i] and
// stored to dest_global. The insertion indices are relative to the B array,
// not the output. Indices must be sorted but not necessarily unique.

// If aCount = 5, bCount = 3, and indices = (1, 1, 2, 3, 3), the output is:
// B0 A0 A1 B1 A2 B2 A3 A4.
template<typename InputIt1, typename IndicesIt, typename InputIt2,
    typename OutputIt>
MGPU_HOST void BulkInsert(InputIt1 a_global, IndicesIt indices_global,
    int aCount, InputIt2 b_global, int bCount, OutputIt dest_global,
    CudaContext& context);
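
// Example (an illustrative sketch, not part of the original header):
// reproduce the insertion above with aCount = 5 and bCount = 3.
//     const int h[5] = { 1, 1, 2, 3, 3 };
//     MGPU_MEM(int) a = context->FillAscending<int>(5, 100, 1);   // A = 100..104
//     MGPU_MEM(int) b = context->FillAscending<int>(3, 200, 1);   // B = 200..202
//     MGPU_MEM(int) indices = context->Malloc(h, 5);
//     MGPU_MEM(int) dest = context->Malloc<int>(8);
//     BulkInsert(a->get(), indices->get(), 5, b->get(), 3, dest->get(),
//         *context);
//     // dest = 200 100 101 201 102 202 103 104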

////////////////////////////////////////////////////////////////////////////////
// kernels/merge.cuh

// MergeKeys merges two arrays of sorted inputs with C++-comparison semantics.
// aCount items from aKeys_global and bCount items from bKeys_global are merged
// into aCount + bCount items in keys_global.
// Comp is a comparator type supporting strict weak ordering.
// If !comp(b, a), then a is placed before b in the output.
template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename Comp>
MGPU_HOST void MergeKeys(KeysIt1 aKeys_global, int aCount, KeysIt2 bKeys_global,
    int bCount, KeysIt3 keys_global, Comp comp, CudaContext& context);

// MergeKeys specialized with Comp = mgpu::less<T>.
template<typename KeysIt1, typename KeysIt2, typename KeysIt3>
MGPU_HOST void MergeKeys(KeysIt1 aKeys_global, int aCount, KeysIt2 bKeys_global,
    int bCount, KeysIt3 keys_global, CudaContext& context);

// MergePairs merges two arrays of sorted inputs by key and copies values.
// If !comp(bKey, aKey), then aKey is placed before bKey in the output, and
// the corresponding aData is placed before bData. This corresponds to the
// *_by_key functions in Thrust.
template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename ValsIt1,
    typename ValsIt2, typename ValsIt3, typename Comp>
MGPU_HOST void MergePairs(KeysIt1 aKeys_global, ValsIt1 aVals_global,
    int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
    KeysIt3 keys_global, ValsIt3 vals_global, Comp comp, CudaContext& context);

// MergePairs specialized with Comp = mgpu::less<T>.
template<typename KeysIt1, typename KeysIt2, typename KeysIt3, typename ValsIt1,
    typename ValsIt2, typename ValsIt3>
MGPU_HOST void MergePairs(KeysIt1 aKeys_global, ValsIt1 aVals_global,
    int aCount, KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
    KeysIt3 keys_global, ValsIt3 vals_global, CudaContext& context);

////////////////////////////////////////////////////////////////////////////////
// kernels/mergesort.cuh

// MergesortKeys sorts data_global using comparator Comp.
// If !comp(b, a), then a comes before b in the output. The data is sorted
// in-place.
template<typename T, typename Comp>
MGPU_HOST void MergesortKeys(T* data_global, int count, Comp comp,
    CudaContext& context);

// MergesortKeys specialized with Comp = mgpu::less<T>.
template<typename T>
MGPU_HOST void MergesortKeys(T* data_global, int count, CudaContext& context);

// MergesortPairs sorts data by key, copying data. This corresponds to
// sort_by_key in Thrust.
template<typename KeyType, typename ValType, typename Comp>
MGPU_HOST void MergesortPairs(KeyType* keys_global, ValType* values_global,
    int count, Comp comp, CudaContext& context);

// MergesortPairs specialized with Comp = mgpu::less<KeyType>.
template<typename KeyType, typename ValType>
MGPU_HOST void MergesortPairs(KeyType* keys_global, ValType* values_global,
    int count, CudaContext& context);

// MergesortIndices is like MergesortPairs where values_global is treated as
// if initialized with integers (0 ... count - 1).
template<typename KeyType, typename Comp>
MGPU_HOST void MergesortIndices(KeyType* keys_global, int* values_global,
    int count, Comp comp, CudaContext& context);

// MergesortIndices specialized with Comp = mgpu::less<KeyType>.
template<typename KeyType>
MGPU_HOST void MergesortIndices(KeyType* keys_global, int* values_global,
    int count, CudaContext& context);
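
// Example (an illustrative sketch, not part of the original header): sort
// 1000 random keys in-place with the default comparator.
//     MGPU_MEM(int) keys = context->GenRandom<int>(1000, 0, 10000);
//     MergesortKeys(keys->get(), 1000, *context);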

////////////////////////////////////////////////////////////////////////////////
// kernels/segmentedsort.cuh

// Mergesort count items in-place in data_global. Keys are compared with Comp
// (as they are in MergesortKeys), but keys remain inside the segments defined
// by flags_global.

// flags_global is a bitfield cast to uint*. Each bit in flags_global is a
// segment head flag. Only keys between segment head flags (inclusive on the
// left and exclusive on the right) may be exchanged. The first element is
// assumed to start a segment, regardless of the value of bit 0.

// Passing verbose = true causes the function to print mergepass statistics to
// the console. This may help developers understand the performance
// characteristics of the function and how effectively it early-exits merge
// operations.
template<typename T, typename Comp>
MGPU_HOST void SegSortKeysFromFlags(T* data_global, int count,
    const uint* flags_global, CudaContext& context, Comp comp,
    bool verbose = false);

// SegSortKeysFromFlags specialized with Comp = mgpu::less<T>.
template<typename T>
MGPU_HOST void SegSortKeysFromFlags(T* data_global, int count,
    const uint* flags_global, CudaContext& context, bool verbose = false);

// Segmented sort using head flags and supporting value exchange.
template<typename KeyType, typename ValType, typename Comp>
MGPU_HOST void SegSortPairsFromFlags(KeyType* keys_global,
    ValType* values_global, int count, const uint* flags_global,
    CudaContext& context, Comp comp, bool verbose = false);

// SegSortPairsFromFlags specialized with Comp = mgpu::less<KeyType>.
template<typename KeyType, typename ValType>
MGPU_HOST void SegSortPairsFromFlags(KeyType* keys_global,
    ValType* values_global, int count, const uint* flags_global,
    CudaContext& context, bool verbose = false);

// Segmented sort using segment indices rather than head flags. indices_global
// is a sorted and unique list of indicesCount segment start locations. These
// indices correspond to the set bits in the flags_global field. A segment
// head index for position 0 may be omitted.
template<typename T, typename Comp>
MGPU_HOST void SegSortKeysFromIndices(T* data_global, int count,
    const int* indices_global, int indicesCount, CudaContext& context,
    Comp comp, bool verbose = false);

// SegSortKeysFromIndices specialized with Comp = mgpu::less<T>.
template<typename T>
MGPU_HOST void SegSortKeysFromIndices(T* data_global, int count,
    const int* indices_global, int indicesCount, CudaContext& context,
    bool verbose = false);

// Segmented sort using segment indices and supporting value exchange.
template<typename KeyType, typename ValType, typename Comp>
MGPU_HOST void SegSortPairsFromIndices(KeyType* keys_global,
    ValType* values_global, int count, const int* indices_global,
    int indicesCount, CudaContext& context, Comp comp, bool verbose = false);

// SegSortPairsFromIndices specialized with Comp = mgpu::less<KeyType>.
template<typename KeyType, typename ValType>
MGPU_HOST void SegSortPairsFromIndices(KeyType* keys_global,
    ValType* values_global, int count, const int* indices_global,
    int indicesCount, CudaContext& context, bool verbose = false);

////////////////////////////////////////////////////////////////////////////////
// kernels/localitysort.cuh

// LocalitySortKeys is a version of MergesortKeys optimized for non-uniformly
// random input arrays. If the keys begin close to their sorted destinations,
// this function may exploit that structure to early-exit merge passes.

// Passing verbose = true causes the function to print mergepass statistics to
// the console. This may help developers understand the performance
// characteristics of the function and how effectively it early-exits merge
// operations.
template<typename T, typename Comp>
MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context,
    Comp comp, bool verbose = false);

// LocalitySortKeys specialized with Comp = mgpu::less<T>.
template<typename T>
MGPU_HOST void LocalitySortKeys(T* data_global, int count, CudaContext& context,
    bool verbose = false);

// Locality sort supporting value exchange.
template<typename KeyType, typename ValType, typename Comp>
MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global,
    int count, CudaContext& context, Comp comp, bool verbose = false);

// LocalitySortPairs specialized with Comp = mgpu::less<KeyType>.
template<typename KeyType, typename ValType>
MGPU_HOST void LocalitySortPairs(KeyType* keys_global, ValType* values_global,
    int count, CudaContext& context, bool verbose = false);
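
// Example (an illustrative sketch, not part of the original header): keys that
// begin nearly sorted benefit from early-exited merge passes.
//     MGPU_MEM(int) keys = context->SortRandom<int>(10000, 0, 1000000);
//     // Perturb the keys locally on the device or host, then:
//     LocalitySortKeys(keys->get(), 10000, *context, true);   // verbose = true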

////////////////////////////////////////////////////////////////////////////////
// kernels/sortedsearch.cuh

// Vectorized sorted search. This is the most general form of the function.
// It executes two simultaneous linear searches on two sorted inputs.

// Bounds:
//     MgpuBoundsLower -
//         lower-bound search of A into B.
//         upper-bound search of B into A.
//     MgpuBoundsUpper -
//         upper-bound search of A into B.
//         lower-bound search of B into A.
// Type[A|B]:
//     MgpuSearchTypeNone - no output for this input.
//     MgpuSearchTypeIndex - return search indices as integers.
//     MgpuSearchTypeMatch - return 0 (no match) or 1 (match).
//         For TypeA, returns 1 if there is at least 1 matching element in B
//             for each element in A.
//         For TypeB, returns 1 if there is at least 1 matching element in A
//             for each element in B.
//     MgpuSearchTypeIndexMatch - return search indices as integers. The most
//         significant bit is the match bit.
// aMatchCount, bMatchCount - If Type is Match or IndexMatch and the pointer is
//     not null, return the total number of elements in A or B with matches in
//     B or A. This generates a synchronous cudaMemcpyDeviceToHost call that
//     callers using streams should be aware of.
template<MgpuBounds Bounds, MgpuSearchType TypeA, MgpuSearchType TypeB,
    typename InputIt1, typename InputIt2, typename OutputIt1,
    typename OutputIt2, typename Comp>
MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
    int bCount, OutputIt1 aIndices_global, OutputIt2 bIndices_global,
    Comp comp, CudaContext& context, int* aMatchCount = 0,
    int* bMatchCount = 0);

// SortedSearch specialized with Comp = mgpu::less<T>.
template<MgpuBounds Bounds, MgpuSearchType TypeA, MgpuSearchType TypeB,
    typename InputIt1, typename InputIt2, typename OutputIt1,
    typename OutputIt2>
MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
    int bCount, OutputIt1 aIndices_global, OutputIt2 bIndices_global,
    CudaContext& context, int* aMatchCount = 0, int* bMatchCount = 0);

// SortedSearch specialized with
//     TypeA = MgpuSearchTypeIndex
//     TypeB = MgpuSearchTypeNone
//     aMatchCount = bMatchCount = 0.
template<MgpuBounds Bounds, typename InputIt1, typename InputIt2,
    typename OutputIt, typename Comp>
MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
    int bCount, OutputIt aIndices_global, Comp comp, CudaContext& context);

// SortedSearch specialized with Comp = mgpu::less<T>.
template<MgpuBounds Bounds, typename InputIt1, typename InputIt2,
    typename OutputIt>
MGPU_HOST void SortedSearch(InputIt1 a_global, int aCount, InputIt2 b_global,
    int bCount, OutputIt aIndices_global, CudaContext& context);
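
// Example (an illustrative sketch, not part of the original header):
// lower-bound of each needle in A into haystack B, using the index-only
// specialization above.
//     MGPU_MEM(int) a = context->SortRandom<int>(100, 0, 1000);   // needles
//     MGPU_MEM(int) b = context->SortRandom<int>(200, 0, 1000);   // haystack
//     MGPU_MEM(int) lb = context->Malloc<int>(100);
//     SortedSearch<MgpuBoundsLower>(a->get(), 100, b->get(), 200, lb->get(),
//         *context);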

// SortedEqualityCount returns the difference between the upper-bound (computed
// by this function) and the lower-bound (passed as an argument). That is, it
// computes the number of occurrences in B of each key in A.
// The provided operator must have a method:
//     int operator()(int lb, int ub) const;
// It returns the count given the lower- and upper-bound indices.
//
// An object SortedEqualityOp is provided:
//     struct SortedEqualityOp {
//         MGPU_HOST_DEVICE int operator()(int lb, int ub) const {
//             return ub - lb;
//         }
//     };
template<typename InputIt1, typename InputIt2, typename InputIt3,
    typename OutputIt, typename Comp, typename Op>
MGPU_HOST void SortedEqualityCount(InputIt1 a_global, int aCount,
    InputIt2 b_global, int bCount, InputIt3 lb_global, OutputIt counts_global,
    Comp comp, Op op, CudaContext& context);

// Specialization of SortedEqualityCount with Comp = mgpu::less<T>.
template<typename InputIt1, typename InputIt2, typename InputIt3,
    typename OutputIt, typename Op>
MGPU_HOST void SortedEqualityCount(InputIt1 a_global, int aCount,
    InputIt2 b_global, int bCount, InputIt3 lb_global, OutputIt counts_global,
    Op op, CudaContext& context);
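
// Example (an illustrative sketch, not part of the original header): count
// how many times each key in A occurs in B, reusing the lower-bounds lb from
// the SortedSearch example above.
//     MGPU_MEM(int) counts = context->Malloc<int>(100);
//     SortedEqualityCount(a->get(), 100, b->get(), 200, lb->get(),
//         counts->get(), SortedEqualityOp(), *context);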

////////////////////////////////////////////////////////////////////////////////
// kernels/loadbalance.cuh

// LoadBalanceSearch is a special vectorized sorted search. Consider bCount
// objects that generate a variable number of work items, with aCount work
// items in total. The caller computes an exclusive scan of the work-item
// counts into b_global.

// indices_global has aCount outputs. indices_global[i] is the index of the
// object that generated the i'th work item.
// Eg:
//     work-item counts: 2, 5, 3, 0, 1.
//     scan counts:      0, 2, 7, 10, 10.   aCount = 11.
//
// LoadBalanceSearch computes the upper-bound of counting_iterator<int>(0) with
// the scan of the work-item counts and subtracts 1:
//     LBS: 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 4.

// This is equivalent to expanding the index of each object by the object's
// work-item count.
template<typename InputIt>
MGPU_HOST void LoadBalanceSearch(int aCount, InputIt b_global, int bCount,
    int* indices_global, CudaContext& context);
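
// Example (an illustrative sketch, not part of the original header):
// reproduce the LBS above from the scanned work-item counts.
//     const int h[5] = { 0, 2, 7, 10, 10 };
//     MGPU_MEM(int) scan = context->Malloc(h, 5);
//     MGPU_MEM(int) indices = context->Malloc<int>(11);
//     LoadBalanceSearch(11, scan->get(), 5, indices->get(), *context);
//     // indices = 0 0 1 1 1 1 1 2 2 2 4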

////////////////////////////////////////////////////////////////////////////////
// kernels/intervalmove.cuh

// IntervalExpand duplicates intervalCount items in values_global.
// indices_global is an intervalCount-sized array filled with the scan of item
// expand counts. moveCount is the total number of outputs (the sum of the
// expand counts).
// Eg:
//     values  = 0, 1, 2, 3, 4, 5, 6, 7, 8
//     counts  = 1, 2, 1, 0, 4, 2, 3, 0, 2
//     indices = 0, 1, 3, 4, 4, 8, 10, 13, 13   (moveCount = 15).
// Expand values[i] by counts[i]:
//     output  = 0, 1, 1, 2, 4, 4, 4, 4, 5, 5, 6, 6, 6, 8, 8
template<typename IndicesIt, typename ValuesIt, typename OutputIt>
MGPU_HOST void IntervalExpand(int moveCount, IndicesIt indices_global,
    ValuesIt values_global, int intervalCount, OutputIt output_global,
    CudaContext& context);
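
// Example (an illustrative sketch, not part of the original header):
// reproduce the expansion above.
//     const int hIndices[9] = { 0, 1, 3, 4, 4, 8, 10, 13, 13 };
//     MGPU_MEM(int) values = context->FillAscending<int>(9, 0, 1);
//     MGPU_MEM(int) indices = context->Malloc(hIndices, 9);
//     MGPU_MEM(int) output = context->Malloc<int>(15);
//     IntervalExpand(15, indices->get(), values->get(), 9, output->get(),
//         *context);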

// IntervalMove is a load-balanced and vectorized device memcpy.
// It copies intervalCount variable-length intervals from user-defined sources
// to user-defined destinations. If destination intervals overlap, results are
// undefined.
// Eg:
//     Interval counts:
//          0:     3    9    1    9    8    5   10    2    5    2
//         10:     8    6    5    2    4    0    8    2    5    6
//     Scan of interval counts (indices_global):
//          0:     0    3   12   13   22   30   35   45   47   52
//         10:    54   62   68   73   75   79   79   87   89   94   (moveCount = 100).
//     Interval gather (gather_global):
//          0:    75   86   17    2   67   24   37   11   95   35
//         10:    52   18   47    0   13   75   78   60   62   29
//     Interval scatter (scatter_global):
//          0:    10   80   99   27   41   71   15    0   36   13
//         10:    89   49   66   97   76   76    2   25   61   55
// This vectorizes into 20 independent memcpy operations which are
// load-balanced across CTAs:
//     move  0: (75, 78)->(10, 13)    move 10: (52, 60)->(10, 18)
//     move  1: (86, 95)->(80, 89)    move 11: (18, 24)->(49, 55)
//     move  2: (17, 18)->(99,100)    move 12: (47, 52)->(66, 71)
//     move  3: ( 2, 11)->(27, 36)    move 13: ( 0,  2)->(97, 99)
//     move  4: (67, 75)->(41, 49)    move 14: (13, 17)->(76, 80)
//     move  5: (24, 29)->(71, 76)    move 15: (75, 75)->(76, 76)
//     move  6: (37, 47)->(15, 25)    move 16: (78, 86)->( 2, 10)
//     move  7: (11, 13)->( 0,  2)    move 17: (60, 62)->(25, 27)
//     move  8: (95,100)->(36, 41)    move 18: (62, 67)->(61, 66)
//     move  9: (35, 37)->(13, 15)    move 19: (29, 35)->(55, 61)
template<typename GatherIt, typename ScatterIt, typename IndicesIt,
    typename InputIt, typename OutputIt>
MGPU_HOST void IntervalMove(int moveCount, GatherIt gather_global,
    ScatterIt scatter_global, IndicesIt indices_global, int intervalCount,
    InputIt input_global, OutputIt output_global, CudaContext& context);

// IntervalGather is a specialization of IntervalMove that stores output data
// sequentially into output_global. For the example above, IntervalGather
// treats scatter_global the same as indices_global.
template<typename GatherIt, typename IndicesIt, typename InputIt,
    typename OutputIt>
MGPU_HOST void IntervalGather(int moveCount, GatherIt gather_global,
    IndicesIt indices_global, int intervalCount, InputIt input_global,
    OutputIt output_global, CudaContext& context);

// IntervalScatter is a specialization of IntervalMove that loads input data
// sequentially from input_global. For the example above, IntervalScatter
// treats gather_global the same as indices_global.
template<typename ScatterIt, typename IndicesIt, typename InputIt,
    typename OutputIt>
MGPU_HOST void IntervalScatter(int moveCount, ScatterIt scatter_global,
    IndicesIt indices_global, int intervalCount, InputIt input_global,
    OutputIt output_global, CudaContext& context);
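
// Example (an illustrative sketch, not part of the original header): gather
// two intervals, (4, 7) and (9, 10), into a dense 4-element output.
//     const int hGather[2] = { 4, 9 };
//     const int hIndices[2] = { 0, 3 };   // exclusive scan of lengths 3, 1
//     MGPU_MEM(int) input = context->FillAscending<int>(12, 0, 1);
//     MGPU_MEM(int) gather = context->Malloc(hGather, 2);
//     MGPU_MEM(int) indices = context->Malloc(hIndices, 2);
//     MGPU_MEM(int) output = context->Malloc<int>(4);
//     IntervalGather(4, gather->get(), indices->get(), 2, input->get(),
//         output->get(), *context);
//     // output = 4 5 6 9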

////////////////////////////////////////////////////////////////////////////////
// kernels/join.cuh

// RelationalJoin is a sort-merge join that returns indices for any of the four
// relational joins:
//     MgpuJoinKindInner
//     MgpuJoinKindLeft
//     MgpuJoinKindRight
//     MgpuJoinKindOuter.

// A = 100, 101, 103, 103
// B = 100, 100, 102, 103
// Outer join:
//        ai, bi    a[ai], b[bi]
//     0: (0, 0) - (100, 100)   // cross-product expansion for key 100
//     1: (0, 1) - (100, 100)
//     2: (1, -) - (101, ---)   // left-join for key 101
//     3: (-, 2) - (---, 102)   // right-join for key 102
//     4: (2, 3) - (103, 103)   // cross-product expansion for key 103
//     5: (3, 3) - (103, 103)
// MgpuJoinKindLeft drops the right-join on line 3.
// MgpuJoinKindRight drops the left-join on line 2.
// MgpuJoinKindInner drops both the left- and right-joins.

// The caller passes MGPU_MEM(int) pointers to hold indices. Memory is
// allocated by the join function using the allocator associated with the
// context. It returns the number of outputs.

// RelationalJoin performs one cudaMemcpyDeviceToHost to retrieve the size of
// the output array. This is a synchronous operation and may prevent queueing
// for callers using streams.
template<MgpuJoinKind Kind, typename InputIt1, typename InputIt2,
    typename Comp>
MGPU_HOST int RelationalJoin(InputIt1 a_global, int aCount, InputIt2 b_global,
    int bCount, MGPU_MEM(int)* ppAJoinIndices, MGPU_MEM(int)* ppBJoinIndices,
    Comp comp, CudaContext& context);

// Specialization of RelationalJoin with Comp = mgpu::less<T>.
template<MgpuJoinKind Kind, typename InputIt1, typename InputIt2>
MGPU_HOST int RelationalJoin(InputIt1 a_global, int aCount, InputIt2 b_global,
    int bCount, MGPU_MEM(int)* ppAJoinIndices, MGPU_MEM(int)* ppBJoinIndices,
    CudaContext& context);
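
// Example (an illustrative sketch, not part of the original header): outer
// join of the two arrays above; the function allocates the index arrays.
//     const int hA[4] = { 100, 101, 103, 103 };
//     const int hB[4] = { 100, 100, 102, 103 };
//     MGPU_MEM(int) a = context->Malloc(hA, 4);
//     MGPU_MEM(int) b = context->Malloc(hB, 4);
//     MGPU_MEM(int) aIndices, bIndices;
//     int joinCount = RelationalJoin<MgpuJoinKindOuter>(a->get(), 4,
//         b->get(), 4, &aIndices, &bIndices, *context);   // joinCount = 6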

////////////////////////////////////////////////////////////////////////////////
// kernels/sets.cuh

// SetOpKeys implements multiset operations with C++ set_* semantics.
// MgpuSetOp may be:
//     MgpuSetOpIntersection - like std::set_intersection
//     MgpuSetOpUnion        - like std::set_union
//     MgpuSetOpDiff         - like std::set_difference
//     MgpuSetOpSymDiff      - like std::set_symmetric_difference

// Setting Duplicates to false increases performance for inputs with no
// duplicate keys in either array.

// The caller passes MGPU_MEM(T) pointers to hold outputs. Memory is allocated
// by the multiset function using the allocator associated with the context.
// It returns the number of outputs.

// SetOpKeys performs one cudaMemcpyDeviceToHost to retrieve the size of the
// output array. This is a synchronous operation and may prevent queueing for
// callers using streams.

// If compact = true, SetOpKeys pre-allocates an output buffer as large as the
// sum of the input array sizes. Partial results are computed into this
// temporary array before being moved into the final array. This consumes more
// space but results in higher performance.
template<MgpuSetOp Op, bool Duplicates, typename It1, typename It2,
    typename T, typename Comp>
MGPU_HOST int SetOpKeys(It1 a_global, int aCount, It2 b_global, int bCount,
    MGPU_MEM(T)* ppKeys_global, Comp comp, CudaContext& context,
    bool compact = true);

// Specialization of SetOpKeys with Comp = mgpu::less<T>.
template<MgpuSetOp Op, bool Duplicates, typename It1, typename It2, typename T>
MGPU_HOST int SetOpKeys(It1 a_global, int aCount, It2 b_global, int bCount,
    MGPU_MEM(T)* ppKeys_global, CudaContext& context, bool compact = true);
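
// Example (an illustrative sketch, not part of the original header):
// intersect two sorted arrays that may contain duplicate keys.
//     MGPU_MEM(int) a = context->SortRandom<int>(1000, 0, 500);
//     MGPU_MEM(int) b = context->SortRandom<int>(1000, 0, 500);
//     MGPU_MEM(int) intersection;
//     int n = SetOpKeys<MgpuSetOpIntersection, true>(a->get(), 1000,
//         b->get(), 1000, &intersection, *context);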

// SetOpPairs runs multiset operations by key and supports value exchange.
template<MgpuSetOp Op, bool Duplicates, typename KeysIt1, typename KeysIt2,
    typename ValsIt1, typename ValsIt2, typename KeyType, typename ValType,
    typename Comp>
MGPU_HOST int SetOpPairs(KeysIt1 aKeys_global, ValsIt1 aVals_global, int aCount,
    KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
    MGPU_MEM(KeyType)* ppKeys_global, MGPU_MEM(ValType)* ppVals_global,
    Comp comp, CudaContext& context);

// Specialization of SetOpPairs with Comp = mgpu::less<T>.
template<MgpuSetOp Op, bool Duplicates, typename KeysIt1, typename KeysIt2,
    typename ValsIt1, typename ValsIt2, typename KeyType, typename ValType>
MGPU_HOST int SetOpPairs(KeysIt1 aKeys_global, ValsIt1 aVals_global, int aCount,
    KeysIt2 bKeys_global, ValsIt2 bVals_global, int bCount,
    MGPU_MEM(KeyType)* ppKeys_global, MGPU_MEM(ValType)* ppVals_global,
    CudaContext& context);

////////////////////////////////////////////////////////////////////////////////
// kernels/segreducecsr.cuh

// SegReducePreprocessData is defined in segreduce.cuh. It includes:
//     - limits for CSR->tiles
//     - packed thread codes for each thread in the reduction
//     - (optional) CSR2 array of filtered segment offsets
struct SegReducePreprocessData;

// SegReduceCsr runs a segmented reduction given an input and a sorted list of
// segment start offsets. This implementation requires that the operator
// support commutative (a + b = b + a) and associative
// (a + (b + c) = (a + b) + c) evaluation.

// In the segmented reduction, reduce-by-key, and Spmv documentation, "segment"
// and "row" are used interchangeably.
//
// InputIt data_global  - Data value input.
// int count            - Size of input array data_global.
// CsrIt csr_global     - List of integers for the start of each segment.
//                        The first entry must be 0 (indicating that the
//                        first segment starts at offset 0).
//                        Equivalent to an exclusive scan of segment sizes.
//                        If supportEmpty is false: must be ascending.
//                        If supportEmpty is true: must be non-descending.
// int numSegments      - Size of segment list csr_global. Must be >= 1.
// bool supportEmpty    - The basic seg-reduce code does not support empty
//                        segments.
//                        Set supportEmpty = true to add pre- and post-
//                        processing to support empty segments.
// OutputIt dest_global - Output array for the segmented reduction. Allocate
//                        numSegments elements. Should be the same data type
//                        as InputIt and identity.
// T identity           - Identity for the reduction operation. Eg, use 0 for
//                        addition or 1 for multiplication.
// Op op                - Reduction operator. Model on std::plus<>. MGPU
//                        provides the operators mgpu::plus<>, minus<>,
//                        multiplies<>, modulus<>, bit_or<>, bit_and<>,
//                        bit_xor<>, maximum<>, and minimum<>.
// CudaContext& context - MGPU context support object. All kernels are
//                        launched on the associated stream.
template<typename InputIt, typename CsrIt, typename OutputIt, typename T,
    typename Op>
MGPU_HOST void SegReduceCsr(InputIt data_global, int count, CsrIt csr_global,
    int numSegments, bool supportEmpty, OutputIt dest_global, T identity, Op op,
    CudaContext& context);
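
// Example (an illustrative sketch, not part of the original header): sum
// three segments of sizes 3, 0, and 4 over a 7-element input.
//     const int hCsr[3] = { 0, 3, 3 };
//     MGPU_MEM(int) data = context->FillAscending<int>(7, 1, 1);   // 1..7
//     MGPU_MEM(int) csr = context->Malloc(hCsr, 3);
//     MGPU_MEM(int) dest = context->Malloc<int>(3);
//     SegReduceCsr(data->get(), 7, csr->get(), 3, true, dest->get(), 0,
//         mgpu::plus<int>(), *context);
//     // dest = 6 0 22   (1+2+3, empty, 4+5+6+7)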

// IndirectReduceCsr is like SegReduceCsr but with one level of source
// indirection. The data for each segment/row i in data_global starts at
// sources_global[i].
// SourcesIt sources_global - List of integers for the source data of each
//                            segment. Must be numSegments in size.
template<typename InputIt, typename CsrIt, typename SourcesIt,
    typename OutputIt, typename T, typename Op>
MGPU_HOST void IndirectReduceCsr(InputIt data_global, int count,
    CsrIt csr_global, SourcesIt sources_global, int numSegments,
    bool supportEmpty, OutputIt dest_global, T identity, Op op,
    CudaContext& context);

// SegReduceCsrPreprocess accelerates multiple seg-reduce calls on different
// data with the same segment geometry. Partitioning and the CSR->CSR2
// transform are off-loaded to a preprocessing pass. The actual reduction is
// evaluated by SegReduceApply.
template<typename T, typename CsrIt>
MGPU_HOST void SegReduceCsrPreprocess(int count, CsrIt csr_global,
    int numSegments, bool supportEmpty,
    std::auto_ptr<SegReducePreprocessData>* ppData, CudaContext& context);

template<typename InputIt, typename DestIt, typename T, typename Op>
MGPU_HOST void SegReduceApply(const SegReducePreprocessData& preprocess,
    InputIt data_global, T identity, Op op, DestIt dest_global,
    CudaContext& context);
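
// Example (an illustrative sketch, continuing the SegReduceCsr example above
// and reusing its csr, data, and dest): preprocess the segment geometry once,
// then reduce many same-shaped inputs.
//     std::auto_ptr<SegReducePreprocessData> plan;
//     SegReduceCsrPreprocess<int>(7, csr->get(), 3, true, &plan, *context);
//     SegReduceApply(*plan, data->get(), 0, mgpu::plus<int>(), dest->get(),
//         *context);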

////////////////////////////////////////////////////////////////////////////////
// kernels/reducebykey.cuh

typedef SegReducePreprocessData ReduceByKeyPreprocessData;

// ReduceByKey runs a segmented reduction given a data input and a matching set
// of keys. This implementation requires that the operator support commutative
// (a + b = b + a) and associative (a + (b + c) = (a + b) + c) evaluation.
// It roughly matches the behavior of thrust::reduce_by_key.
// KeysIt keys_global       - Key identifier for each segment.
// InputIt data_global      - Data value input.
// int count                - Size of input arrays keys_global and
//                            data_global.
// ValType identity         - Identity for the reduction operation. Eg, use 0
//                            for addition or 1 for multiplication.
// Op op                    - Reduction operator. Model on std::plus<>. MGPU
//                            provides the operators mgpu::plus<>, minus<>,
//                            multiplies<>, modulus<>, bit_or<>, bit_and<>,
//                            bit_xor<>, maximum<>, and minimum<>.
// Comp comp                - Operator for comparing adjacent keys. Must
//                            return true if two adjacent keys are in the same
//                            segment. Use mgpu::equal_to<KeyType>() by
//                            default.
// KeyType* keysDest_global - If this pointer is not null, return the first
//                            key from each segment. Must be sized to at least
//                            the number of segments.
// DestIt dest_global       - Holds the reduced data. Must be sized to at
//                            least the number of segments.
// int* count_host          - The number of segments, returned in host memory.
//                            May be null.
// int* count_global        - The number of segments, returned in device
//                            memory. This avoids a D->H synchronization. May
//                            be null.
// CudaContext& context     - MGPU context support object.
template<typename KeysIt, typename InputIt, typename DestIt,
    typename KeyType, typename ValType, typename Op, typename Comp>
MGPU_HOST void ReduceByKey(KeysIt keys_global, InputIt data_global, int count,
    ValType identity, Op op, Comp comp, KeyType* keysDest_global,
    DestIt dest_global, int* count_host, int* count_global,
    CudaContext& context);
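
// Example (an illustrative sketch, not part of the original header): sum runs
// of equal keys, Thrust-style.
//     const int hKeys[6] = { 0, 0, 1, 1, 1, 2 };
//     MGPU_MEM(int) keys = context->Malloc(hKeys, 6);
//     MGPU_MEM(int) data = context->Fill<int>(6, 1);
//     MGPU_MEM(int) keysDest = context->Malloc<int>(6);
//     MGPU_MEM(int) dest = context->Malloc<int>(6);
//     int numSegments;
//     ReduceByKey(keys->get(), data->get(), 6, 0, mgpu::plus<int>(),
//         mgpu::equal_to<int>(), keysDest->get(), dest->get(), &numSegments,
//         (int*)0, *context);
//     // numSegments = 3; keysDest = 0 1 2; dest = 2 3 1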

// ReduceByKeyPreprocess accelerates multiple reduce-by-key calls on different
// data with the same segment geometry. The actual reduction is evaluated by
// ReduceByKeyApply.
// Note that the caller must explicitly specify the ValType argument. Kernel
// tunings are based on the value type, not the key type.
template<typename ValType, typename KeyType, typename KeysIt, typename Comp>
MGPU_HOST void ReduceByKeyPreprocess(int count, KeysIt keys_global,
    KeyType* keysDest_global, Comp comp, int* count_host, int* count_global,
    std::auto_ptr<ReduceByKeyPreprocessData>* ppData, CudaContext& context);

template<typename InputIt, typename DestIt, typename T, typename Op>
MGPU_HOST void ReduceByKeyApply(const ReduceByKeyPreprocessData& preprocess,
    InputIt data_global, T identity, Op op, DestIt dest_global,
    CudaContext& context);

////////////////////////////////////////////////////////////////////////////////
// kernels/spmvcsr.cuh

typedef SegReducePreprocessData SpmvPreprocessData;

// SpmvCsr[Unary|Binary] evaluates the product of a sparse matrix (CSR format)
// with a dense vector.
// SpmvCsrIndirect[Unary|Binary] uses indirection to look up the start of each
// row's data in matrix_global and cols_global on a per-row basis.
// Unary methods reduce on the right-hand side vector values.
// Binary methods reduce the product of the left-hand side matrix value with
// the right-hand side vector value.

// MatrixIt matrix_global   - Left-hand side data for binary Spmv. There are
//                            nz non-zero matrix elements. These are loaded
//                            and combined with the vector values with mulOp.
// ColsIt cols_global       - Column identifiers for the right-hand side of
//                            the matrix/value products. If element i is the
//                            k'th non-zero in row j, the product is formed as
//                                matrix_global[i] * vec_global[cols_global[i]]
//                            for direct indexing, or,
//                                m = sources_global[j] + k
//                                matrix_global[m] * vec_global[cols_global[m]]
//                            for indirect indexing.
// int nz                   - Number of non-zeros in the LHS matrix. Size of
//                            matrix_global and cols_global.
// CsrIt csr_global         - List of integers for the start of each row.
//                            The first entry must be 0 (indicating that the
//                            first row starts at offset 0).
//                            Equivalent to an exclusive scan of row sizes.
//                            If supportEmpty is false: must be ascending.
//                            If supportEmpty is true: must be non-descending.
// SourcesIt sources_global - An indirection array to source each row's data.
//                            The size of each row i is
//                                size_i = csr_global[i + 1] - csr_global[i].
//                            The starting offset for both the data and column
//                            identifiers is
//                                offset_i = sources_global[i].
//                            The direct Spmv methods (i.e. those not taking
//                            a sources_global parameter) can be thought of as
//                            indirect methods with sources_global =
//                            csr_global.
// int numRows              - Size of segment list csr_global. Must be >= 1.
// VecIt vec_global         - Input vector. Its size is the width of the
//                            matrix.
//                            For unary Spmv, these values are reduced.
//                            For binary Spmv, the products of the matrix and
//                            vector values are reduced.
// bool supportEmpty        - The basic seg-reduce code does not support empty
//                            rows.
//                            Set supportEmpty = true to add pre- and post-
//                            processing to support empty rows.
// DestIt dest_global       - Output array. Must be numRows in size.
// T identity               - Identity for the reduction operation. Eg, use 0
//                            for addition or 1 for multiplication.
// MulOp mulOp              - Reduction operator for combining the matrix
//                            value with the vector value. Only defined for
//                            binary Spmv.
//                            Use mgpu::multiplies<T>() for default behavior.
// AddOp addOp              - Reduction operator for reducing vector values
//                            (unary Spmv) or matrix-vector products (binary
//                            Spmv). Use mgpu::plus<T>() for default behavior.
// CudaContext& context     - MGPU context support object. All kernels are
//                            launched on the associated stream.
template<typename ColsIt, typename CsrIt, typename VecIt, typename DestIt,
    typename T, typename AddOp>
MGPU_HOST void SpmvCsrUnary(ColsIt cols_global, int nz, CsrIt csr_global,
    int numRows, VecIt vec_global, bool supportEmpty, DestIt dest_global,
    T identity, AddOp addOp, CudaContext& context);

template<typename MatrixIt, typename ColsIt, typename CsrIt, typename VecIt,
    typename DestIt, typename T, typename MulOp, typename AddOp>
MGPU_HOST void SpmvCsrBinary(MatrixIt matrix_global, ColsIt cols_global,
    int nz, CsrIt csr_global, int numRows, VecIt vec_global,
    bool supportEmpty, DestIt dest_global, T identity, MulOp mulOp, AddOp addOp,
    CudaContext& context);
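
// Example (an illustrative sketch, not part of the original header):
// y = A * x for a 2x3 matrix with 4 non-zeros:
//     A = [ 1 0 2 ]      x = [ 1 1 1 ]^T
//         [ 0 3 4 ]
//     const double hVals[4] = { 1, 2, 3, 4 };
//     const int hCols[4] = { 0, 2, 1, 2 };
//     const int hCsr[2] = { 0, 2 };
//     MGPU_MEM(double) vals = context->Malloc(hVals, 4);
//     MGPU_MEM(int) cols = context->Malloc(hCols, 4);
//     MGPU_MEM(int) csr = context->Malloc(hCsr, 2);
//     MGPU_MEM(double) x = context->Fill<double>(3, 1.0);
//     MGPU_MEM(double) y = context->Malloc<double>(2);
//     SpmvCsrBinary(vals->get(), cols->get(), 4, csr->get(), 2, x->get(),
//         false, y->get(), 0.0, mgpu::multiplies<double>(),
//         mgpu::plus<double>(), *context);
//     // y = 3 7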

template<typename ColsIt, typename CsrIt, typename SourcesIt, typename VecIt,
    typename DestIt, typename T, typename AddOp>
MGPU_HOST void SpmvCsrIndirectUnary(ColsIt cols_global, int nz,
    CsrIt csr_global, SourcesIt sources_global, int numRows, VecIt vec_global,
    bool supportEmpty, DestIt dest_global, T identity, AddOp addOp,
    CudaContext& context);

template<typename MatrixIt, typename ColsIt, typename CsrIt, typename SourcesIt,
    typename VecIt, typename DestIt, typename T, typename MulOp, typename AddOp>
MGPU_HOST void SpmvCsrIndirectBinary(MatrixIt matrix_global, ColsIt cols_global,
    int nz, CsrIt csr_global, SourcesIt sources_global, int numRows,
    VecIt vec_global, bool supportEmpty, DestIt dest_global, T identity,
    MulOp mulOp, AddOp addOp, CudaContext& context);

// SpmvPreprocess[Unary|Binary] accelerates multiple Spmv calls on different
// matrix/vector pairs with the same matrix geometry. The actual reduction is
// evaluated by Spmv[Unary|Binary]Apply.
template<typename T, typename CsrIt>
MGPU_HOST void SpmvPreprocessUnary(int nz, CsrIt csr_global, int numRows,
    bool supportEmpty, std::auto_ptr<SpmvPreprocessData>* ppData,
    CudaContext& context);

template<typename T, typename CsrIt>
MGPU_HOST void SpmvPreprocessBinary(int nz, CsrIt csr_global, int numRows,
    bool supportEmpty, std::auto_ptr<SpmvPreprocessData>* ppData,
    CudaContext& context);

template<typename ColsIt, typename VecIt, typename DestIt, typename T,
    typename MulOp, typename AddOp>
MGPU_HOST void SpmvUnaryApply(const SpmvPreprocessData& preprocess,
    ColsIt cols_global, VecIt vec_global, DestIt dest_global, T identity,
    AddOp addOp, CudaContext& context);

template<typename MatrixIt, typename ColsIt, typename VecIt, typename DestIt,
    typename T, typename MulOp, typename AddOp>
MGPU_HOST void SpmvBinaryApply(const SpmvPreprocessData& preprocess,
    MatrixIt matrix_global, ColsIt cols_global, VecIt vec_global,
    DestIt dest_global, T identity, MulOp mulOp, AddOp addOp,
    CudaContext& context);

} // namespace mgpu