/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * PTX intrinsics
 */

#pragma once

#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilPtx
 * @{
 */


/******************************************************************************
 * PTX helper macros
 ******************************************************************************/

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Register modifier for pointer-types (for inlining PTX assembly)
 */
#if defined(_WIN64) || defined(__LP64__)
    #define __CUB_LP64__ 1
    // 64-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "l"
    #define _CUB_ASM_PTR_SIZE_ "u64"
#else
    #define __CUB_LP64__ 0
    // 32-bit register modifier for inlined asm
    #define _CUB_ASM_PTR_ "r"
    #define _CUB_ASM_PTR_SIZE_ "u32"
#endif

#endif // DOXYGEN_SHOULD_SKIP_THIS

/******************************************************************************
 * Inlined PTX intrinsics
 ******************************************************************************/

/**
 * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHR_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int ret;
#if CUB_PTX_VERSION >= 200
    asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x >> shift) + addend;
#endif
    return ret;
}


/**
 * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
 */
__device__ __forceinline__ unsigned int SHL_ADD(
    unsigned int x,
    unsigned int shift,
    unsigned int addend)
{
    unsigned int ret;
#if CUB_PTX_VERSION >= 200
    asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
#else
    ret = (x << shift) + addend;
#endif
    return ret;
}
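
/*
 * Example (illustrative sketch, not part of the library): the two
 * shift-add helpers above fold a shift and an add into a single video
 * instruction on SM2.0+, which is convenient for strided address math.
 * The kernel name and output buffer below are hypothetical.
 *
 *     __global__ void ShiftAddExample(unsigned int *d_out)
 *     {
 *         unsigned int tid = threadIdx.x;
 *
 *         unsigned int quarter = SHR_ADD(tid, 2, 16);   // (tid >> 2) + 16
 *         unsigned int scaled  = SHL_ADD(tid, 4, 3);    // (tid << 4) + 3
 *
 *         d_out[tid] = quarter + scaled;
 *     }
 */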

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Bitfield-extract for types of up to four bytes.
 */
template <typename UnsignedBits, int BYTE_LEN>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits            source,
    unsigned int            bit_start,
    unsigned int            num_bits,
    Int2Type<BYTE_LEN>      byte_len)
{
    unsigned int bits;
#if CUB_PTX_VERSION >= 200
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
#else
    const unsigned int MASK = (1 << num_bits) - 1;
    bits = (source >> bit_start) & MASK;
#endif
    return bits;
}


/**
 * Bitfield-extract for 64-bit types.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits            source,
    unsigned int            bit_start,
    unsigned int            num_bits,
    Int2Type<8>             byte_len)
{
    const unsigned long long MASK = (1ull << num_bits) - 1;
    return (source >> bit_start) & MASK;
}

#endif // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
 */
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
    UnsignedBits source,
    unsigned int bit_start,
    unsigned int num_bits)
{
    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
}
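
/*
 * Example (illustrative sketch): BFE is the digit-extraction primitive a
 * radix-sort pass would use, pulling a fixed-width field out of a key.
 * The values below are hypothetical.
 *
 *     unsigned int key   = 0xDEADBEEF;
 *     unsigned int digit = BFE(key, 8, 4);    // bits [8,12) of the key == 0xE
 */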

/**
 * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
 */
__device__ __forceinline__ void BFI(
    unsigned int &ret,
    unsigned int x,
    unsigned int y,
    unsigned int bit_start,
    unsigned int num_bits)
{
#if CUB_PTX_VERSION >= 200
    asm("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
#else
    // Deposit y's field into x (mirrors the bfi.b32 operand order above)
    y <<= bit_start;
    unsigned int MASK = ((1 << num_bits) - 1) << bit_start;
    ret = (x & ~MASK) | (y & MASK);
#endif
}
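
/*
 * Example (illustrative sketch): BFI is the inverse of BFE -- it deposits
 * a field instead of extracting one.  Hypothetical values:
 *
 *     unsigned int ret;
 *     BFI(ret, 0xFFFFFFFF, 0x0, 8, 4);    // ret == 0xFFFFF0FF
 */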

/**
 * \brief Three-operand add.  Returns \p x + \p y + \p z.
 */
__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
{
#if CUB_PTX_VERSION >= 200
    asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
#else
    x = x + y + z;
#endif
    return x;
}

/**
 * \brief Byte-permute.  Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
 *
 * \par
 * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
 * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}.  For each of the four bytes
 * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
 * the four lower "nibbles" of \p index: {\p index} = {n7, n6, n5, n4, n3, n2, n1, n0}
 *
 * \par Snippet
 * The code snippet below illustrates byte-permute.
 * \par
 * \code
 * #include <cub/cub.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     int a        = 0x03020100;
 *     int b        = 0x07060504;
 *     int index    = 0x00007531;
 *
 *     int selected = PRMT(a, b, index);    // 0x07050301
 *
 * \endcode
 *
 */
__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
{
    int ret;
    asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
    return ret;
}

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Sync-threads barrier.
 */
__device__ __forceinline__ void BAR(int count)
{
    asm volatile("bar.sync 1, %0;" : : "r"(count));
}


/**
 * Floating point multiply.  (Mantissa LSB rounds towards zero.)
 */
__device__ __forceinline__ float FMUL_RZ(float a, float b)
{
    float d;
    asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
    return d;
}


/**
 * Floating point multiply-add.  (Mantissa LSB rounds towards zero.)
 */
__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
{
    float d;
    asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
    return d;
}

#endif // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Terminates the calling thread
 */
__device__ __forceinline__ void ThreadExit()
{
    asm("exit;");
}


/**
 * \brief Returns the warp lane ID of the calling thread
 */
__device__ __forceinline__ unsigned int LaneId()
{
    unsigned int ret;
    asm("mov.u32 %0, %laneid;" : "=r"(ret));
    return ret;
}


/**
 * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
 */
__device__ __forceinline__ unsigned int WarpId()
{
    unsigned int ret;
    asm("mov.u32 %0, %warpid;" : "=r"(ret));
    return ret;
}


/**
 * \brief Returns the warp lane mask of all lanes less than the calling thread
 */
__device__ __forceinline__ unsigned int LaneMaskLt()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret));
    return ret;
}


/**
 * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
 */
__device__ __forceinline__ unsigned int LaneMaskLe()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_le;" : "=r"(ret));
    return ret;
}


/**
 * \brief Returns the warp lane mask of all lanes greater than the calling thread
 */
__device__ __forceinline__ unsigned int LaneMaskGt()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret));
    return ret;
}


/**
 * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
 */
__device__ __forceinline__ unsigned int LaneMaskGe()
{
    unsigned int ret;
    asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret));
    return ret;
}
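
/*
 * Example (illustrative sketch, not part of the library): the lane-mask
 * intrinsics above compose naturally with the __ballot() vote intrinsic.
 * This hypothetical kernel computes each thread's exclusive rank among
 * the lanes of its warp whose flag is set, assuming a full 32-thread warp.
 *
 *     __global__ void BallotRankExample(const int *d_flags, int *d_ranks)
 *     {
 *         int tid = blockIdx.x * blockDim.x + threadIdx.x;
 *
 *         // One ballot bit per lane whose flag is nonzero
 *         unsigned int ballot = __ballot(d_flags[tid]);
 *
 *         // Count the voting lanes strictly below this one
 *         d_ranks[tid] = __popc(ballot & LaneMaskLt());
 *     }
 */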

/** @} */       // end group UtilPtx


/**
 * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
 * \ingroup WarpModule
 *
 * \par
 * - Available only for SM3.0 or newer
 *
 * \par Snippet
 * The code snippet below illustrates each thread obtaining a \p double value from the
 * predecessor of its predecessor.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Obtain one input item per thread
 *     double thread_data = ...
 *
 *     // Obtain item from two ranks below
 *     double peer_data = ShuffleUp(thread_data, 2);
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
 * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
 *
 */
template <typename T>
__device__ __forceinline__ T ShuffleUp(
    T               input,              ///< [in] The value to exchange
    int             src_offset)         ///< [in] The relative down-offset of the peer to read from
{
    enum
    {
        SHFL_C = 0,     // Shuffle control: lanes below src_offset keep their own value
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int       WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T               output;
    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);

    // Shuffle the value one 32-bit word at a time
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shuffle_word = input_alias[WORD];
        asm(
            "  shfl.up.b32 %0, %1, %2, %3;"
            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
        output_alias[WORD] = (ShuffleWord) shuffle_word;
    }

    return output;
}

/**
 * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i + src_offset >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png)
 * \ingroup WarpModule
 *
 * \par
 * - Available only for SM3.0 or newer
 *
 * \par Snippet
 * The code snippet below illustrates each thread obtaining a \p double value from the
 * successor of its successor.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Obtain one input item per thread
 *     double thread_data = ...
 *
 *     // Obtain item from two ranks above
 *     double peer_data = ShuffleDown(thread_data, 2);
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
 * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
 *
 */
template <typename T>
__device__ __forceinline__ T ShuffleDown(
    T               input,              ///< [in] The value to exchange
    int             src_offset)         ///< [in] The relative up-offset of the peer to read from
{
    enum
    {
        SHFL_C = CUB_PTX_WARP_THREADS - 1,      // Shuffle control: lanes whose source exceeds the warp boundary keep their own value
    };

    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int       WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T               output;
    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);

    // Shuffle the value one 32-bit word at a time
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shuffle_word = input_alias[WORD];
        asm(
            "  shfl.down.b32 %0, %1, %2, %3;"
            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
        output_alias[WORD] = (ShuffleWord) shuffle_word;
    }

    return output;
}
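
/*
 * Example (illustrative sketch, not part of the library): ShuffleDown is
 * the building block of the canonical warp-synchronous tree reduction.
 * This hypothetical helper sums one int per lane across a full 32-thread
 * warp; lane 0 holds the warp-wide total on return.
 *
 *     __device__ __forceinline__ int WarpSumExample(int thread_data)
 *     {
 *         // Halve the stride each step: 16, 8, 4, 2, 1
 *         for (int offset = CUB_PTX_WARP_THREADS / 2; offset > 0; offset /= 2)
 *             thread_data += ShuffleDown(thread_data, offset);
 *
 *         return thread_data;   // valid in lane 0
 *     }
 */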

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
 * \ingroup WarpModule
 *
 * \par
 * - Available only for SM3.0 or newer
 */
template <typename T>
__device__ __forceinline__ T ShuffleBroadcast(
    T               input,                  ///< [in] The value to broadcast
    int             src_lane,               ///< [in] Which warp lane is to do the broadcasting
    int             logical_warp_threads)   ///< [in] Number of threads per logical warp
{
    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;

    const int       WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);

    T               output;
    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);

    // Shuffle the value one 32-bit word at a time
    #pragma unroll
    for (int WORD = 0; WORD < WORDS; ++WORD)
    {
        unsigned int shuffle_word = input_alias[WORD];
        asm("shfl.idx.b32 %0, %1, %2, %3;"
            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(logical_warp_threads - 1));
        output_alias[WORD] = (ShuffleWord) shuffle_word;
    }

    return output;
}

#endif // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
 * \ingroup WarpModule
 *
 * \par
 * - Available only for SM3.0 or newer
 *
 * \par Snippet
 * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
 *
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
 *
 * __global__ void ExampleKernel(...)
 * {
 *     // Obtain one input item per thread
 *     double thread_data = ...
 *
 *     // Obtain item from thread 0
 *     double peer_data = ShuffleBroadcast(thread_data, 0);
 *
 * \endcode
 * \par
 * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
 * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
 *
 */
template <typename T>
__device__ __forceinline__ T ShuffleBroadcast(
    T               input,              ///< [in] The value to broadcast
    int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
{
    return ShuffleBroadcast(input, src_lane, CUB_PTX_WARP_THREADS);
}

/**
 * \brief Portable implementation of __all
 * \ingroup WarpModule
 */
__device__ __forceinline__ int WarpAll(int cond)
{
#if CUB_PTX_VERSION < 120

    // Pre-SM1.2 fallback: emulate the vote through shared memory
    // (relies on implicit warp-synchronous execution)
    __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS];

    if (LaneId() == 0)
        warp_signals[WarpId()] = 1;

    if (cond == 0)
        warp_signals[WarpId()] = 0;

    return warp_signals[WarpId()];

#else

    return __all(cond);

#endif
}


/**
 * \brief Portable implementation of __any
 * \ingroup WarpModule
 */
__device__ __forceinline__ int WarpAny(int cond)
{
#if CUB_PTX_VERSION < 120

    // Pre-SM1.2 fallback: emulate the vote through shared memory
    // (relies on implicit warp-synchronous execution)
    __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS];

    if (LaneId() == 0)
        warp_signals[WarpId()] = 0;

    if (cond)
        warp_signals[WarpId()] = 1;

    return warp_signals[WarpId()];

#else

    return __any(cond);

#endif
}
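
/*
 * Example (illustrative sketch, not part of the library): WarpAll and
 * WarpAny behave like the __all / __any vote intrinsics while remaining
 * compilable for targets that predate them.  A hypothetical retry loop
 * in which the whole warp iterates until every lane has succeeded:
 *
 *     __device__ void RetryUntilDone(int *d_status)
 *     {
 *         int done = 0;
 *         while (!WarpAll(done))
 *         {
 *             if (!done)
 *                 done = TryOperation(d_status);   // TryOperation is hypothetical
 *         }
 *     }
 */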

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)