mgpudevice.cuh 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. /******************************************************************************
  2. * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
  3. *
  4. * Redistribution and use in source and binary forms, with or without
  5. * modification, are permitted provided that the following conditions are met:
  6. * * Redistributions of source code must retain the above copyright
  7. * notice, this list of conditions and the following disclaimer.
  8. * * Redistributions in binary form must reproduce the above copyright
  9. * notice, this list of conditions and the following disclaimer in the
  10. * documentation and/or other materials provided with the distribution.
  11. * * Neither the name of the NVIDIA CORPORATION nor the
  12. * names of its contributors may be used to endorse or promote products
  13. * derived from this software without specific prior written permission.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18. * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  19. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  21. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  22. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  24. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. *
  26. ******************************************************************************/
  27. /******************************************************************************
  28. *
  29. * Code and text by Sean Baxter, NVIDIA Research
  30. * See http://nvlabs.github.io/moderngpu for repository and documentation.
  31. *
  32. ******************************************************************************/
  33. #pragma once
  34. #include "mgpuenums.h"
  35. #include "device/deviceutil.cuh"
  36. namespace mgpu {
  37. ////////////////////////////////////////////////////////////////////////////////
  38. // device/loadstore.cuh
  39. // For 0 <= i < VT:
  40. // index = NT * i + tid;
  41. // reg[i] = data[index];
  42. // Synchronize after load.
  43. template<int NT, int VT, typename InputIt, typename T>
  44. MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
  45. bool sync = true);
  46. // For 0 <= i < VT:
  47. // index = NT * i + tid;
  48. // if(index < count) reg[i] = data[index];
  49. // No synchronize after load.
  50. template<int NT, int VT, typename InputIt, typename T>
  51. MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
  52. T* reg, bool sync = false);
  53. template<int NT, int VT, typename InputIt, typename T>
  54. MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
  55. T* reg, T init, bool sync = false);
  56. // For 0 <= i < VT1:
  57. // index = NT * i + tid;
  58. // if(index < count) reg[i] = data[index];
  59. // No synchronize after load.
  60. template<int NT, int VT0, int VT1, typename InputIt, typename T>
  61. MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
  62. T* reg, bool sync = false);
  63. // For 0 <= i < VT1:
  64. // index = NT * i + tid;
  65. // if(index < count) reg[i] = data[index];
  66. // No synchronize after load.
  67. template<int NT, int VT0, int VT1, typename InputIt, typename T>
  68. MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
  69. T* reg, T init, bool sync = false);
  70. // For 0 <= i < VT:
  71. // index = NT * i + tid;
  72. // if(index < count) reg[i] = data[index];
  73. // No synchronize after load.
  74. // No optimized code path for count < NV (smaller generated code).
  75. template<int NT, int VT, typename InputIt, typename T>
  76. MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
  77. T* reg, bool sync = false);
  78. // For 0 <= i < VT:
  79. // index = VT * tid + i.
  80. // if(index < count) reg[i] = data[index];
  81. // No synchronize after load.
  82. template<int NT, int VT, typename InputIt, typename T>
  83. MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
  84. T* reg);
  85. template<int NT, int VT, typename InputIt, typename T>
  86. MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
  87. T* reg, T init);
  88. // For 0 <= i < VT:
  89. // index = NT * i + tid;
  90. // dest[index] = reg[i];
  91. // Synchronize after store.
  92. template<int NT, int VT, typename OutputIt, typename T>
  93. MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
  94. bool sync = true);
  95. // For 0 <= i < VT:
  96. // index = NT * i + tid;
  97. // if(index < count) dest[index] = reg[i];
  98. // No synchronize after store.
  99. template<int NT, int VT, typename OutputIt, typename T>
  100. MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
  101. OutputIt dest, bool sync = false);
  102. // For 0 <= index < count:
  103. // dest[index] = source[index];
  104. // This function is intended to replace DeviceGlobalToShared in cases where
  105. // count is much less than NT * VT.
  106. template<int NT, typename InputIt, typename OutputIt>
  107. MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
  108. OutputIt dest, bool sync = true);
  109. // For 0 <= index < count:
  110. // dest[index] = source[index];
  111. // Synchronize after store.
  112. template<int NT, int VT, typename T, typename OutputIt>
  113. MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
  114. OutputIt dest, bool sync = true);
  115. // For 0 <= index < count:
  116. // dest[index] = source[index];
  117. // Synchronize after store.
  118. template<int NT, int VT, typename InputIt, typename T>
  119. MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
  120. T* dest, bool sync = true);
  121. template<int NT, int VT0, int VT1, typename InputIt, typename T>
  122. MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
  123. T* dest, bool sync = true);
  124. // For 0 <= index < count:
  125. // dest[index] = source[index];
  126. // Synchronize after store.
  127. // No optimized code path for count < NV (smaller generated code).
  128. template<int NT, int VT, typename InputIt, typename T>
  129. MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
  130. T* dest, bool sync = true);
  131. template<int NT, int VT, typename InputIt, typename T>
  132. MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
  133. T* dest, T init, bool sync = true);
  134. template<int NT, int VT0, int VT1, typename InputIt, typename T>
  135. MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
  136. int tid, T* dest, T init, bool sync = true);
  137. // For 0 <= index < count:
  138. // dest[index] = source[index];
  139. // No synchronize.
  140. template<int NT, int VT, typename InputIt, typename OutputIt>
  141. MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
  142. OutputIt dest, bool sync = false);
  143. // Transpose VT elements in NT threads (x) into thread-order registers (y)
  144. // using only NT * VT / 2 elements of shared memory.
  145. template<int NT, int VT, typename T>
  146. MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
  147. // For 0 <= i < VT:
  148. // index = NT * i + tid;
  149. // if(index < count)
  150. // gather = indices[i];
  151. // reg[i] = data[gather];
  152. // Synchronize after load.
  153. template<int NT, int VT, typename InputIt, typename T>
  154. MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
  155. int tid, T* reg, bool sync = true);
  156. template<int NT, int VT, typename InputIt, typename T>
  157. MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
  158. int tid, T* reg, T identity, bool sync = true);
  159. // For 0 <= i < VT:
  160. // index = NT * i + tid;
  161. // if(index < count)
  162. // scatter = indices[i];
  163. // data[scatter] = reg[i];
  164. // Synchronize after store.
  165. template<int NT, int VT, typename T, typename OutputIt>
  166. MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
  167. int indices[VT], OutputIt data, bool sync = true);
  168. // For 0 <= i < VT:
  169. // shared[VT * tid + i] = threadReg[i];
  170. // Synchronize after store.
  171. // Note this function moves data in THREAD ORDER.
  172. // (DeviceRegToShared moves data in STRIDED ORDER).
  173. template<int VT, typename T>
  174. MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
  175. bool sync = true);
  176. // For 0 <= i < VT:
  177. // threadReg[i] = shared[VT * tid + i];
  178. // Synchronize after load.
  179. // Note this function moves data in THREAD ORDER.
  180. // (DeviceSharedToReg moves data in STRIDED ORDER).
  181. template<int VT, typename T>
  182. MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
  183. bool sync = true);
  184. // For 0 <= index < aCount:
  185. // shared[index] = a_global[index];
  186. // For 0 <= index < bCount:
  187. // shared[aCount + index] = b_global[index];
  188. // VT0 is the lower-bound for predication-free execution:
  189. // If count >= NT * VT0, a predication-free branch is taken.
  190. // VT1 is the upper-bound for loads:
  191. // NT * VT1 must be >= aCount + bCount.
  192. template<int NT, int VT0, int VT1, typename T>
  193. MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
  194. const T* b_global, int bCount, int tid, T* reg, bool sync = false);
  195. template<int NT, int VT0, int VT1, typename T>
  196. MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
  197. const T* b_global, int bCount, int tid, T* shared, bool sync = true);
  198. template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
  199. typename T>
  200. MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
  201. InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
  202. template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
  203. typename T>
  204. MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
  205. InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
  206. // For 0 <= i < VT
  207. // index = NT * i + tid;
  208. // if(index < count)
  209. // gather = indices_shared[index];
  210. // dest_global[index] = data_global[gather];
  211. // Synchronize after load.
  212. template<int NT, int VT, typename InputIt, typename OutputIt>
  213. MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
  214. const int* indices_shared, int tid, OutputIt dest_global,
  215. bool sync = true);
  216. // For 0 <= i < VT
  217. // index = NT * i + tid
  218. // if(index < count)
  219. // gather = indices[index];
  220. // if(gather < aCount) data = a_global[gather];
  221. // else data = b_global[gather - aCount];
  222. // dest_global[index] = data;
  223. // Synchronize after load.
  224. template<int NT, int VT, typename InputIt1, typename InputIt2,
  225. typename T>
  226. MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
  227. InputIt2 b_global, int bStart, const int* indices, int tid,
  228. T* reg, bool sync = false);
  229. template<int NT, int VT, typename InputIt1, typename InputIt2,
  230. typename OutputIt>
  231. MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
  232. InputIt2 b_global, int bStart, const int* indices_shared, int tid,
  233. OutputIt dest_global, bool sync = true);
  234. template<int NT, int VT, typename T>
  235. MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
  236. const T* b_global, int bStart, const int* indices, int tid,
  237. T* reg, bool sync = false);
  238. template<int NT, int VT, typename T, typename OutputIt>
  239. MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
  240. const T* b_global, int bStart, const int* indices_shared, int tid,
  241. OutputIt dest_global, bool sync = true);
  242. } // namespace mgpu
  243. #include "device/launchbox.cuh"
  244. #include "device/loadstore.cuh"
  245. #include "device/ctasegscan.cuh"