/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup UtilMgmt
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }


/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
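
// Illustrative sketch (not part of the library interface): the usual two-pass
// pattern for AliasTemporaries.  A first call with a NULL d_temp_storage only
// reports the required size; a second call carves the caller's single device
// allocation into 256-byte-aligned sub-allocations.  The sizes below are
// hypothetical (num_items is assumed to be defined by the caller).
//
//     void   *allocations[2];
//     size_t  allocation_sizes[2] = {num_items * sizeof(int),      // e.g., keys
//                                    num_items * sizeof(float)};   // e.g., values
//
//     void   *d_temp_storage     = NULL;
//     size_t  temp_storage_bytes = 0;
//
//     // First pass: compute the total number of bytes needed
//     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
//     cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//     // Second pass: alias allocations[0] and allocations[1] into d_temp_storage
//     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);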

#endif  // DOXYGEN_SHOULD_SKIP_THIS


/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
{
    struct Dummy
    {
        /// Type definition of the EmptyKernel kernel entry point
        typedef void (*EmptyKernelPtr)();

        /// Force EmptyKernel<void> to be generated if this class is used
        __host__ __device__ __forceinline__
        EmptyKernelPtr Empty()
        {
            return EmptyKernel<void>;
        }
    };

#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#elif (CUB_PTX_VERSION > 0)

    ptx_version = CUB_PTX_VERSION;
    return cudaSuccess;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    }
    while (0);

    return error;

#endif
}
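
// Illustrative usage sketch (hypothetical calling code): PtxVersion is
// typically used to pick among tuning policies compiled for different PTX
// architectures.  The encoding matches the brief above, e.g., 350 when the
// PTX bundle targets compute capability 3.5.
//
//     int ptx_version;
//     cudaError_t error = PtxVersion(ptx_version);
//     if (error == cudaSuccess && ptx_version >= 350)
//     {
//         // ... select SM 3.5 tuning policy ...
//     }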

/**
 * \brief Retrieves the SM version (major * 100 + minor * 10)
 */
__host__ __device__ __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#endif
}
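
// Illustrative usage sketch (hypothetical calling code): SmVersion is usually
// paired with cudaGetDevice to obtain the compute capability of the active
// device, encoded the same way (e.g., 350 for an SM 3.5 device).
//
//     int device_ordinal;
//     cudaGetDevice(&device_ordinal);
//
//     int sm_version;
//     SmVersion(sm_version, device_ordinal);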

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * Synchronize the stream if specified
 */
__host__ __device__ __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_VERSION == 0)
    return cudaStreamSynchronize(stream);
#else
    // Device can't yet sync on a specific stream
    return cudaDeviceSynchronize();
#endif
}


/**
 * \brief Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr.
 */
template <typename KernelPtr>
__host__ __device__ __forceinline__
cudaError_t MaxSmOccupancy(
    int                 &max_sm_occupancy,      ///< [out] maximum number of thread blocks that can reside on a single SM
    int                 sm_version,             ///< [in] The SM architecture to run on
    KernelPtr           kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int                 block_threads)          ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        int warp_threads        = 1 << CUB_LOG_WARP_THREADS(sm_version);
        int max_sm_blocks       = CUB_MAX_SM_BLOCKS(sm_version);
        int max_sm_warps        = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
        int regs_by_block       = CUB_REGS_BY_BLOCK(sm_version);
        int max_sm_registers    = CUB_MAX_SM_REGISTERS(sm_version);
        int warp_alloc_unit     = CUB_WARP_ALLOC_UNIT(sm_version);
        int smem_alloc_unit     = CUB_SMEM_ALLOC_UNIT(sm_version);
        int reg_alloc_unit      = CUB_REG_ALLOC_UNIT(sm_version);
        int smem_bytes          = CUB_SMEM_BYTES(sm_version);

        // Get kernel attributes
        cudaFuncAttributes kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;

        // Number of warps per threadblock
        int block_warps = (block_threads + warp_threads - 1) / warp_threads;

        // Max warp occupancy
        int max_warp_occupancy = (block_warps > 0) ?
            max_sm_warps / block_warps :
            max_sm_blocks;

        // Maximum register occupancy
        int max_reg_occupancy;
        if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
        {
            // Prevent divide-by-zero
            max_reg_occupancy = max_sm_blocks;
        }
        else if (regs_by_block)
        {
            // Allocates registers by threadblock
            int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
            max_reg_occupancy = max_sm_registers / block_regs;
        }
        else
        {
            // Allocates registers by warp
            int sm_sides                = warp_alloc_unit;
            int sm_registers_per_side   = max_sm_registers / sm_sides;
            int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
            int warps_per_side          = sm_registers_per_side / regs_per_warp;
            int warps                   = warps_per_side * sm_sides;
            max_reg_occupancy           = warps / block_warps;
        }

        // Shared memory per threadblock
        int block_allocated_smem = CUB_ROUND_UP_NEAREST(
            kernel_attrs.sharedSizeBytes,
            smem_alloc_unit);

        // Max shared memory occupancy
        int max_smem_occupancy = (block_allocated_smem > 0) ?
            (smem_bytes / block_allocated_smem) :
            max_sm_blocks;

        // Max occupancy
        max_sm_occupancy = CUB_MIN(
            CUB_MIN(max_sm_blocks, max_warp_occupancy),
            CUB_MIN(max_smem_occupancy, max_reg_occupancy));

        // printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);

    } while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}
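
// Worked example of the arithmetic above (illustrative; assumes approximate
// SM 3.5 limits of 64 resident warps, 16 resident blocks, and 48 KB of shared
// memory per SM) for a hypothetical kernel launched with 64 threads per block
// and 4 KB of statically-allocated shared memory:
//
//     block_warps          = 64 threads / 32                   = 2
//     max_warp_occupancy   = 64 warps / 2                      = 32
//     max_smem_occupancy   = 48 KB / 4 KB                      = 12
//     max_sm_occupancy     = min(16, 32, 12, max_reg_occupancy) = 12   (when registers are not the limiter)
//
// This is the same arithmetic behind the "12 on SM35" figure in the documented
// snippet below.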

#endif  // Do not document


/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 *
 * \par Snippet
 * The code snippet below illustrates the use of the MaxSmOccupancy function.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
 *
 * template <typename T>
 * __global__ void ExampleKernel()
 * {
 *     // Statically allocate a shared memory buffer for the thread block
 *     __shared__ volatile T buffer[4096];
 *
 *     ...
 * }
 *
 * ...
 *
 * // Determine SM occupancy for ExampleKernel specialized for unsigned char
 * int max_sm_occupancy;
 * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
 *
 * // max_sm_occupancy  <-- 4 on SM10
 * // max_sm_occupancy  <-- 8 on SM20
 * // max_sm_occupancy  <-- 12 on SM35
 *
 * \endcode
 *
 */
template <typename KernelPtr>
__host__ __device__ __forceinline__
cudaError_t MaxSmOccupancy(
    int                 &max_sm_occupancy,      ///< [out] maximum number of thread blocks that can reside on a single SM
    KernelPtr           kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int                 block_threads)          ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    cudaError_t error = cudaSuccess;
    do
    {
        // Get device ordinal
        int device_ordinal;
        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

        // Get device SM version
        int sm_version;
        if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;

        // Get SM occupancy
        if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;

    } while (0);

    return error;

#endif  // CUB_RUNTIME_ENABLED
}
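
// Illustrative follow-on (hypothetical names, reusing ExampleKernel from the
// snippet above): the per-SM occupancy figure is commonly scaled by the number
// of SMs on the device to size an occupancy-saturating grid.
//
//     int device_ordinal;
//     cudaGetDevice(&device_ordinal);
//
//     int sm_count;
//     cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal);
//
//     int max_sm_occupancy;
//     MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
//
//     int grid_size = max_sm_occupancy * sm_count;    // thread blocks to launch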

/** @} */       // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)