util_arch.cuh 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. /******************************************************************************
  2. * Copyright (c) 2011, Duane Merrill. All rights reserved.
  3. * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. * * Redistributions of source code must retain the above copyright
  8. * notice, this list of conditions and the following disclaimer.
  9. * * Redistributions in binary form must reproduce the above copyright
  10. * notice, this list of conditions and the following disclaimer in the
  11. * documentation and/or other materials provided with the distribution.
  12. * * Neither the name of the NVIDIA CORPORATION nor the
  13. * names of its contributors may be used to endorse or promote products
  14. * derived from this software without specific prior written permission.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  17. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  18. * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  20. * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  21. * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  23. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  25. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *
  27. ******************************************************************************/
  28. /**
  29. * \file
  30. * Static architectural properties by SM version.
  31. */
  32. #pragma once
  33. #include "util_namespace.cuh"
  34. /// Optional outer namespace(s)
  35. CUB_NS_PREFIX
  36. /// CUB namespace
  37. namespace cub {
  38. /**
  39. * \addtogroup UtilMgmt
  40. * @{
  41. */
  /// CUB_PTX_VERSION reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
  /// It expands to __CUDA_ARCH__ when that macro is defined, and to 0 otherwise.
  #ifndef __CUDA_ARCH__
      #define CUB_PTX_VERSION 0
  #else
      #define CUB_PTX_VERSION __CUDA_ARCH__
  #endif
  /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
  /// CUB_RUNTIME_ENABLED is defined (with no value) when CUB_PTX_VERSION is 0 or when CUB_CDP is defined; otherwise it is left undefined.
  #if (CUB_PTX_VERSION == 0) || defined(CUB_CDP)
      #define CUB_RUNTIME_ENABLED
  #endif
  /// Log2 of the number of threads per warp (5, i.e. 1 << 5 == 32 threads).
  /// The \p arch argument is accepted for uniformity with the other per-architecture macros but is not used.
  #define CUB_LOG_WARP_THREADS(arch) \
      (5)
  /// Log2 of the number of smem banks: 5 when arch >= 200, otherwise 4.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// (e.g. a conditional) can be passed without changing how the comparison binds.
  #define CUB_LOG_SMEM_BANKS(arch) \
      (((arch) >= 200) ? \
          (5) : \
          (4))
  /// Number of bytes per smem bank (4 for every architecture).
  /// The \p arch argument is accepted for uniformity with the other per-architecture macros but is not used.
  #define CUB_SMEM_BANK_BYTES(arch) \
      (4)
  /// Number of smem bytes provisioned per SM: 48 KB when arch >= 200, otherwise 16 KB.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparison binds.
  #define CUB_SMEM_BYTES(arch) \
      (((arch) >= 200) ? \
          (48 * 1024) : \
          (16 * 1024))
  /// Smem allocation size in bytes: 256 when arch >= 300, 128 when arch >= 200, otherwise 512.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparisons bind.
  #define CUB_SMEM_ALLOC_UNIT(arch) \
      (((arch) >= 300) ? (256) : \
       ((arch) >= 200) ? (128) : \
                         (512))
  /// Whether or not the architecture allocates registers by block (or by warp):
  /// false when arch >= 200, true otherwise.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparison binds.
  #define CUB_REGS_BY_BLOCK(arch) \
      (((arch) >= 200) ? \
          (false) : \
          (true))
  /// Number of registers allocated at a time per block (or by warp):
  /// 256 when arch >= 300, 64 when arch >= 200, 512 when arch >= 120, otherwise 256.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparisons bind.
  #define CUB_REG_ALLOC_UNIT(arch) \
      (((arch) >= 300) ? (256) : \
       ((arch) >= 200) ? (64) : \
       ((arch) >= 120) ? (512) : \
                         (256))
  /// Granularity of warps for which registers are allocated: 4 when arch >= 300, otherwise 2.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparison binds.
  #define CUB_WARP_ALLOC_UNIT(arch) \
      (((arch) >= 300) ? \
          (4) : \
          (2))
  /// Maximum number of threads per SM:
  /// 2048 when arch >= 300, 1536 when arch >= 200, 1024 when arch >= 120, otherwise 768.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparisons bind.
  #define CUB_MAX_SM_THREADS(arch) \
      (((arch) >= 300) ? (2048) : \
       ((arch) >= 200) ? (1536) : \
       ((arch) >= 120) ? (1024) : \
                         (768))
  /// Maximum number of thread blocks per SM: 16 when arch >= 300, otherwise 8.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparison binds.
  #define CUB_MAX_SM_BLOCKS(arch) \
      (((arch) >= 300) ? \
          (16) : \
          (8))
  /// Maximum number of threads per thread block: 1024 when arch >= 200, otherwise 512.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparison binds.
  #define CUB_MAX_BLOCK_THREADS(arch) \
      (((arch) >= 200) ? \
          (1024) : \
          (512))
  /// Maximum number of registers per SM:
  /// 64K when arch >= 300, 32K when arch >= 200, 16K when arch >= 120, otherwise 8K.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparisons bind.
  #define CUB_MAX_SM_REGISTERS(arch) \
      (((arch) >= 300) ? (64 * 1024) : \
       ((arch) >= 200) ? (32 * 1024) : \
       ((arch) >= 120) ? (16 * 1024) : \
                         (8 * 1024))
  /// Oversubscription factor: 5 when arch >= 300, 3 when arch >= 200, otherwise 10.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparisons bind.
  #define CUB_SUBSCRIPTION_FACTOR(arch) \
      (((arch) >= 300) ? (5) : \
       ((arch) >= 200) ? (3) : \
                         (10))
  /// Prefer X-way conflict over padding: expands to 0 when arch >= 300, otherwise to 4.
  /// \p arch is parenthesized inside the expansion so that an arbitrary expression
  /// can be passed without changing how the comparison binds.
  #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \
      (((arch) >= 300) ? \
          (0) : \
          (4))
  #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

  // Convenience aliases: each CUB_PTX_* macro evaluates the matching per-architecture
  // macro at CUB_PTX_VERSION, i.e. for the compiler pass that is currently active.
  #define CUB_PTX_LOG_WARP_THREADS                CUB_LOG_WARP_THREADS(CUB_PTX_VERSION)
  #define CUB_PTX_WARP_THREADS                    (1 << CUB_PTX_LOG_WARP_THREADS)
  #define CUB_PTX_LOG_SMEM_BANKS                  CUB_LOG_SMEM_BANKS(CUB_PTX_VERSION)
  #define CUB_PTX_SMEM_BANKS                      (1 << CUB_PTX_LOG_SMEM_BANKS)
  #define CUB_PTX_SMEM_BANK_BYTES                 CUB_SMEM_BANK_BYTES(CUB_PTX_VERSION)
  #define CUB_PTX_SMEM_BYTES                      CUB_SMEM_BYTES(CUB_PTX_VERSION)
  #define CUB_PTX_SMEM_ALLOC_UNIT                 CUB_SMEM_ALLOC_UNIT(CUB_PTX_VERSION)
  #define CUB_PTX_REGS_BY_BLOCK                   CUB_REGS_BY_BLOCK(CUB_PTX_VERSION)
  #define CUB_PTX_REG_ALLOC_UNIT                  CUB_REG_ALLOC_UNIT(CUB_PTX_VERSION)
  #define CUB_PTX_WARP_ALLOC_UNIT                 CUB_WARP_ALLOC_UNIT(CUB_PTX_VERSION)
  #define CUB_PTX_MAX_SM_THREADS                  CUB_MAX_SM_THREADS(CUB_PTX_VERSION)
  #define CUB_PTX_MAX_SM_BLOCKS                   CUB_MAX_SM_BLOCKS(CUB_PTX_VERSION)
  #define CUB_PTX_MAX_BLOCK_THREADS               CUB_MAX_BLOCK_THREADS(CUB_PTX_VERSION)
  #define CUB_PTX_MAX_SM_REGISTERS                CUB_MAX_SM_REGISTERS(CUB_PTX_VERSION)
  // Alias for CUB_SUBSCRIPTION_FACTOR, added so that every per-architecture macro has a CUB_PTX_* form.
  #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_VERSION)
  #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_VERSION)

  #endif  // Do not document
  151. /** @} */ // end group UtilMgmt
  152. } // CUB namespace
  153. CUB_NS_POSTFIX // Optional outer namespace(s)