- /******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
- /**
- * \file
- * Properties of a given CUDA device and the corresponding PTX bundle
- */
- #pragma once
- #include "util_arch.cuh"
- #include "util_debug.cuh"
- #include "util_namespace.cuh"
- #include "util_macro.cuh"
- /// Optional outer namespace(s)
- CUB_NS_PREFIX
- /// CUB namespace
- namespace cub {
- /**
- * \addtogroup UtilMgmt
- * @{
- */
- #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
- /**
- * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
- */
- template <typename T>
- __global__ void EmptyKernel(void) { }
- /**
- * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
- */
- template <int ALLOCATIONS>
- __host__ __device__ __forceinline__
- cudaError_t AliasTemporaries(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation
- void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed
- size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed
- {
- const int ALIGN_BYTES = 256;
- const int ALIGN_MASK = ~(ALIGN_BYTES - 1);
- // Compute exclusive prefix sum over allocation requests
- size_t allocation_offsets[ALLOCATIONS];
- size_t bytes_needed = 0;
- for (int i = 0; i < ALLOCATIONS; ++i)
- {
- size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
- allocation_offsets[i] = bytes_needed;
- bytes_needed += allocation_bytes;
- }
- // Check if the caller is simply requesting the size of the storage allocation
- if (!d_temp_storage)
- {
- temp_storage_bytes = bytes_needed;
- return cudaSuccess;
- }
- // Check if enough storage provided
- if (temp_storage_bytes < bytes_needed)
- {
- return CubDebug(cudaErrorInvalidValue);
- }
- // Alias
- for (int i = 0; i < ALLOCATIONS; ++i)
- {
- allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
- }
- return cudaSuccess;
- }
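- // Illustrative sketch (not part of the original header): the typical two-pass usage of
- // AliasTemporaries mirrors the convention of CUB's device-wide algorithms.  A first call
- // with d_temp_storage == NULL only reports the total (aligned) size required; after the
- // caller allocates that much device memory, a second call carves it into the individual
- // aliases.  The sizes below (keys_bytes, values_bytes) are hypothetical placeholders.
- //
- //     void   *allocations[2];
- //     size_t allocation_sizes[2] = {keys_bytes, values_bytes};
- //     size_t temp_storage_bytes  = 0;
- //
- //     // Pass 1: d_temp_storage == NULL, so only the required size is computed
- //     AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes);
- //
- //     // Caller provides a single backing allocation of that size
- //     void *d_temp_storage = NULL;
- //     cudaMalloc(&d_temp_storage, temp_storage_bytes);
- //
- //     // Pass 2: allocations[0] and allocations[1] now point into d_temp_storage,
- //     // each aligned to a 256-byte boundary
- //     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);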
- #endif // DOXYGEN_SHOULD_SKIP_THIS
- /**
- * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
- */
- __host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version)
- {
- struct Dummy
- {
- /// Type definition of the EmptyKernel kernel entry point
- typedef void (*EmptyKernelPtr)();
- /// Force EmptyKernel<void> to be generated if this class is used
- __host__ __device__ __forceinline__
- EmptyKernelPtr Empty()
- {
- return EmptyKernel<void>;
- }
- };
- #ifndef CUB_RUNTIME_ENABLED
- // CUDA API calls not supported from this device
- return cudaErrorInvalidConfiguration;
- #elif (CUB_PTX_VERSION > 0)
- ptx_version = CUB_PTX_VERSION;
- return cudaSuccess;
- #else
- cudaError_t error = cudaSuccess;
- do
- {
- cudaFuncAttributes empty_kernel_attrs;
- if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
- ptx_version = empty_kernel_attrs.ptxVersion * 10;
- }
- while (0);
- return error;
- #endif
- }
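- // Illustrative sketch (not part of the original header): host-side dispatch on the PTX
- // version that was compiled into the current program for the active device.
- //
- //     int ptx_version;
- //     if (PtxVersion(ptx_version) == cudaSuccess)
- //     {
- //         // e.g., ptx_version == 350 when the kernels were compiled for sm_35;
- //         // a caller might select tuning policies based on this value
- //     }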
- /**
- * \brief Retrieves the SM version of the device specified by \p device_ordinal (major * 100 + minor * 10)
- */
- __host__ __device__ __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
- {
- #ifndef CUB_RUNTIME_ENABLED
- // CUDA API calls not supported from this device
- return cudaErrorInvalidConfiguration;
- #else
- cudaError_t error = cudaSuccess;
- do
- {
- // Fill in SM version
- int major, minor;
- if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
- if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
- sm_version = major * 100 + minor * 10;
- }
- while (0);
- return error;
- #endif
- }
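- // Illustrative sketch (not part of the original header): querying the compute capability
- // of the currently selected device.
- //
- //     int device_ordinal, sm_version;
- //     cudaGetDevice(&device_ordinal);
- //     if (SmVersion(sm_version, device_ordinal) == cudaSuccess)
- //     {
- //         // e.g., sm_version == 350 on a compute-capability 3.5 device
- //     }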
- #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
- /**
- * Synchronize the specified stream (host code synchronizes \p stream; device code cannot yet synchronize a specific stream, so it synchronizes the entire device)
- */
- __host__ __device__ __forceinline__
- static cudaError_t SyncStream(cudaStream_t stream)
- {
- #if (CUB_PTX_VERSION == 0)
- return cudaStreamSynchronize(stream);
- #else
- // Device can't yet sync on a specific stream
- return cudaDeviceSynchronize();
- #endif
- }
- /**
- * \brief Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr on an SM of architecture \p sm_version.
- */
- template <typename KernelPtr>
- __host__ __device__ __forceinline__
- cudaError_t MaxSmOccupancy(
- int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM
- int sm_version, ///< [in] The SM architecture to run on
- KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy
- int block_threads) ///< [in] Number of threads per thread block
- {
- #ifndef CUB_RUNTIME_ENABLED
- // CUDA API calls not supported from this device
- return CubDebug(cudaErrorInvalidConfiguration);
- #else
- cudaError_t error = cudaSuccess;
- do
- {
- int warp_threads = 1 << CUB_LOG_WARP_THREADS(sm_version);
- int max_sm_blocks = CUB_MAX_SM_BLOCKS(sm_version);
- int max_sm_warps = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
- int regs_by_block = CUB_REGS_BY_BLOCK(sm_version);
- int max_sm_registers = CUB_MAX_SM_REGISTERS(sm_version);
- int warp_alloc_unit = CUB_WARP_ALLOC_UNIT(sm_version);
- int smem_alloc_unit = CUB_SMEM_ALLOC_UNIT(sm_version);
- int reg_alloc_unit = CUB_REG_ALLOC_UNIT(sm_version);
- int smem_bytes = CUB_SMEM_BYTES(sm_version);
- // Get kernel attributes
- cudaFuncAttributes kernel_attrs;
- if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;
- // Number of warps per threadblock
- int block_warps = (block_threads + warp_threads - 1) / warp_threads;
- // Max warp occupancy
- int max_warp_occupancy = (block_warps > 0) ?
- max_sm_warps / block_warps :
- max_sm_blocks;
- // Maximum register occupancy
- int max_reg_occupancy;
- if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
- {
- // Prevent divide-by-zero
- max_reg_occupancy = max_sm_blocks;
- }
- else if (regs_by_block)
- {
- // Allocates registers by threadblock
- int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
- max_reg_occupancy = max_sm_registers / block_regs;
- }
- else
- {
- // Allocates registers by warp
- int sm_sides = warp_alloc_unit;
- int sm_registers_per_side = max_sm_registers / sm_sides;
- int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
- int warps_per_side = sm_registers_per_side / regs_per_warp;
- int warps = warps_per_side * sm_sides;
- max_reg_occupancy = warps / block_warps;
- }
- // Shared memory per threadblock
- int block_allocated_smem = CUB_ROUND_UP_NEAREST(
- kernel_attrs.sharedSizeBytes,
- smem_alloc_unit);
- // Max shared memory occupancy
- int max_smem_occupancy = (block_allocated_smem > 0) ?
- (smem_bytes / block_allocated_smem) :
- max_sm_blocks;
- // Max occupancy
- max_sm_occupancy = CUB_MIN(
- CUB_MIN(max_sm_blocks, max_warp_occupancy),
- CUB_MIN(max_smem_occupancy, max_reg_occupancy));
- // printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);
- } while (0);
- return error;
- #endif // CUB_RUNTIME_ENABLED
- }
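- // Worked example (purely illustrative figures, not values taken from util_arch.cuh):
- // suppose a kernel uses 32 registers per thread and 8 KB of static shared memory, and is
- // launched with block_threads = 256 on an architecture with warp_threads = 32,
- // max_sm_warps = 64, max_sm_blocks = 16, max_sm_registers = 65536, smem_bytes = 49152,
- // per-warp register allocation (regs_by_block == 0), reg_alloc_unit = 256,
- // warp_alloc_unit = 4, and smem_alloc_unit = 256.  Then:
- //
- //     block_warps          = 256 / 32                      = 8
- //     max_warp_occupancy   = 64 / 8                        = 8
- //     regs_per_warp        = round_up(32 * 32, 256)        = 1024
- //     warps_per_side       = (65536 / 4) / 1024            = 16
- //     max_reg_occupancy    = (16 * 4) / 8                  = 8
- //     block_allocated_smem = round_up(8192, 256)           = 8192
- //     max_smem_occupancy   = 49152 / 8192                  = 6
- //     max_sm_occupancy     = min(16, 8, 8, 6)              = 6   (shared-memory bound)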
- #endif // Do not document
- /**
- * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
- *
- * \par Snippet
- * The code snippet below illustrates the use of the MaxSmOccupancy function.
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/util_device.cuh>
- *
- * template <typename T>
- * __global__ void ExampleKernel()
- * {
- * // Statically allocate a 4096-element shared memory buffer
- * __shared__ volatile T buffer[4096];
- *
- * ...
- * }
- *
- * ...
- *
- * // Determine SM occupancy for ExampleKernel specialized for unsigned char
- * int max_sm_occupancy;
- * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
- *
- * // max_sm_occupancy <-- 4 on SM10
- * // max_sm_occupancy <-- 8 on SM20
- * // max_sm_occupancy <-- 12 on SM35
- *
- * \endcode
- *
- */
- template <typename KernelPtr>
- __host__ __device__ __forceinline__
- cudaError_t MaxSmOccupancy(
- int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM
- KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy
- int block_threads) ///< [in] Number of threads per thread block
- {
- #ifndef CUB_RUNTIME_ENABLED
- // CUDA API calls not supported from this device
- return CubDebug(cudaErrorInvalidConfiguration);
- #else
- cudaError_t error = cudaSuccess;
- do
- {
- // Get device ordinal
- int device_ordinal;
- if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
- // Get device SM version
- int sm_version;
- if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
- // Get SM occupancy
- if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;
- } while (0);
- return error;
- #endif // CUB_RUNTIME_ENABLED
- }
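- // Illustrative sketch (not part of the original header): one common use of the occupancy
- // estimate is to size a maximally-resident grid, i.e., one "full wave" of thread blocks.
- // ExampleKernel refers to the kernel from the snippet above; sm_count is a hypothetical
- // local variable.
- //
- //     int max_sm_occupancy, device_ordinal, sm_count;
- //     cudaGetDevice(&device_ordinal);
- //     cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal);
- //     MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
- //     int grid_size = max_sm_occupancy * sm_count;
- //     ExampleKernel<unsigned char><<<grid_size, 64>>>();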
- /** @} */ // end group UtilMgmt
- } // CUB namespace
- CUB_NS_POSTFIX // Optional outer namespace(s)