123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290 |
- /******************************************************************************
- * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
- /******************************************************************************
- *
- * Code and text by Sean Baxter, NVIDIA Research
- * See http://nvlabs.github.io/moderngpu for repository and documentation.
- *
- ******************************************************************************/
- #pragma once
- #include "mgpuenums.h"
- #include "device/deviceutil.cuh"
- namespace mgpu {
- ////////////////////////////////////////////////////////////////////////////////
- // device/loadstore.cuh
- // Strided-order load into registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     reg[i] = data[index];   (no count parameter, so no bounds guard is described)
- // Synchronize after load (default sync = true).
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
- bool sync = true);
- // Guarded strided-order load into registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count) reg[i] = data[index];
- // No synchronize after load by default (sync = false). DeviceGlobalToRegDefault also takes init, presumably the value used for slots with index >= count (TODO confirm against the definition).
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
- T* reg, bool sync = false);
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
- T* reg, T init, bool sync = false);
- // VT0/VT1 overload. For 0 <= i < VT (this overload has no VT parameter; presumably VT1 is meant - TODO confirm):
- //     index = NT * i + tid;
- //     if(index < count) reg[i] = data[index];
- // No synchronize after load by default (sync = false). NOTE(review): the other VT0/VT1 declarations in this header carry a "2" suffix (DeviceGlobalToRegDefault2, DeviceGlobalToShared2); confirm this name matches its definition.
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
- T* reg, bool sync = false);
- // VT0/VT1 variant with a default value. For 0 <= i < VT (no VT parameter here; presumably VT1 is meant - TODO confirm):
- //     index = NT * i + tid;
- //     if(index < count) reg[i] = data[index];   (init is presumably used otherwise - TODO confirm)
- // No synchronize after load by default (sync = false).
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
- T* reg, T init, bool sync = false);
- // Guarded strided-order load into registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count) reg[i] = data[index];
- // No synchronize after load by default (sync = false).
- // No optimized code path for count < NV (smaller generated code). NV is not defined in this header; presumably NT * VT (TODO confirm).
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
- T* reg, bool sync = false);
- // Guarded thread-order load into registers. For 0 <= i < VT:
- //     index = VT * tid + i;
- //     if(index < count) reg[i] = data[index];
- // No synchronize after load (these declarations take no sync parameter). DeviceGlobalToThreadDefault also takes init, presumably the value used when index >= count (TODO confirm).
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
- T* reg);
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
- T* reg, T init);
- // Strided-order store from registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     dest[index] = reg[i];   (the signature has no count, so no index < count guard applies)
- // Synchronize after store (default sync = true).
- template<int NT, int VT, typename OutputIt, typename T>
- MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
- bool sync = true);
- // Guarded strided-order store from registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count) dest[index] = reg[i];
- // No synchronize after store by default (sync = false).
- template<int NT, int VT, typename OutputIt, typename T>
- MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
- OutputIt dest, bool sync = false);
- // For 0 <= index < count:
- //     dest[index] = source[index];
- // This function is intended to replace DeviceGlobalToShared in cases where
- // count is much less than NT * VT. It takes no VT parameter itself; sync defaults to true.
- template<int NT, typename InputIt, typename OutputIt>
- MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
- OutputIt dest, bool sync = true);
- // Copy count elements from source to dest. For 0 <= index < count:
- //     dest[index] = source[index];
- // Synchronize after store (default sync = true).
- template<int NT, int VT, typename T, typename OutputIt>
- MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
- OutputIt dest, bool sync = true);
- // Copy count elements from source to dest. For 0 <= index < count:
- //     dest[index] = source[index];
- // Synchronize after store (default sync = true). DeviceGlobalToShared2 is the VT0/VT1 form of the same declaration.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
- T* dest, bool sync = true);
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
- T* dest, bool sync = true);
- // Copy count elements from source to dest. For 0 <= index < count:
- //     dest[index] = source[index];
- // Synchronize after store (default sync = true). The Default/Default2 variants also take init, presumably written to slots at or beyond count (TODO confirm).
- // No optimized code path for count < NV (smaller generated code). NV is not defined in this header; presumably NT * VT (TODO confirm).
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
- T* dest, bool sync = true);
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
- T* dest, T init, bool sync = true);
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
- int tid, T* dest, T init, bool sync = true);
- // Copy count elements from source to dest. For 0 <= index < count:
- //     dest[index] = source[index];
- // No synchronize by default (sync = false).
- template<int NT, int VT, typename InputIt, typename OutputIt>
- MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
- OutputIt dest, bool sync = false);
- // Transpose VT elements in NT threads (x) into thread-order registers (y)
- // using only NT * VT / 2 elements of shared memory (the shared argument).
- template<int NT, int VT, typename T>
- MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
- // Guarded gather into registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count)
- //         gather = indices[i];   (indices is declared int[VT], so it is presumably indexed by i rather than by index - TODO confirm)
- //         reg[i] = data[gather];
- // Synchronize after load (default sync = true). DeviceGatherDefault also takes identity, presumably stored when index >= count (TODO confirm).
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
- int tid, T* reg, bool sync = true);
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
- int tid, T* reg, T identity, bool sync = true);
- // Guarded scatter from registers. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count)
- //         scatter = indices[i];   (indices is declared int[VT], so it is presumably indexed by i rather than by index - TODO confirm)
- //         data[scatter] = reg[i];
- // Synchronize after store (default sync = true).
- template<int NT, int VT, typename T, typename OutputIt>
- MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
- int indices[VT], OutputIt data, bool sync = true);
- // Thread-order store from registers. For 0 <= i < VT:
- //     shared[VT * tid + i] = threadReg[i];
- // Synchronize after store (default sync = true).
- // Note this function moves data in THREAD ORDER and takes no NT parameter.
- // (DeviceRegToShared moves data in STRIDED ORDER: index = NT * i + tid).
- template<int VT, typename T>
- MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
- bool sync = true);
- // Thread-order load into registers. For 0 <= i < VT:
- //     threadReg[i] = shared[VT * tid + i];
- // Synchronize after load (default sync = true).
- // Note this function moves data in THREAD ORDER and takes no NT parameter.
- // (DeviceSharedToReg moves data in STRIDED ORDER: index = NT * i + tid).
- template<int VT, typename T>
- MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
- bool sync = true);
- // Load two input ranges back to back. For 0 <= index < aCount:
- //     shared[index] = a_global[index];
- // For 0 <= index < bCount:
- //     shared[aCount + index] = b_global[index];
- // VT0 is the lower-bound for predication-free execution:
- //     If count >= NT * VT0, a predication-free branch is taken (count is presumably aCount + bCount - TODO confirm).
- // VT1 is the upper-bound for loads:
- //     NT * VT1 must be >= aCount + bCount. The ToReg overloads take reg instead of shared and default to sync = false; the ToShared overloads default to sync = true.
- template<int NT, int VT0, int VT1, typename T>
- MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
- const T* b_global, int bCount, int tid, T* reg, bool sync = false);
- template<int NT, int VT0, int VT1, typename T>
- MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
- const T* b_global, int bCount, int tid, T* shared, bool sync = true);
- template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
- typename T>
- MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
- InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
- template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
- typename T>
- MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
- InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
- // Guarded gather from data_global to dest_global. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count)
- //         gather = indices_shared[index];
- //         dest_global[index] = data_global[gather];
- // Synchronize after load (default sync = true).
- template<int NT, int VT, typename InputIt, typename OutputIt>
- MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
- const int* indices_shared, int tid, OutputIt dest_global,
- bool sync = true);
- // Gather values from two inputs through merge indices. For 0 <= i < VT:
- //     index = NT * i + tid;
- //     if(index < count)
- //         gather = indices[index];
- //         if(gather < aCount) data = a_global[gather];   (no aCount parameter exists; bStart presumably plays this role - TODO confirm)
- //         else data = b_global[gather - aCount];
- //         dest_global[index] = data;   (the Reg overloads take reg instead of dest_global)
- // Synchronize after load for the Shared overloads (default sync = true); the Reg overloads default to sync = false.
- template<int NT, int VT, typename InputIt1, typename InputIt2,
- typename T>
- MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
- InputIt2 b_global, int bStart, const int* indices, int tid,
- T* reg, bool sync = false);
- template<int NT, int VT, typename InputIt1, typename InputIt2,
- typename OutputIt>
- MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
- InputIt2 b_global, int bStart, const int* indices_shared, int tid,
- OutputIt dest_global, bool sync = true);
- template<int NT, int VT, typename T>
- MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
- const T* b_global, int bStart, const int* indices, int tid,
- T* reg, bool sync = false);
- template<int NT, int VT, typename T, typename OutputIt>
- MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
- const T* b_global, int bStart, const int* indices_shared, int tid,
- OutputIt dest_global, bool sync = true);
- } // namespace mgpu
- #include "device/launchbox.cuh"
- #include "device/loadstore.cuh"
- #include "device/ctasegscan.cuh"
|