123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290 |
- #pragma once
- #include "mgpuenums.h"
- #include "device/deviceutil.cuh"
- namespace mgpu {
// Forward declaration only; no body is given at this point in the header.
// Template arguments: two ints (NT, VT) and two types (InputIt, T).
// NOTE(review): by name this copies values from shared memory (`data`) into
// the per-thread array `reg`; NT/VT are presumably threads-per-block and
// values-per-thread -- confirm against the definition.
// `sync` defaults to true; what it triggers is decided by the definition.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg,
- bool sync = true);
// Forward declaration of the NT/VT form of DeviceGlobalToReg.
// Takes an element count, a source iterator, the caller's tid and a non-const
// T* destination (`reg`).
// NOTE(review): presumably `count` bounds how many elements of `data` may be
// read -- confirm the exact predicate in the definition.
// `sync` defaults to false.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
- T* reg, bool sync = false);
// Forward declaration. Same parameter list as the NT/VT DeviceGlobalToReg
// plus an extra by-value `T init`.
// NOTE(review): presumably `init` is the value stored in `reg` slots that
// fall outside `count` -- confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
- T* reg, T init, bool sync = false);
// Forward declaration of an overload of DeviceGlobalToReg that takes two
// integer template arguments (VT0, VT1) where the other form takes one (VT).
// The function parameter list is otherwise the same.
// NOTE(review): the other VT0/VT1 declarations in this namespace carry a "2"
// suffix (DeviceGlobalToRegDefault2, DeviceGlobalToShared2,
// DeviceGlobalToSharedDefault2) while this one does not -- verify that this
// name matches the definition it is meant to declare.
// `sync` defaults to false.
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid,
- T* reg, bool sync = false);
// Forward declaration. VT0/VT1 counterpart of DeviceGlobalToRegDefault: same
// function parameters, but two integer template arguments instead of VT.
// NOTE(review): how VT0 and VT1 divide the work, and how `init` is used, is
// not visible here -- confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid,
- T* reg, T init, bool sync = false);
// Forward declaration. Signature matches the NT/VT DeviceGlobalToReg.
// NOTE(review): the "Loop" suffix suggests a different code-generation
// strategy for the same transfer -- confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid,
- T* reg, bool sync = false);
// Forward declaration. Takes count, a source iterator, tid and a non-const
// T* destination; unlike the *ToReg declarations it has no `sync` parameter.
// NOTE(review): presumably fills `reg` in a per-thread ("thread order")
// layout rather than a strided one -- confirm against the definition.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid,
- T* reg);
// Forward declaration. Same parameters as DeviceGlobalToThread plus a
// by-value `T init`; there is no `sync` parameter.
// NOTE(review): presumably `init` fills `reg` slots beyond `count` -- confirm
// against the definition.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
- T* reg, T init);
// Forward declaration. Reads from a const T* (`reg`) and takes an output
// iterator `dest`; there is no `count` parameter.
// NOTE(review): by name this stores per-thread values into shared memory --
// confirm the index pattern against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename OutputIt, typename T>
- MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest,
- bool sync = true);
// Forward declaration. Reads from a const T* (`reg`) and takes an element
// count plus an output iterator `dest`.
// NOTE(review): presumably only positions below `count` are written --
// confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT, typename OutputIt, typename T>
- MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
- OutputIt dest, bool sync = false);
// Forward declaration. Only NT is an integer template argument (no VT);
// source and destination are both iterator types.
// NOTE(review): presumably copies `count` elements from `source` to `dest`
// with a runtime loop -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, typename InputIt, typename OutputIt>
- MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid,
- OutputIt dest, bool sync = true);
// Forward declaration. Source is a const T*, destination an output iterator,
// with an element count.
// NOTE(review): by name the source lives in shared memory and the
// destination in global memory -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename T, typename OutputIt>
- MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid,
- OutputIt dest, bool sync = true);
// Forward declaration. Source is an input iterator, destination a non-const
// T*, with an element count.
// NOTE(review): by name the destination is shared memory -- confirm against
// the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
- T* dest, bool sync = true);
// Forward declaration. VT0/VT1 counterpart of DeviceGlobalToShared: same
// function parameters, two integer template arguments instead of VT.
// NOTE(review): the roles of VT0 and VT1 are not visible here -- confirm
// against the definition.
// `sync` defaults to true.
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
- T* dest, bool sync = true);
// Forward declaration. Signature matches DeviceGlobalToShared.
// NOTE(review): the "Loop" suffix suggests a different code-generation
// strategy for the same transfer -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
- T* dest, bool sync = true);
// Forward declaration. Same parameters as DeviceGlobalToShared plus a
// by-value `T init`.
// NOTE(review): presumably `init` is written to `dest` slots beyond `count`
// -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
- T* dest, T init, bool sync = true);
// Forward declaration. VT0/VT1 counterpart of DeviceGlobalToSharedDefault:
// same function parameters, two integer template arguments instead of VT.
// NOTE(review): the roles of VT0/VT1 and of `init` are not visible here --
// confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT0, int VT1, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source,
- int tid, T* dest, T init, bool sync = true);
// Forward declaration. Both ends are iterator types (InputIt source,
// OutputIt dest) with an element count.
// NOTE(review): by name both ranges are in global memory -- confirm against
// the definition.
// `sync` defaults to false.
- template<int NT, int VT, typename InputIt, typename OutputIt>
- MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid,
- OutputIt dest, bool sync = false);
// Forward declaration. Takes a const T* input `x`, the caller's tid, and two
// non-const T* arguments (`shared`, `y`); there is no `sync` parameter.
// NOTE(review): by name this transposes values through a shared-memory
// buffer that is smaller than the full tile -- the required size of `shared`
// is not visible here; confirm against the definition.
- template<int NT, int VT, typename T>
- MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
// Forward declaration. `indices` is written as int[VT]; as a function
// parameter that is an int*, so the VT extent is documentation only.
// NOTE(review): presumably reads `data` at the given indices into `reg`,
// bounded by `count` -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT],
- int tid, T* reg, bool sync = true);
// Forward declaration. Same parameters as DeviceGather plus a by-value
// `T identity`.
// NOTE(review): presumably `identity` is stored in `reg` slots that are out
// of range -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt, typename T>
- MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT],
- int tid, T* reg, T identity, bool sync = true);
// Forward declaration. Reads from a const T* (`reg`) and takes an index
// array plus an output iterator `data`. `indices` is written as int[VT],
// which as a function parameter is an int*.
// NOTE(review): presumably writes reg values to `data` at the given indices,
// bounded by `count` -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename T, typename OutputIt>
- MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid,
- int indices[VT], OutputIt data, bool sync = true);
// Forward declaration. Only VT is an integer template argument (no NT).
// Source is a const T* (`threadReg`), destination a non-const T* (`shared`).
// NOTE(review): presumably each thread writes its VT values to a contiguous
// run of `shared` -- confirm against the definition.
// `sync` defaults to true.
- template<int VT, typename T>
- MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
- bool sync = true);
// Forward declaration. Only VT is an integer template argument (no NT).
// Source is a const T* (`shared`), destination a non-const T* (`threadReg`);
// the argument roles are the reverse of DeviceThreadToShared.
// NOTE(review): the exact index pattern is not visible here -- confirm
// against the definition.
// `sync` defaults to true.
- template<int VT, typename T>
- MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
- bool sync = true);
// Forward declaration of the raw-pointer overload: both sources are const T*
// with their own counts (aCount, bCount); the destination is a T* (`reg`).
// An iterator-typed overload with the same name is declared in this
// namespace.
// NOTE(review): presumably loads the `a` range followed by the `b` range
// into one register array -- confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT0, int VT1, typename T>
- MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount,
- const T* b_global, int bCount, int tid, T* reg, bool sync = false);
// Forward declaration of the raw-pointer overload: both sources are const T*
// with their own counts (aCount, bCount); the destination is a T*
// (`shared`). An iterator-typed overload with the same name is declared in
// this namespace.
// NOTE(review): presumably stores the `a` range followed by the `b` range
// into `shared` -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT0, int VT1, typename T>
- MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
- const T* b_global, int bCount, int tid, T* shared, bool sync = true);
// Forward declaration of the iterator overload: the two sources have
// independent iterator types (InputIt1, InputIt2), each with its own count.
// A const T* overload with the same name is declared in this namespace.
// NOTE(review): the destination layout in `reg` is not visible here --
// confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
- typename T>
- MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount,
- InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);
// Forward declaration of the iterator overload: the two sources have
// independent iterator types (InputIt1, InputIt2), each with its own count.
// A const T* overload with the same name is declared in this namespace.
// NOTE(review): the destination layout in `shared` is not visible here --
// confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
- typename T>
- MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount,
- InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
// Forward declaration. Takes a source iterator, a const int* index array
// (`indices_shared`) and an output iterator, with an element count.
// NOTE(review): by parameter naming the indices live in shared memory and
// both data ranges in global memory -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt, typename OutputIt>
- MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
- const int* indices_shared, int tid, OutputIt dest_global,
- bool sync = true);
// Forward declaration of the iterator overload: two sources with independent
// iterator types, an int `bStart`, a const int* index array and a T*
// destination (`reg`). A const T* overload with the same name is declared in
// this namespace.
// NOTE(review): presumably `bStart` is the index value at which entries
// start referring to `b_global` instead of `a_global` -- confirm against the
// definition.
// `sync` defaults to false.
- template<int NT, int VT, typename InputIt1, typename InputIt2,
- typename T>
- MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global,
- InputIt2 b_global, int bStart, const int* indices, int tid,
- T* reg, bool sync = false);
// Forward declaration of the iterator overload: two sources with independent
// iterator types, an int `bStart`, a const int* index array
// (`indices_shared`) and an output iterator destination. A const T* overload
// with the same name is declared in this namespace.
// NOTE(review): presumably `bStart` is the index value at which entries
// start referring to `b_global` instead of `a_global` -- confirm against the
// definition.
// `sync` defaults to true.
- template<int NT, int VT, typename InputIt1, typename InputIt2,
- typename OutputIt>
- MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global,
- InputIt2 b_global, int bStart, const int* indices_shared, int tid,
- OutputIt dest_global, bool sync = true);
// Forward declaration of the raw-pointer overload: both sources are const
// T*, the destination is a T* (`reg`). An iterator-typed overload with the
// same name is declared in this namespace.
// NOTE(review): the meaning of `bStart` and of the `indices` values is not
// visible here -- confirm against the definition.
// `sync` defaults to false.
- template<int NT, int VT, typename T>
- MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global,
- const T* b_global, int bStart, const int* indices, int tid,
- T* reg, bool sync = false);
// Forward declaration of the raw-pointer overload: both sources are const
// T*, the destination is an output iterator. An iterator-typed overload with
// the same name is declared in this namespace.
// NOTE(review): the meaning of `bStart` and of the `indices_shared` values
// is not visible here -- confirm against the definition.
// `sync` defaults to true.
- template<int NT, int VT, typename T, typename OutputIt>
- MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global,
- const T* b_global, int bStart, const int* indices_shared, int tid,
- OutputIt dest_global, bool sync = true);
- }
- #include "device/launchbox.cuh"
- #include "device/loadstore.cuh"
- #include "device/ctasegscan.cuh"
|