/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "mgpuenums.h"
#include "device/deviceutil.cuh"

namespace mgpu {

////////////////////////////////////////////////////////////////////////////////
// device/loadstore.cuh

// For 0 <= i < VT: 
//		index = NT * i + tid;
//		reg[i] = data[index];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg, 
	bool sync = true);

// For 0 <= i < VT: 
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 
	T* reg, bool sync = false);
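
// Example (illustrative sketch, not part of this header): a strided register
// load of one full tile. The kernel scaffolding and tile-offset math below are
// hypothetical; only DeviceGlobalToReg comes from the declarations above.
//
//   template<int NT, int VT>
//   __global__ void KernelLoadTile(const int* data_global, int count) {
//       int tid = threadIdx.x;
//       int gid = NT * VT * blockIdx.x;          // first element of this tile
//       int count2 = min(NT * VT, count - gid);  // elements left in this tile
//
//       int reg[VT];
//       DeviceGlobalToReg<NT, VT>(count2, data_global + gid, tid, reg);
//       // reg[i] holds data_global[gid + NT * i + tid] wherever in range.
//   }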

// For 0 <= i < VT: 
//		index = NT * i + tid;
//		reg[i] = (index < count) ? data[index] : init;
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid,
	T* reg, T init, bool sync = false);

// For 0 <= i < VT1: 
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// The first VT0 loads per thread are issued without predication when
// count >= NT * VT0 (see the VT0/VT1 notes preceding DeviceLoad2ToReg below).
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 
	T* reg, bool sync = false);

// For 0 <= i < VT1: 
//		index = NT * i + tid;
//		reg[i] = (index < count) ? data[index] : init;
// VT0 and VT1 follow the convention described for DeviceLoad2ToReg below.
// No synchronize after load.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid, 
	T* reg, T init, bool sync = false);

// For 0 <= i < VT: 
//		index = NT * i + tid;
//		if(index < count) reg[i] = data[index];
// No synchronize after load.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid, 
	T* reg, bool sync = false);


// For 0 <= i < VT: 
//		index = VT * tid + i;
//		if(index < count) reg[i] = data[index];
// Note this function loads data in THREAD ORDER (VT consecutive elements per
// thread); DeviceGlobalToReg loads in STRIDED ORDER.
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid, 
	T* reg);

// For 0 <= i < VT: 
//		index = VT * tid + i;
//		reg[i] = (index < count) ? data[index] : init;
// No synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid,
	T* reg, T init);
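
// Example (illustrative sketch): loading VT consecutive elements per thread
// (thread order) directly into registers, filling out-of-range slots with 0.
// Variable names other than the declared functions are hypothetical.
//
//   int threadValues[VT];
//   DeviceGlobalToThreadDefault<NT, VT>(count2, data_global + gid, tid,
//       threadValues, 0);
//   // threadValues[i] == data_global[gid + VT * tid + i] when in range.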

// For 0 <= i < VT: 
//		index = NT * i + tid;
//		dest[index] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest, 
	bool sync = true);

// For 0 <= i < VT: 
//		index = NT * i + tid;
//		if(index < count) dest[index] = reg[i];
// No synchronize after store.
template<int NT, int VT, typename OutputIt, typename T>
MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid,
	OutputIt dest, bool sync = false);

// For 0 <= index < count:
//		dest[index] = source[index];
// This function is intended to replace DeviceGlobalToShared in cases where
// count is much less than NT * VT.
template<int NT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid, 
	OutputIt dest, bool sync = true);
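
// Example (illustrative sketch): DeviceMemToMemLoop fits the case of staging a
// short spine or carry-in array where count is far below NT * VT. The variable
// names here are hypothetical, with numCarries <= NT assumed.
//
//   __shared__ int carryIn_shared[NT];
//   DeviceMemToMemLoop<NT>(numCarries, carryIn_global, tid, carryIn_shared);
//   // carryIn_shared[0 .. numCarries-1] is now visible to every thread.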

// For 0 <= index < count:
//		dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid, 
	OutputIt dest, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// Synchronize after store.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid,
	T* dest, bool sync = true);

// VT0/VT1 variant of DeviceGlobalToShared. VT0 and VT1 follow the convention
// described for DeviceLoad2ToReg below.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid,
	T* dest, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// Synchronize after store.
// No optimized code path for count < NV (smaller generated code).
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid,
	T* dest, bool sync = true);

// Like DeviceGlobalToShared, but elements with count <= index < NT * VT are
// set to init.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid,
	T* dest, T init, bool sync = true);

// VT0/VT1 variant of DeviceGlobalToSharedDefault. VT0 and VT1 follow the
// convention described for DeviceLoad2ToReg below.
template<int NT, int VT0, int VT1, typename InputIt, typename T>
MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source, 
	int tid, T* dest, T init, bool sync = true);

// For 0 <= index < count:
//		dest[index] = source[index];
// No synchronize.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid, 
	OutputIt dest, bool sync = false);

// Transpose VT elements in NT threads (x) into thread-order registers (y)
// using only NT * VT / 2 elements of shared memory.
template<int NT, int VT, typename T>
MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y);
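
// Example (illustrative sketch): re-ordering a strided register tile into
// thread order with only half-sized shared storage. Array names are
// hypothetical; the shared allocation follows the NT * VT / 2 requirement
// stated above.
//
//   __shared__ int transpose_shared[NT * VT / 2];
//   int strided[VT], threadOrder[VT];
//   DeviceGlobalToReg<NT, VT>(count2, data_global + gid, tid, strided);
//   HalfSmemTranspose<NT, VT>(strided, tid, transpose_shared, threadOrder);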

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count)
//			gather = indices[index];
//			reg[i] = data[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT], 
	int tid, T* reg, bool sync = true);

// Like DeviceGather, but reg[i] is set to identity when index >= count.
template<int NT, int VT, typename InputIt, typename T>
MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT], 
	int tid, T* reg, T identity, bool sync = true);

// For 0 <= i < VT:
//		index = NT * i + tid;
//		if(index < count)
//			scatter = indices[index];
//			data[scatter] = reg[i];
// Synchronize after store.
template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid, 
	int indices[VT], OutputIt data, bool sync = true);
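
// Example (illustrative sketch): a strided gather through indices already held
// in registers, followed by a scatter of the same values. All variable names
// are hypothetical.
//
//   int indices[VT];
//   DeviceGlobalToReg<NT, VT>(count2, indices_global + gid, tid, indices);
//   float values[VT];
//   DeviceGather<NT, VT>(count2, data_global, indices, tid, values);
//   DeviceScatter<NT, VT>(count2, values, tid, indices, dest_global);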

// For 0 <= i < VT:
//		shared[VT * tid + i] = threadReg[i];
// Synchronize after store.
// Note this function moves data in THREAD ORDER.
// (DeviceRegToShared moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared,
	bool sync = true);

// For 0 <= i < VT:
//		threadReg[i] = shared[VT * tid + i];
// Synchronize after load.
// Note this function moves data in THREAD ORDER.
// (DeviceSharedToReg moves data in STRIDED ORDER).
template<int VT, typename T>
MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg,
	bool sync = true);
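
// Example (illustrative sketch): the common ModernGPU tile idiom of staging a
// tile through shared memory, re-ordering it into thread order, processing it,
// and writing it back. Kernel scaffolding names are hypothetical.
//
//   __shared__ int tile_shared[NT * VT];
//   DeviceGlobalToShared<NT, VT>(count2, data_global + gid, tid, tile_shared);
//   int threadData[VT];
//   DeviceSharedToThread<VT>(tile_shared, tid, threadData);
//   // ... operate on VT consecutive elements per thread ...
//   DeviceThreadToShared<VT>(threadData, tid, tile_shared);
//   DeviceSharedToGlobal<NT, VT>(count2, tile_shared, tid, dest_global + gid);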

// For 0 <= index < aCount:
//		shared[index] = a_global[index];
// For 0 <= index < bCount:
//		shared[aCount + index] = b_global[index];
// VT0 is the lower-bound for predication-free execution: 
//		If aCount + bCount >= NT * VT0, a predication-free branch is taken.
// VT1 is the upper-bound for loads:
//		NT * VT1 must be >= aCount + bCount.

template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount, 
	const T* b_global, int bCount, int tid, T* reg, bool sync = false);

template<int NT, int VT0, int VT1, typename T>
MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount,
	const T* b_global, int bCount, int tid, T* shared, bool sync = true);

template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount, 
	InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false);

template<int NT, int VT0, int VT1, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount, 
	InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true);
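
// Example (illustrative sketch): staging two sorted partitions into a single
// shared array ahead of a CTA-wide merge. The partition bounds a0/a1/b0/b1 are
// hypothetical; NT * VT1 (here VT1 = VT) must cover aCount + bCount.
//
//   __shared__ int keys_shared[NT * VT];
//   int aCount = a1 - a0, bCount = b1 - b0;
//   DeviceLoad2ToShared<NT, VT, VT>(a_global + a0, aCount, b_global + b0,
//       bCount, tid, keys_shared);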

// For 0 <= i < VT
//		index = NT * i + tid;
//		if(index < count)
//			gather = indices_shared[index];
//			dest_global[index] = data_global[gather];
// Synchronize after load.
template<int NT, int VT, typename InputIt, typename OutputIt>
MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global,
	const int* indices_shared, int tid, OutputIt dest_global, 
	bool sync = true);

// For 0 <= i < VT
//		index = NT * i + tid
//		if(index < count)
//			gather = indices[index];
//			if(gather < bStart) data = a_global[gather];
//			else data = b_global[gather - bStart];
//			dest_global[index] = data;
// The Reg overloads leave the gathered values in registers (reg[i] = data)
// instead of storing to dest_global, and do not synchronize by default; the
// Shared overloads synchronize after the transfer.
template<int NT, int VT, typename InputIt1, typename InputIt2,
	typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global, 
	InputIt2 b_global, int bStart, const int* indices, int tid, 
	T* reg, bool sync = false);

template<int NT, int VT, typename InputIt1, typename InputIt2,
	typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global, 
	InputIt2 b_global, int bStart, const int* indices_shared, int tid, 
	OutputIt dest_global, bool sync = true);

template<int NT, int VT, typename T>
MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global, 
	const T* b_global, int bStart, const int* indices, int tid, 
	T* reg, bool sync = false);

template<int NT, int VT, typename T, typename OutputIt>
MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global, 
	const T* b_global, int bStart, const int* indices_shared, int tid, 
	OutputIt dest_global, bool sync = true);
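
// Example (illustrative sketch): after a key merge leaves gather indices in
// shared memory, move the attached values with one call. Variable names are
// hypothetical; passing aCount as bStart assumes indices >= aCount refer to
// the b input.
//
//   DeviceTransferMergeValuesShared<NT, VT>(aCount + bCount,
//       aVals_global + a0, bVals_global + b0, aCount, indices_shared, tid,
//       vals_global + NT * VT * blockIdx.x);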



} // namespace mgpu


#include "device/launchbox.cuh"
#include "device/loadstore.cuh"
#include "device/ctasegscan.cuh"