SkePU 1.2
Reduce Kernels

Functions

static std::string skepu::ReduceKernel_CL (OpenCL reduction kernel source string; listed in full in the detailed documentation below)
 
size_t skepu::nextPow2 (size_t x)
 A helper that returns the smallest power of 2 greater than or equal to a given value. More...
 
void skepu::getNumBlocksAndThreads (size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads)
 
template<typename T , typename BinaryFunc >
__global__ void skepu::ReduceKernel_CU_oldAndIncorrect (BinaryFunc reduceFunc, T *input, T *output, size_t n)
 
template<typename T , typename BinaryFunc , size_t blockSize, bool nIsPow2>
__global__ void skepu::ReduceKernel_CU (BinaryFunc reduceFunc, T *input, T *output, size_t n)
 
bool skepu::isPow2 (size_t x)
 A small helper to determine whether the number is a power of 2. More...
 
template<typename ReduceFunc , typename T >
void skepu::CallReduceKernel (ReduceFunc *reduceFunc, size_t size, size_t numThreads, size_t numBlocks, T *d_idata, T *d_odata, bool enableIsPow2=true)
 
template<typename ReduceFunc , typename T >
void skepu::ExecuteReduceOnADevice (ReduceFunc *reduceFunc, size_t n, size_t numThreads, size_t numBlocks, size_t maxThreads, size_t maxBlocks, T *d_idata, T *d_odata, unsigned int deviceID, bool enableIsPow2=true)
 

Detailed Description

Definitions of CUDA and OpenCL kernels for the Reduce skeleton.

Function Documentation

template<typename ReduceFunc , typename T >
void skepu::CallReduceKernel (ReduceFunc *reduceFunc, size_t size, size_t numThreads, size_t numBlocks, T *d_idata, T *d_odata, bool enableIsPow2 = true)

Helper method used to call the actual CUDA kernel for reduction. Used when PINNED MEMORY is disabled.

Parameters
reduceFunc - The reduction user function to be used.
size - Size of the input array to be reduced.
numThreads - Number of threads to be used for kernel execution.
numBlocks - Number of blocks to be used for kernel execution.
d_idata - CUDA memory pointer to input array.
d_odata - CUDA memory pointer to output array.
enableIsPow2 - Boolean flag (default true) used to enable/disable the isPow2 optimization; disabled only for sparse row-/column-wise reduction for technical reasons.

References skepu::isPow2().
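
The dispatch logic itself is not shown in this documentation. The sketch below illustrates, under stated assumptions, how such a helper could select a ReduceKernel_CU instantiation from the runtime thread count and the isPow2() check. The shared-memory size, the set of block sizes and the switch over numThreads are assumptions modelled on the reduce6 pattern, not the actual SkePU source.

// Hypothetical sketch, not the SkePU implementation: pick a ReduceKernel_CU
// instantiation based on the runtime thread count and the isPow2() check.
// Only two block sizes are spelled out; a full version would cover every
// power of two down to 1.
template <typename ReduceFunc, typename T>
void callReduceKernelSketch(ReduceFunc* reduceFunc, size_t size,
                            size_t numThreads, size_t numBlocks,
                            T* d_idata, T* d_odata, bool enableIsPow2 = true)
{
    dim3 grid((unsigned int)numBlocks), block((unsigned int)numThreads);
    size_t smem = numThreads * sizeof(T);           // assumed shared-memory size
    bool pow2 = enableIsPow2 && skepu::isPow2(size);

    switch (numThreads)
    {
    case 256:
        if (pow2) skepu::ReduceKernel_CU<T, ReduceFunc, 256, true ><<<grid, block, smem>>>(*reduceFunc, d_idata, d_odata, size);
        else      skepu::ReduceKernel_CU<T, ReduceFunc, 256, false><<<grid, block, smem>>>(*reduceFunc, d_idata, d_odata, size);
        break;
    case 128:
        if (pow2) skepu::ReduceKernel_CU<T, ReduceFunc, 128, true ><<<grid, block, smem>>>(*reduceFunc, d_idata, d_odata, size);
        else      skepu::ReduceKernel_CU<T, ReduceFunc, 128, false><<<grid, block, smem>>>(*reduceFunc, d_idata, d_odata, size);
        break;
    // ... remaining power-of-two block sizes ...
    }
}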


template<typename ReduceFunc , typename T >
void skepu::ExecuteReduceOnADevice (ReduceFunc *reduceFunc, size_t n, size_t numThreads, size_t numBlocks, size_t maxThreads, size_t maxBlocks, T *d_idata, T *d_odata, unsigned int deviceID, bool enableIsPow2 = true)

A helper function used by other functions to call the actual reduction kernel. Internally, it simply launches two kernels after setting their arguments. No synchronization is enforced.

Parameters
reduceFunc - The reduction user function to be used.
n - Size of the input array to be reduced.
numThreads - Number of threads to be used for kernel execution.
numBlocks - Number of blocks to be used for kernel execution.
maxThreads - Maximum number of threads that can be used for kernel execution.
maxBlocks - Maximum number of blocks that can be used for kernel execution.
d_idata - CUDA memory pointer to input array.
d_odata - CUDA memory pointer to output array.
deviceID - Integer deciding which device to utilize.
enableIsPow2 - Boolean flag (default true) used to enable/disable the isPow2 optimization; disabled only for sparse row-/column-wise reduction for technical reasons.

References skepu::getNumBlocksAndThreads().
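
The two-kernel scheme can be pictured as follows. This is a simplified sketch under assumptions (a single-block second pass, in-place reuse of d_odata), not the SkePU source.

// Hypothetical sketch, not the SkePU implementation: the two launches that a
// function like ExecuteReduceOnADevice performs. Pass 1 reduces n elements to
// numBlocks per-block partial results; pass 2 folds those partials into one
// value. The single-block second pass and the in-place reuse of d_odata are
// assumptions of this sketch.
template <typename ReduceFunc, typename T>
void twoPassReduceSketch(ReduceFunc* reduceFunc, size_t n,
                         size_t numThreads, size_t numBlocks,
                         T* d_idata, T* d_odata)
{
    // Pass 1: d_idata[0..n) -> d_odata[0..numBlocks)
    skepu::CallReduceKernel(reduceFunc, n, numThreads, numBlocks, d_idata, d_odata);

    // Pass 2: d_odata[0..numBlocks) -> d_odata[0]. One block is enough as long
    // as numBlocks does not exceed the chosen thread count.
    size_t threads2 = skepu::nextPow2(numBlocks);
    skepu::CallReduceKernel(reduceFunc, numBlocks, threads2, 1, d_odata, d_odata);

    // As documented above, no synchronization is enforced here; the caller is
    // expected to synchronize or read the result through the runtime.
}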


void skepu::getNumBlocksAndThreads (size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads)

Computes the number of threads and blocks to use for the reduction kernel. The number of threads per block is set to the minimum of maxThreads and n/2, where n is the problem size. The specified maximum number of blocks is respected, because each kernel thread can process more than one element.

Parameters
n - Problem size.
maxBlocks - Maximum number of blocks that can be used.
maxThreads - Maximum number of threads that can be used.
blocks - Output parameter, passed by reference. Specifies the number of blocks to be used.
threads - Output parameter, passed by reference. Specifies the number of threads to be used.

References MIN, and skepu::nextPow2().

Referenced by skepu::Reduce< ReduceFunc, ReduceFunc >::CL(), skepu::MapReduce< MapFunc, ReduceFunc >::CU(), skepu::Reduce< ReduceFunc, ReduceFunc >::CU(), and skepu::ExecuteReduceOnADevice().
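
A sketch of the heuristic just described, using the same MIN/nextPow2 ingredients; the exact cut-offs are assumptions modelled on the CUDA SDK reduction sample, not the SkePU source.

// Hypothetical sketch, not the SkePU implementation: threads per block is the
// minimum of maxThreads and roughly n/2 rounded up to a power of two, and the
// block count is capped at maxBlocks so each thread may loop over several
// elements.
void getNumBlocksAndThreadsSketch(size_t n, size_t maxBlocks, size_t maxThreads,
                                  size_t& blocks, size_t& threads)
{
    threads = (n < maxThreads * 2) ? skepu::nextPow2((n + 1) / 2) : maxThreads;
    blocks  = (n + threads * 2 - 1) / (threads * 2);  // assumes 2 elements per thread per pass
    if (blocks > maxBlocks)
        blocks = maxBlocks;
}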


bool skepu::isPow2 (size_t x)

A small helper to determine whether the number is a power of 2.

Parameters
x - The number to test.
Returns
A bool specifying whether the number is a power of 2 or not.

Referenced by skepu::CallReduceKernel().
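
The usual way to implement such a check is the bit trick below; a minimal sketch, which may differ from the actual SkePU definition.

// Minimal sketch of the standard check: a power of two has exactly one bit
// set, so clearing the lowest set bit must leave zero. (Whether 0 should count
// as a power of two is a convention; the SkePU definition may differ.)
inline bool isPow2Sketch(size_t x)
{
    return x != 0 && (x & (x - 1)) == 0;
}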

size_t skepu::nextPow2 (size_t x)

A helper that returns the smallest power of 2 greater than or equal to a given value.

Parameters
x - The input number to round up to the nearest power of 2.
Returns
The smallest power of 2 greater than or equal to x.

Referenced by skepu::getNumBlocksAndThreads().
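
A common implementation of this rounding is the bit-smearing trick below; a minimal sketch assuming a 64-bit widening, which may differ from the actual SkePU definition.

#include <cstdint>

// Minimal sketch of the classic round-up-to-power-of-two trick: smear the
// highest set bit into all lower positions, then add one. Returns x itself
// when x is already a power of two; behaviour for x == 0 is a convention.
inline size_t nextPow2Sketch(size_t x)
{
    uint64_t v = x;        // widen so the 32-bit shift below is always valid
    --v;
    v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
    v |= v >> 8;  v |= v >> 16; v |= v >> 32;
    return (size_t)(++v);
}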

static std::string skepu::ReduceKernel_CL (shown here with the kernel source reformatted for readability)

__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata)
{
    size_t blockSize = get_local_size(0);
    size_t tid = get_local_id(0);
    size_t i = get_group_id(0)*blockSize + get_local_id(0);
    size_t gridSize = blockSize*get_num_groups(0);
    TYPE result = 0;
    if(i < n)
    {
        result = input[i];
        i += gridSize;
    }
    while(i < n)
    {
        result = FUNCTIONNAME(result, input[i], (TYPE)0);
        i += gridSize;
    }
    sdata[tid] = result;
    barrier(CLK_LOCAL_MEM_FENCE);
    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 64)  { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 32)  { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 16)  { if (tid < 8 && tid + 8 < n)   { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); }  barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 8)   { if (tid < 4 && tid + 4 < n)   { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); }  barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 4)   { if (tid < 2 && tid + 2 < n)   { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); }  barrier(CLK_LOCAL_MEM_FENCE); }
    if(blockSize >= 2)   { if (tid < 1 && tid + 1 < n)   { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); }  barrier(CLK_LOCAL_MEM_FENCE); }
    if(tid == 0)
    {
        output[get_group_id(0)] = sdata[tid];
    }
}

OpenCL Reduce kernel, using the same pattern as reduce6 in the CUDA SDK. See the NVIDIA whitepaper on optimizing reduction for the GPU.
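
The KERNELNAME, TYPE and FUNCTIONNAME tokens in the source above are placeholders that are substituted for a concrete user function before the program is built. The snippet below is a hypothetical illustration of that substitution step, not the SkePU source; the names plus_f and float are example stand-ins.

#include <string>

// Replace every occurrence of 'from' with 'to' in 's'.
static void replaceAllSketch(std::string& s, const std::string& from, const std::string& to)
{
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size())
        s.replace(pos, from.size(), to);
}

// Hypothetical illustration, not the SkePU source: specialize the kernel
// string for a float plus-reduction before handing it to
// clCreateProgramWithSource / clBuildProgram.
std::string specializeReduceKernelSketch(std::string source)
{
    replaceAllSketch(source, "FUNCTIONNAME", "plus_f"); // assumed generated user-function name
    replaceAllSketch(source, "KERNELNAME",   "plus_f"); // kernel becomes ReduceKernel_plus_f
    replaceAllSketch(source, "TYPE",         "float");  // element type of the reduction
    return source;
}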

template<typename T , typename BinaryFunc , size_t blockSize, bool nIsPow2>
__global__ void skepu::ReduceKernel_CU (BinaryFunc reduceFunc, T *input, T *output, size_t n)

CUDA Reduce kernel, using the same pattern as reduce6 in the CUDA SDK. See the NVIDIA whitepaper on optimizing reduction for the GPU.
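
For comparison with the OpenCL source above, the sketch below shows the same pattern as a CUDA kernel with the block size fixed at compile time, so the unreachable halving steps are removed by the compiler. It is a hypothetical sketch, not the SkePU source: the way the user function is invoked and the handling of the nIsPow2 flag are assumptions.

// Hypothetical sketch, not the SkePU implementation. Mirrors the OpenCL kernel
// above, with blockSize as a compile-time constant. The real kernel also uses
// nIsPow2 to skip bounds checks when n is a power of two; that refinement is
// omitted here. The call f(a, b) stands in for however the user function is
// actually invoked on the device.
template <typename T, typename BinaryFunc, size_t blockSize, bool nIsPow2>
__global__ void reduceKernelSketch(BinaryFunc f, T* input, T* output, size_t n)
{
    extern __shared__ unsigned char smemRaw[];
    T* sdata = reinterpret_cast<T*>(smemRaw);      // blockSize elements of type T

    size_t tid = threadIdx.x;
    size_t i = (size_t)blockIdx.x * blockSize + tid;
    size_t gridSize = blockSize * gridDim.x;

    // Grid-stride accumulation: seed with the first element, then fold the rest.
    T result = T();
    if (i < n) { result = input[i]; i += gridSize; }
    while (i < n) { result = f(result, input[i]); i += gridSize; }
    sdata[tid] = result;
    __syncthreads();

    // Tree reduction in shared memory; branches on blockSize are resolved at
    // compile time, so only the needed steps remain.
    if (blockSize >= 512) { if (tid < 256 && tid + 256 < n) sdata[tid] = f(sdata[tid], sdata[tid + 256]); __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128 && tid + 128 < n) sdata[tid] = f(sdata[tid], sdata[tid + 128]); __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64 && tid +  64 < n) sdata[tid] = f(sdata[tid], sdata[tid +  64]); __syncthreads(); }
    if (blockSize >=  64) { if (tid <  32 && tid +  32 < n) sdata[tid] = f(sdata[tid], sdata[tid +  32]); __syncthreads(); }
    if (blockSize >=  32) { if (tid <  16 && tid +  16 < n) sdata[tid] = f(sdata[tid], sdata[tid +  16]); __syncthreads(); }
    if (blockSize >=  16) { if (tid <   8 && tid +   8 < n) sdata[tid] = f(sdata[tid], sdata[tid +   8]); __syncthreads(); }
    if (blockSize >=   8) { if (tid <   4 && tid +   4 < n) sdata[tid] = f(sdata[tid], sdata[tid +   4]); __syncthreads(); }
    if (blockSize >=   4) { if (tid <   2 && tid +   2 < n) sdata[tid] = f(sdata[tid], sdata[tid +   2]); __syncthreads(); }
    if (blockSize >=   2) { if (tid <   1 && tid +   1 < n) sdata[tid] = f(sdata[tid], sdata[tid +   1]); __syncthreads(); }

    // Thread 0 writes this block's partial result.
    if (tid == 0)
        output[blockIdx.x] = sdata[0];
}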

template<typename T , typename BinaryFunc >
__global__ void skepu::ReduceKernel_CU_oldAndIncorrect (BinaryFunc reduceFunc, T *input, T *output, size_t n)

The old CUDA Reduce kernel, which gives incorrect results for larger problem sizes. It is no longer used.