SkePU
1.2
|
Functions | |
static std::string | skepu::ReduceKernel_CL ("__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata)\n""{\n"" size_t blockSize = get_local_size(0);\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"" size_t gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = input[i];\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n") |
size_t | skepu::nextPow2 (size_t x) |
A helper to return a value that is nearest value that is power of 2. More... | |
void | skepu::getNumBlocksAndThreads (size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads) |
template<typename T , typename BinaryFunc > | |
__global__ void | skepu::ReduceKernel_CU_oldAndIncorrect (BinaryFunc reduceFunc, T *input, T *output, size_t n) |
template<typename T , typename BinaryFunc , size_t blockSize, bool nIsPow2> | |
__global__ void | skepu::ReduceKernel_CU (BinaryFunc reduceFunc, T *input, T *output, size_t n) |
bool | skepu::isPow2 (size_t x) |
A small helper to determine whether the number is a power of 2. More... | |
template<typename ReduceFunc , typename T > | |
void | skepu::CallReduceKernel (ReduceFunc *reduceFunc, size_t size, size_t numThreads, size_t numBlocks, T *d_idata, T *d_odata, bool enableIsPow2=true) |
template<typename ReduceFunc , typename T > | |
void | skepu::ExecuteReduceOnADevice (ReduceFunc *reduceFunc, size_t n, size_t numThreads, size_t numBlocks, size_t maxThreads, size_t maxBlocks, T *d_idata, T *d_odata, unsigned int deviceID, bool enableIsPow2=true) |
Definitions of CUDA and OpenCL kernels for the Reduce skeleton.
void skepu::CallReduceKernel | ( | ReduceFunc * | reduceFunc, |
size_t | size, | ||
size_t | numThreads, | ||
size_t | numBlocks, | ||
T * | d_idata, | ||
T * | d_odata, | ||
bool | enableIsPow2 = true |
||
) |
Helper method used to call the actual CUDA kernel for reduction. Used when PINNED MEMORY is disabled
reduceFunc | The reduction user function to be used. |
size | size of the input array to be reduced. |
numThreads | Number of threads to be used for kernel execution. |
numBlocks | Number of blocks to be used for kernel execution. |
d_idata | CUDA memory pointer to input array. |
d_odata | CUDA memory pointer to output array. |
enableIsPow2 | boolean flag (default true) used to enable/disable isPow2 optimizations. disabled only for sparse row-/column-wise reduction for technical reasons. |
References skepu::isPow2().
void skepu::ExecuteReduceOnADevice | ( | ReduceFunc * | reduceFunc, |
size_t | n, | ||
size_t | numThreads, | ||
size_t | numBlocks, | ||
size_t | maxThreads, | ||
size_t | maxBlocks, | ||
T * | d_idata, | ||
T * | d_odata, | ||
unsigned int | deviceID, | ||
bool | enableIsPow2 = true |
||
) |
A helper function that is used to call the actual kernel for reduction. Used by other functions to call the actual kernel Internally, it just calls 2 kernels by setting their arguments. No synchronization is enforced.
reduceFunc | The reduction user function to be used. |
n | size of the input array to be reduced. |
numThreads | Number of threads to be used for kernel execution. |
numBlocks | Number of blocks to be used for kernel execution. |
maxThreads | Maximum number of threads that can be used for kernel execution. |
maxBlocks | Maximum number of blocks that can be used for kernel execution. |
d_idata | CUDA memory pointer to input array. |
d_odata | CUDA memory pointer to output array. |
deviceID | Integer deciding which device to utilize. |
enableIsPow2 | boolean flag (default true) used to enable/disable isPow2 optimizations. disabled only for sparse row-/column-wise reduction for technical reasons. |
References skepu::getNumBlocksAndThreads().
void skepu::getNumBlocksAndThreads | ( | size_t | n, |
size_t | maxBlocks, | ||
size_t | maxThreads, | ||
size_t & | blocks, | ||
size_t & | threads | ||
) |
Compute the number of threads and blocks to use for the reduction kernel. We set threads / block to the minimum of maxThreads and n/2 where n is problem size. We observe the maximum specified number of blocks, because each kernel thread can process more than 1 elements.
n | Problem size. |
maxBlocks | Maximum number of blocks that can be used. |
maxThreads | Maximum number of threads that can be used. |
blocks | An output parameter passed by reference. Specify number of blocks to be used. |
threads | An output parameter passed by reference. Specify number of threads to be used. |
References MIN, and skepu::nextPow2().
Referenced by skepu::Reduce< ReduceFunc, ReduceFunc >::CL(), skepu::MapReduce< MapFunc, ReduceFunc >::CU(), skepu::Reduce< ReduceFunc, ReduceFunc >::CU(), and skepu::ExecuteReduceOnADevice().
bool skepu::isPow2 | ( | size_t | x | ) |
A small helper to determine whether the number is a power of 2.
x | the actual number. |
Referenced by skepu::CallReduceKernel().
size_t skepu::nextPow2 | ( | size_t | x | ) |
A helper to return a value that is nearest value that is power of 2.
x | The input number for which we need to find the nearest value that is power of 2. |
Referenced by skepu::getNumBlocksAndThreads().
|
static |
OpenCL Reduce kernel, using the same pattern as reduce6 in the CUDA SDK. See whitepaper from NVIDIA on optimizing reduction for the GPU.
__global__ void skepu::ReduceKernel_CU | ( | BinaryFunc | reduceFunc, |
T * | input, | ||
T * | output, | ||
size_t | n | ||
) |
CUDA Reduce kernel, using the same pattern as reduce6 in the CUDA SDK. See whitepaper from NVIDIA on optimizing reduction for the GPU.
__global__ void skepu::ReduceKernel_CU_oldAndIncorrect | ( | BinaryFunc | reduceFunc, |
T * | input, | ||
T * | output, | ||
size_t | n | ||
) |
The old CUDA Reduce kernel which now gives incorrect results for larger problem sizes. Not used anymore.