5 #ifndef REDUCE_KERNELS_H
6 #define REDUCE_KERNELS_H
// Fragment of the OpenCL reduction-kernel source string (ReduceKernel_CL).
// The tokens TYPE, KERNELNAME and FUNCTIONNAME are textual placeholders that
// are substituted before the kernel source is compiled for the device.
// NOTE(review): several lines of the original string (the opening brace,
// "TYPE result = 0;", the if/while accumulation wrappers and the final
// "if(tid == 0)" guard) are missing from this excerpt — extraction artifact;
// the complete string appears verbatim in the residue at the end of the file.
32 "__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata)\n"
// Per-work-item setup: work-group size, local id, flat global index, and the
// grid-wide stride used to walk the input in chunks.
34 "    size_t blockSize = get_local_size(0);\n"
35 "    size_t tid = get_local_id(0);\n"
36 "    size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"
37 "    size_t gridSize = blockSize*get_num_groups(0);\n"
// First element loaded directly into the private accumulator (executed under
// an "if(i < n)" guard in the full string).
41 "        result = input[i];\n"
// Grid-strided accumulation of the remaining elements (inside a
// "while(i < n)" loop in the full string).
46 "        result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"
// Publish the private partial result to local memory, then barrier before the
// in-place tree reduction below reads neighbouring slots.
49 "    sdata[tid] = result;\n"
50 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
// Tree reduction in local memory, halving the active range each step.
// The outer "blockSize >= X" tests depend only on get_local_size(0), which is
// uniform across the work-group, so every work-item reaches each barrier.
51 "    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
52 "    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
53 "    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
54 "    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
55 "    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
56 "    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
57 "    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
58 "    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
59 "    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
// Group's partial result written to output[group id]; in the full string this
// line sits under "if(tid == 0)", so sdata[tid] is sdata[0].
62 "        output[get_group_id(0)] = sdata[tid];\n"
// Body fragment of getNumBlocksAndThreads() (definition at original line 99):
// picks a launch configuration for a reduction kernel in which each thread
// consumes two input elements (note the threads*2 factor below).
// Small inputs: use the next power of two >= ceil(n/2) threads; otherwise cap
// at the device/caller limit maxThreads.
101       threads = (n < maxThreads*2) ?
nextPow2((n + 1)/ 2) : maxThreads;
// Each block covers 2*threads elements; round up so the tail is included.
102       blocks = (n + (threads * 2 - 1)) / (threads * 2);
// Clamp to the caller-supplied block budget.
104       blocks =
MIN(maxBlocks, blocks);
// Fragments of the primary SharedMemory<T> utility (the classic CUDA-SDK
// wrapper around dynamically allocated shared memory): two conversion
// operators that expose the extern __shared__ buffer as T*.
// NOTE(review): the "return (T*)__smem;" lines and the enclosing struct
// declaration are missing from this excerpt (extraction artifact).
122     __device__
inline operator T*()
124         extern __shared__
int __smem[];
// const overload of the same conversion.
128     __device__
inline operator const T*()
const
130         extern __shared__
int __smem[];
// Specialization of SharedMemory for double (the "template <>" line is
// missing from this excerpt). It declares its own extern __shared__ array
// __smem_d of type double: redeclaring the generic int-typed __smem at a
// different element type would be invalid, and the separate symbol gives the
// buffer double alignment.
138 struct SharedMemory<double>
140     __device__
inline operator double*()
142         extern __shared__
double __smem_d[];
143         return (
double*)__smem_d;
// const overload returning the same dynamically allocated buffer.
146     __device__
inline operator const double*()
const
148         extern __shared__
double __smem_d[];
149         return (
double*)__smem_d;
// Fragments of ReduceKernel_CU_oldAndIncorrect (definition at original line
// 172). Per its own name this kernel is retained for reference but is known
// to be incorrect — prefer ReduceKernel_CU below.
// NOTE(review): the kernel signature, the "if/while (i < n)" accumulation
// wrappers, the "if (blockSize >= X)" guards and the __syncthreads() calls
// between reduction steps are missing from this excerpt.
171 template<
typename T,
typename BinaryFunc>
// Dynamically allocated shared memory, typed via the SharedMemory<T> helper.
174     T* sdata = SharedMemory<T>();
// Per-thread setup: block size, thread id, flat global index, grid stride.
176     size_t blockSize = blockDim.x;
177     size_t tid = threadIdx.x;
178     size_t i = blockIdx.x * blockSize + tid;
179     size_t gridSize = blockSize*gridDim.x;
// Grid-strided accumulation into the private result via the user functor's
// CUDA member (reduceFunc.CU).
190         result = reduceFunc.CU(result, input[i]);
// In-place tree reduction in shared memory, halving the stride each step.
// NOTE(review): the "tid + STRIDE < n" guards compare against the *global*
// element count n rather than the number of valid entries in this block's
// shared buffer — presumably part of why this version is marked incorrect.
200         if (tid < 256 && tid + 256 < n)
202             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]);
208         if (tid < 128 && tid + 128 < n)
210             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]);
216         if (tid < 64 && tid + 64 < n)
218             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]);
224         if (tid < 32 && tid + 32 < n)
226             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]);
232         if (tid < 16 && tid + 16 < n)
234             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]);
240         if (tid < 8 && tid + 8 < n)
242             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]);
248         if (tid < 4 && tid + 4 < n)
250             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]);
256         if (tid < 2 && tid + 2 < n)
258             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]);
264         if (tid < 1 && tid + 1 < n)
266             sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]);
// Block's partial result to output[blockIdx.x] (guarded by a tid==0 test in
// the full source, so sdata[tid] is sdata[0] here).
273         output[blockIdx.x] = sdata[tid];
// Fragments of ReduceKernel_CU (definition at original line 287): the
// CUDA-SDK-style reduction with blockSize as a compile-time template
// parameter and the "first add during load" optimization (each thread loads
// input[i] and input[i + blockSize] per stride step, hence the *2 factors).
// nIsPow2 == true lets the second load skip its bounds check.
// NOTE(review): the kernel signature, the accumulation while-loop, the inner
// "if (tid < X)" guards and the __syncthreads() calls between the >=512/256/
// 128 steps are missing from this excerpt.
286 template<
typename T,
typename BinaryFunc,
size_t blockSize,
bool nIsPow2>
289     T *sdata = SharedMemory<T>();
293     size_t tid = threadIdx.x;
294     size_t i = blockIdx.x*blockSize*2 + threadIdx.x;
295     size_t gridSize = blockSize*2*gridDim.x;
// Second element of the initial load; when n is a power of two the bounds
// check is compiled away.
306     if (nIsPow2 || i + blockSize < n)
307         result = reduceFunc.CU(result, input[i+blockSize]);
// Grid-strided loop body: two elements folded in per iteration.
316         result = reduceFunc.CU(result, input[i]);
318         if (nIsPow2 || i + blockSize < n)
319             result = reduceFunc.CU(result, input[i+blockSize]);
// Shared-memory tree reduction; blockSize is a template constant, so these
// branches are resolved at compile time.
330     if (blockSize >= 512)
334             sdata[tid] = result = reduceFunc.CU(result, sdata[tid + 256]);
338     if (blockSize >= 256)
342             sdata[tid] = result = reduceFunc.CU(result, sdata[tid + 128]);
346     if (blockSize >= 128)
350             sdata[tid] = result = reduceFunc.CU(result, sdata[tid + 64]);
// Final warp: legacy warp-synchronous tail through a volatile pointer, with
// no barriers between steps.
// NOTE(review): this pattern relies on implicit warp-lockstep execution and
// is not safe under Volta+ independent thread scheduling without
// __syncwarp(); flagged for review, left as-is here.
360         volatile T* smem = sdata;
363             smem[tid] = result = reduceFunc.CU(result, smem[tid + 32]);
367             smem[tid] = result = reduceFunc.CU(result, smem[tid + 16]);
371             smem[tid] = result = reduceFunc.CU(result, smem[tid + 8]);
375             smem[tid] = result = reduceFunc.CU(result, smem[tid + 4]);
379             smem[tid] = result = reduceFunc.CU(result, smem[tid + 2]);
383             smem[tid] = result = reduceFunc.CU(result, smem[tid + 1]);
// Thread 0 (guard not visible in this excerpt) writes the block's result.
389         output[blockIdx.x] = sdata[0];
/*!
 * \brief Determines whether \p x is a power of 2.
 *
 * \param x The value to test.
 * \return True iff \p x is an exact power of two.
 *
 * Signature reconstructed per the declaration "bool isPow2(size_t x)" shown
 * in this file's index; the original body was the bare return statement.
 */
bool isPow2(size_t x)
{
    // x & (x - 1) clears the lowest set bit, so it is 0 exactly when x has at
    // most one bit set. The x != 0 guard fixes the original expression
    // ((x&(x-1))==0), which incorrectly reported 0 as a power of two — and 0
    // would otherwise select the nIsPow2=true kernel path in CallReduceKernel.
    return (x != 0) && ((x & (x - 1)) == 0);
}
// Fragments of CallReduceKernel (definition at original line 429): host-side
// launcher that dispatches ReduceKernel_CU with a compile-time blockSize
// matching the runtime numThreads, and with nIsPow2 chosen from isPow2(size).
// NOTE(review): the enclosing braces and the switch/case lines selecting
// among the launches below are missing from this excerpt — presumably a
// "switch (numThreads)" as in the CUDA SDK reduction sample.
428 template <
typename ReduceFunc,
typename T>
429 void CallReduceKernel(ReduceFunc *reduceFunc,
size_t size,
size_t numThreads,
size_t numBlocks, T *d_idata, T *d_odata,
bool enableIsPow2=
true)
431     dim3 dimBlock(numThreads, 1, 1);
432     dim3 dimGrid(numBlocks, 1, 1);
// Shared-memory size: doubled when numThreads <= 32 because the kernel's
// warp-final stage reads smem[tid + 32] etc., which would otherwise run past
// the buffer for small blocks.
436     size_t smemSize = (numThreads <= 32) ? 2 * numThreads *
sizeof(T) : numThreads *
sizeof(T);
// Power-of-two input (when enabled): launch the nIsPow2=true instantiations,
// which skip the second-load bounds check.
439     if (
isPow2(size) && enableIsPow2)
444             ReduceKernel_CU<T, ReduceFunc, 512, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
447             ReduceKernel_CU<T, ReduceFunc, 256, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
450             ReduceKernel_CU<T, ReduceFunc, 128, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
453             ReduceKernel_CU<T, ReduceFunc, 64, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
456             ReduceKernel_CU<T, ReduceFunc, 32, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
459             ReduceKernel_CU<T, ReduceFunc, 16, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
462             ReduceKernel_CU<T, ReduceFunc, 8, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
465             ReduceKernel_CU<T, ReduceFunc, 4, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
468             ReduceKernel_CU<T, ReduceFunc, 2, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
471             ReduceKernel_CU<T, ReduceFunc, 1, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
// General case: nIsPow2=false instantiations with the bounds check kept.
480             ReduceKernel_CU<T, ReduceFunc, 512, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
483             ReduceKernel_CU<T, ReduceFunc, 256, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
486             ReduceKernel_CU<T, ReduceFunc, 128, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
489             ReduceKernel_CU<T, ReduceFunc, 64, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
492             ReduceKernel_CU<T, ReduceFunc, 32, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
495             ReduceKernel_CU<T, ReduceFunc, 16, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
498             ReduceKernel_CU<T, ReduceFunc, 8, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
501             ReduceKernel_CU<T, ReduceFunc, 4, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
504             ReduceKernel_CU<T, ReduceFunc, 2, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
507             ReduceKernel_CU<T, ReduceFunc, 1, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
// Stream-aware twin of CallReduceKernel, compiled only when pinned host
// memory (and thus async copies) is in use. Identical dispatch logic; the
// only difference is the fourth launch-config argument: every kernel is
// enqueued on the caller-supplied CUDA stream instead of the default stream.
// NOTE(review): braces and switch/case lines are missing from this excerpt,
// as in CallReduceKernel above.
516 #ifdef USE_PINNED_MEMORY
529 template <
typename ReduceFunc,
typename T>
530 void CallReduceKernel_WithStream(ReduceFunc *reduceFunc,
size_t size,
size_t numThreads,
size_t numBlocks, T *d_idata, T *d_odata, cudaStream_t &stream,
bool enableIsPow2=
true)
532     dim3 dimBlock(numThreads, 1, 1);
533     dim3 dimGrid(numBlocks, 1, 1);
// Doubled shared memory for <=32-thread blocks, matching the kernel's
// warp-stage reads beyond blockDim (see CallReduceKernel).
537     size_t smemSize = (numThreads <= 32) ? 2 * numThreads *
sizeof(T) : numThreads *
sizeof(T);
540     if (
isPow2(size) && enableIsPow2)
545             ReduceKernel_CU<T, ReduceFunc, 512, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
548             ReduceKernel_CU<T, ReduceFunc, 256, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
551             ReduceKernel_CU<T, ReduceFunc, 128, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
554             ReduceKernel_CU<T, ReduceFunc, 64, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
557             ReduceKernel_CU<T, ReduceFunc, 32, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
560             ReduceKernel_CU<T, ReduceFunc, 16, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
563             ReduceKernel_CU<T, ReduceFunc, 8, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
566             ReduceKernel_CU<T, ReduceFunc, 4, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
569             ReduceKernel_CU<T, ReduceFunc, 2, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
572             ReduceKernel_CU<T, ReduceFunc, 1, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
// Non-power-of-two path: instantiations keep the second-load bounds check.
581             ReduceKernel_CU<T, ReduceFunc, 512, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
584             ReduceKernel_CU<T, ReduceFunc, 256, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
587             ReduceKernel_CU<T, ReduceFunc, 128, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
590             ReduceKernel_CU<T, ReduceFunc, 64, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
593             ReduceKernel_CU<T, ReduceFunc, 32, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
596             ReduceKernel_CU<T, ReduceFunc, 16, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
599             ReduceKernel_CU<T, ReduceFunc, 8, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
602             ReduceKernel_CU<T, ReduceFunc, 4, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
605             ReduceKernel_CU<T, ReduceFunc, 2, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
608             ReduceKernel_CU<T, ReduceFunc, 1, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
/*!
 * Computes a launch configuration (\p blocks, \p threads) for reducing \p n
 * elements with a kernel that consumes two elements per thread, honouring the
 * caller-supplied upper limits \p maxBlocks and \p maxThreads.
 * (Reformatted: the original's braces were lost to extraction; logic is
 * unchanged.)
 */
void getNumBlocksAndThreads(size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads)
{
    // Small inputs: round half the element count up to the next power of two
    // so the tree reduction stays well-formed; otherwise use the thread cap.
    if (n < maxThreads * 2)
        threads = nextPow2((n + 1) / 2);
    else
        threads = maxThreads;

    // Every block covers 2*threads elements; round up so the tail is counted,
    // then clamp to the caller's block budget.
    size_t elemsPerBlock = threads * 2;
    blocks = (n + elemsPerBlock - 1) / elemsPerBlock;
    blocks = MIN(maxBlocks, blocks);
}
// Stream-taking overload of ExecuteReduceOnADevice, compiled only under
// USE_PINNED_MEMORY: orchestrates a full reduction of n elements on one
// device, forwarding the caller's CUDA stream to the kernel launchers.
// NOTE(review): only the signature survives in this excerpt; the body
// (presumably the same multi-pass logic as the overload below) is missing.
659 #ifdef USE_PINNED_MEMORY
676 template <
typename ReduceFunc,
typename T>
677 void ExecuteReduceOnADevice(ReduceFunc *reduceFunc,
size_t n,
size_t numThreads,
size_t numBlocks,
size_t maxThreads,
size_t maxBlocks, T* d_idata, T* d_odata,
unsigned int deviceID, cudaStream_t &stream,
bool enableIsPow2=
true)
// Fragments of ExecuteReduceOnADevice (definition at original line 695):
// drives a complete multi-pass reduction on one device. First pass reduces
// d_idata into per-block partials in d_odata; subsequent passes re-reduce
// d_odata in place until a single value remains.
// NOTE(review): braces, the #else/#endif lines, and the loop header that
// repeats the second pass are missing from this excerpt; within the
// USE_PINNED_MEMORY branches a `stream` variable is referenced, visible only
// in the stream-taking overload's signature.
694 template <
typename ReduceFunc,
typename T>
695 void ExecuteReduceOnADevice(ReduceFunc *reduceFunc,
size_t n,
size_t numThreads,
size_t numBlocks,
size_t maxThreads,
size_t maxBlocks, T* d_idata, T* d_odata,
unsigned int deviceID,
bool enableIsPow2=
true)
// First pass: n input elements -> numBlocks partial results in d_odata.
699 #ifdef USE_PINNED_MEMORY
700     CallReduceKernel_WithStream<ReduceFunc, T>(reduceFunc, n, numThreads, numBlocks, d_idata, d_odata, stream, enableIsPow2);
702     CallReduceKernel<ReduceFunc, T>(reduceFunc, n, numThreads, numBlocks, d_idata, d_odata, enableIsPow2);
// Per-pass launch configuration, recomputed (via getNumBlocksAndThreads in
// the missing lines) for the shrinking partial array.
710     size_t threads = 0, blocks = 0;
// Re-reduction pass: d_odata is both input and output.
713 #ifdef USE_PINNED_MEMORY
714         CallReduceKernel_WithStream<ReduceFunc, T>(reduceFunc, s, threads, blocks, d_odata, d_odata, stream, enableIsPow2);
716         CallReduceKernel<ReduceFunc, T>(reduceFunc, s, threads, blocks, d_odata, d_odata, enableIsPow2);
// Elements remaining after a pass: each block folded 2*threads elements.
719         s = (s + (threads*2-1)) / (threads*2);
#define MIN(a, b)
CUT bool type.
Definition: skepu_cuda_helpers.h:42
__global__ void ReduceKernel_CU_oldAndIncorrect(BinaryFunc reduceFunc, T *input, T *output, size_t n)
Definition: reduce_kernels.h:172
void CallReduceKernel(ReduceFunc *reduceFunc, size_t size, size_t numThreads, size_t numBlocks, T *d_idata, T *d_odata, bool enableIsPow2=true)
Definition: reduce_kernels.h:429
__global__ void ReduceKernel_CU(BinaryFunc reduceFunc, T *input, T *output, size_t n)
Definition: reduce_kernels.h:287
static std::string ReduceKernel_CL("__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata)\n""{\n"" size_t blockSize = get_local_size(0);\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"" size_t gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = input[i];\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); 
}\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
size_t nextPow2(size_t x)
A helper to return a value that is nearest value that is power of 2.
Definition: reduce_kernels.h:74
bool isPow2(size_t x)
A small helper to determine whether the number is a power of 2.
Definition: reduce_kernels.h:411
void ExecuteReduceOnADevice(ReduceFunc *reduceFunc, size_t n, size_t numThreads, size_t numBlocks, size_t maxThreads, size_t maxBlocks, T *d_idata, T *d_odata, unsigned int deviceID, bool enableIsPow2=true)
Definition: reduce_kernels.h:695
void getNumBlocksAndThreads(size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads)
Definition: reduce_kernels.h:99