33 "__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n"
35 " unsigned int threadIdx = get_local_id(0);\n"
36 " unsigned int blockDim = get_local_size(0);\n"
37 " unsigned int blockIdx = get_group_id(0);\n"
38 " unsigned int gridDim = get_num_groups(0);\n"
39 " int thid = threadIdx;\n"
42 " int mem = get_global_id(0);\n"
43 " int blockNr = blockIdx;\n"
44 " unsigned int gridSize = blockDim*gridDim;\n"
45 " unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"
47 " while(blockNr < numBlocks)\n"
49 " sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"
50 " barrier(CLK_LOCAL_MEM_FENCE);\n"
51 " for(offset = 1; offset < n; offset *=2)\n"
55 " if(thid >= offset)\n"
56 " sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"
58 " sdata[pout*n+thid] = sdata[pin*n+thid];\n"
59 " barrier(CLK_LOCAL_MEM_FENCE);\n"
61 " if(thid == blockDim - 1)\n"
62 " blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"
63 " if(mem < numElements)\n"
64 " output[mem] = sdata[pout*n+thid];\n"
66 " blockNr += gridDim;\n"
67 " barrier(CLK_LOCAL_MEM_FENCE);\n"
78 "__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n"
80 " __local TYPE offset;\n"
81 " __local TYPE inc_offset;\n"
82 " unsigned int threadIdx = get_local_id(0);\n"
83 " unsigned int blockDim = get_local_size(0);\n"
84 " unsigned int blockIdx = get_group_id(0);\n"
85 " unsigned int gridDim = get_num_groups(0);\n"
86 " int thid = threadIdx;\n"
87 " int blockNr = blockIdx;\n"
88 " unsigned int gridSize = blockDim*gridDim;\n"
89 " int mem = get_global_id(0);\n"
90 " unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"
91 " while(blockNr < numBlocks)\n"
95 " if(isInclusive == 0)\n"
100 " offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"
101 " inc_offset = sums[blockNr-1];\n"
107 " offset = sums[blockNr-1];\n"
110 " barrier(CLK_LOCAL_MEM_FENCE);\n"
111 " if(isInclusive == 1)\n"
114 " sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"
116 " sdata[thid] = (mem < n) ? data[mem] : 0;\n"
118 " *ret = sdata[thid];\n"
123 " *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"
125 " sdata[thid] = offset;\n"
127 " sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"
129 " barrier(CLK_LOCAL_MEM_FENCE);\n"
131 " data[mem] = sdata[thid];\n"
132 " mem += gridSize;\n"
133 " blockNr += gridDim;\n"
134 " barrier(CLK_LOCAL_MEM_FENCE);\n"
144 "__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n"
146 " int i = get_global_id(0);\n"
147 " unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"
150 " data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"
186 template<
typename T,
typename BinaryFunc>
187 __global__
void ScanKernel_CU(BinaryFunc scanFunc, T* input, T* output, T* blockSums,
unsigned int n,
unsigned int numElements)
189 extern __shared__
char _sdata[];
190 T* sdata =
reinterpret_cast<T*
>(_sdata);
192 int thid = threadIdx.x;
195 int mem = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
196 int blockNr = blockIdx.x;
197 unsigned int gridSize = blockDim.x*gridDim.x;
198 unsigned int numBlocks = numElements/(blockDim.x) + (numElements%(blockDim.x) == 0 ? 0:1);
201 while(blockNr < numBlocks)
203 sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;
207 for(offset = 1; offset < n; offset *=2)
212 sdata[pout*n+thid] = scanFunc.CU(sdata[pin*n+thid], sdata[pin*n+thid-offset]);
214 sdata[pout*n+thid] = sdata[pin*n+thid];
218 if(thid == blockDim.x - 1)
219 blockSums[blockNr] = sdata[pout*n+blockDim.x-1];
221 if(mem < numElements)
222 output[mem] = sdata[pout*n+thid];
225 blockNr += gridDim.x;
236 template <
typename T,
typename BinaryFunc>
237 __global__
void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums,
int isInclusive, T init,
int n, T *ret)
239 extern __shared__
char _sdata[];
240 T* sdata =
reinterpret_cast<T*
>(_sdata);
243 __shared__ T inc_offset;
245 int thid = threadIdx.x;
246 int blockNr = blockIdx.x;
247 unsigned int gridSize = blockDim.x*gridDim.x;
248 int mem = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
249 unsigned int numBlocks = n/(blockDim.x) + (n%(blockDim.x) == 0 ? 0:1);
251 while(blockNr < numBlocks)
260 offset = scanFunc.CU(offset, sums[blockNr-1]);
261 inc_offset = sums[blockNr-1];
267 offset = sums[blockNr-1];
276 sdata[thid] = (mem < n) ? scanFunc.CU(offset, data[mem]) : 0;
278 sdata[thid] = (mem < n) ? data[mem] : 0;
285 *ret = scanFunc.CU(inc_offset, data[mem]);
287 sdata[thid] = offset;
289 sdata[thid] = (mem-1 < n) ? scanFunc.CU(offset, data[mem-1]) : 0;
295 data[mem] = sdata[thid];
298 blockNr += gridDim.x;
308 template <
typename T,
typename BinaryFunc>
309 __global__
void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum,
int n)
311 int i = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
312 unsigned int gridSize = blockDim.x*gridDim.x;
316 data[i] = scanFunc.CU(data[i], sum);
__global__ void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum, int n)
Definition: scan_kernels.h:309
__global__ void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, int n, T *ret)
Definition: scan_kernels.h:237
static std::string ScanAdd_CL("__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"" i += gridSize;\n"" }\n""}\n")
static std::string ScanKernel_CL("__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n""{\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int pout = 0;\n"" int pin = 1;\n"" int mem = get_global_id(0);\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"" int offset;\n"" while(blockNr < numBlocks)\n"" {\n"" sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" for(offset = 1; offset < n; offset *=2)\n"" {\n"" pout = 1-pout;\n"" pin = 1-pout;\n"" if(thid >= offset)\n"" sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"" else\n"" sdata[pout*n+thid] = sdata[pin*n+thid];\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n"" if(thid == blockDim - 1)\n"" blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"" if(mem < numElements)\n"" output[mem] = sdata[pout*n+thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
static std::string ScanUpdate_CL("__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n""{\n"" __local TYPE offset;\n"" __local TYPE inc_offset;\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" int mem = get_global_id(0);\n"" unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"" while(blockNr < numBlocks)\n"" {\n"" if(thid == 0)\n"" {\n"" if(isInclusive == 0)\n"" {\n"" offset = init;\n"" if(blockNr > 0)\n"" {\n"" offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"" inc_offset = sums[blockNr-1];\n"" }\n"" }\n"" else\n"" {\n"" if(blockNr > 0)\n"" offset = sums[blockNr-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(isInclusive == 1)\n"" {\n"" if(blockNr > 0)\n"" sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"" else\n"" sdata[thid] = (mem < n) ? data[mem] : 0;\n"" if(mem == n-1)\n"" *ret = sdata[thid];\n"" }\n"" else\n"" {\n"" if(mem == n-1)\n"" *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"" if(thid == 0)\n"" sdata[thid] = offset;\n"" else\n"" sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(mem < n)\n"" data[mem] = sdata[thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
__global__ void ScanKernel_CU(BinaryFunc scanFunc, T *input, T *output, T *blockSums, unsigned int n, unsigned int numElements)
Definition: scan_kernels.h:187