5 #ifndef MAPREDUCE_KERNELS_H
6 #define MAPREDUCE_KERNELS_H
/*!
 *  OpenCL source for the unary MapReduce kernel. The placeholders KERNELNAME,
 *  TYPE, FUNCTIONNAME_MAP and FUNCTIONNAME_REDUCE are textually substituted
 *  before the program is built. Each work-group writes one partial reduction
 *  result to output[get_group_id(0)]; the host finishes the reduction.
 *  NOTE(review): the intra-group reduction guards with "tid + s < n" rather
 *  than the count of values actually held in local memory — presumably safe
 *  for the launch configurations the host uses; verify against the caller.
 */
static std::string UnaryMapReduceKernel_CL(
"__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
"{\n"
"    unsigned int blockSize = get_local_size(0);\n"
"    unsigned int tid = get_local_id(0);\n"
"    unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
"    unsigned int gridSize = blockSize*get_num_groups(0);\n"
"    TYPE result = 0;\n"
"    if(i < n)\n"
"    {\n"
"        result = FUNCTIONNAME_MAP(input[i], const1);\n"
"        i += gridSize;\n"
"    }\n"
"    while(i < n)\n"
"    {\n"
"        TYPE tempMap;\n"
"        tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"
"        result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
"        i += gridSize;\n"
"    }\n"
"    sdata[tid] = result;\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(tid == 0)\n"
"    {\n"
"        output[get_group_id(0)] = sdata[tid];\n"
"    }\n"
"}\n"
);
/*!
 *  OpenCL source for the binary MapReduce kernel: maps element-wise over two
 *  input vectors, then reduces. The placeholders KERNELNAME, TYPE,
 *  FUNCTIONNAME_MAP and FUNCTIONNAME_REDUCE are substituted before the
 *  program is built. Each work-group writes one partial result to
 *  output[get_group_id(0)]; the host finishes the reduction.
 */
static std::string BinaryMapReduceKernel_CL(
"__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
"{\n"
"    unsigned int blockSize = get_local_size(0);\n"
"    unsigned int tid = get_local_id(0);\n"
"    unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
"    unsigned int gridSize = blockSize*get_num_groups(0);\n"
"    TYPE result = 0;\n"
"    if(i < n)\n"
"    {\n"
"        result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
"        i += gridSize;\n"
"    }\n"
"    while(i < n)\n"
"    {\n"
"        TYPE tempMap;\n"
"        tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
"        result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
"        i += gridSize;\n"
"    }\n"
"    sdata[tid] = result;\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(tid == 0)\n"
"    {\n"
"        output[get_group_id(0)] = sdata[tid];\n"
"    }\n"
"}\n"
);
/*!
 *  OpenCL source for the trinary MapReduce kernel: maps element-wise over
 *  three input vectors, then reduces. The placeholders KERNELNAME, TYPE,
 *  FUNCTIONNAME_MAP and FUNCTIONNAME_REDUCE are substituted before the
 *  program is built. Each work-group writes one partial result to
 *  output[get_group_id(0)]; the host finishes the reduction.
 */
static std::string TrinaryMapReduceKernel_CL(
"__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
"{\n"
"    unsigned int blockSize = get_local_size(0);\n"
"    unsigned int tid = get_local_id(0);\n"
"    unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
"    unsigned int gridSize = blockSize*get_num_groups(0);\n"
"    TYPE result = 0;\n"
"    if(i < n)\n"
"    {\n"
"        result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
"        i += gridSize;\n"
"    }\n"
"    while(i < n)\n"
"    {\n"
"        TYPE tempMap;\n"
"        tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
"        result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
"        i += gridSize;\n"
"    }\n"
"    sdata[tid] = result;\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
"    if(tid == 0)\n"
"    {\n"
"        output[get_group_id(0)] = sdata[tid];\n"
"    }\n"
"}\n"
);
187 template <
typename T,
typename UnaryFunc,
typename BinaryFunc>
188 __global__
void MapReduceKernel1_CU(UnaryFunc mapFunc, BinaryFunc reduceFunc, T* input, T* output,
unsigned int n)
190 extern __shared__
char _sdata[];
191 T* sdata =
reinterpret_cast<T*
>(_sdata);
193 unsigned int blockSize = blockDim.x;
194 unsigned int tid = threadIdx.x;
195 unsigned int i = blockIdx.x * blockSize + tid;
196 unsigned int gridSize = blockSize*gridDim.x;
201 result = mapFunc.CU(input[i]);
208 tempMap = mapFunc.CU(input[i]);
209 result = reduceFunc.CU(result, tempMap);
217 if(blockSize >= 512) {
if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
218 if(blockSize >= 256) {
if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
219 if(blockSize >= 128) {
if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
220 if(blockSize >= 64) {
if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
221 if(blockSize >= 32) {
if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
222 if(blockSize >= 16) {
if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
223 if(blockSize >= 8) {
if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
224 if(blockSize >= 4) {
if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
225 if(blockSize >= 2) {
if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
229 output[blockIdx.x] = sdata[tid];
238 template <
typename T,
typename BinaryFunc1,
typename BinaryFunc2>
239 __global__
void MapReduceKernel2_CU(BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T* input1, T* input2, T* output,
unsigned int n)
241 extern __shared__
char _sdata[];
242 T* sdata =
reinterpret_cast<T*
>(_sdata);
244 unsigned int blockSize = blockDim.x;
245 unsigned int tid = threadIdx.x;
246 unsigned int i = blockIdx.x * blockSize + tid;
247 unsigned int gridSize = blockSize*gridDim.x;
252 result = mapFunc.CU(input1[i], input2[i]);
259 tempMap = mapFunc.CU(input1[i], input2[i]);
260 result = reduceFunc.CU(result, tempMap);
268 if(blockSize >= 512) {
if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
269 if(blockSize >= 256) {
if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
270 if(blockSize >= 128) {
if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
271 if(blockSize >= 64) {
if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
272 if(blockSize >= 32) {
if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
273 if(blockSize >= 16) {
if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
274 if(blockSize >= 8) {
if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
275 if(blockSize >= 4) {
if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
276 if(blockSize >= 2) {
if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
280 output[blockIdx.x] = sdata[tid];
289 template <
typename T,
typename TrinaryFunc,
typename BinaryFunc>
290 __global__
void MapReduceKernel3_CU(TrinaryFunc mapFunc, BinaryFunc reduceFunc, T* input1, T* input2, T* input3, T* output,
unsigned int n)
292 extern __shared__
char _sdata[];
293 T* sdata =
reinterpret_cast<T*
>(_sdata);
295 unsigned int blockSize = blockDim.x;
296 unsigned int tid = threadIdx.x;
297 unsigned int i = blockIdx.x * blockSize + tid;
298 unsigned int gridSize = blockSize*gridDim.x;
303 result = mapFunc.CU(input1[i], input2[i], input3[i]);
310 tempMap = mapFunc.CU(input1[i], input2[i], input3[i]);
311 result = reduceFunc.CU(result, tempMap);
319 if(blockSize >= 512) {
if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
320 if(blockSize >= 256) {
if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
321 if(blockSize >= 128) {
if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
322 if(blockSize >= 64) {
if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
323 if(blockSize >= 32) {
if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
324 if(blockSize >= 16) {
if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
325 if(blockSize >= 8) {
if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
326 if(blockSize >= 4) {
if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
327 if(blockSize >= 2) {
if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
331 output[blockIdx.x] = sdata[tid];
__global__ void MapReduceKernel3_CU(TrinaryFunc mapFunc, BinaryFunc reduceFunc, T *input1, T *input2, T *input3, T *output, unsigned int n)
Definition: mapreduce_kernels.h:290
__global__ void MapReduceKernel2_CU(BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T *input1, T *input2, T *output, unsigned int n)
Definition: mapreduce_kernels.h:239
static std::string UnaryMapReduceKernel_CL("__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = 
FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
static std::string BinaryMapReduceKernel_CL("__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 
< n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
__global__ void MapReduceKernel1_CU(UnaryFunc mapFunc, BinaryFunc reduceFunc, T *input, T *output, unsigned int n)
Definition: mapreduce_kernels.h:188
static std::string TrinaryMapReduceKernel_CL("__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); 
}\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")