SkePU (integrated with StarPU)  0.8.1
mapreduce_kernels.h
/*!
 * \file mapreduce_kernels.h
 * \brief Contains the OpenCL kernel sources and CUDA kernels for the MapReduce skeleton.
 */
#ifndef MAPREDUCE_KERNELS_H
#define MAPREDUCE_KERNELS_H

#ifdef SKEPU_OPENCL

#include <string>

namespace skepu
{

/*!
 * OpenCL source of the MapReduce kernel for a unary (one-input) user map function.
 * The placeholders KERNELNAME, TYPE, FUNCTIONNAME_MAP and FUNCTIONNAME_REDUCE are
 * substituted with the user function's names and element type before compilation.
 */
static std::string UnaryMapReduceKernel_CL
(
"__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
"{\n"
" unsigned int blockSize = get_local_size(0);\n"
" unsigned int tid = get_local_id(0);\n"
" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
" unsigned int gridSize = blockSize*get_num_groups(0);\n"
" TYPE result = 0;\n"
" if(i < n)\n"
" {\n"
" result = FUNCTIONNAME_MAP(input[i], const1);\n"
" i += gridSize;\n"
" }\n"
" while(i < n)\n"
" {\n"
" TYPE tempMap;\n"
" tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"
" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
" i += gridSize;\n"
" }\n"
" sdata[tid] = result;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(tid == 0)\n"
" {\n"
" output[get_group_id(0)] = sdata[tid];\n"
" }\n"
"}\n"
);
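
This string is not compilable as written: before the source is handed to the OpenCL compiler, the KERNELNAME, TYPE, FUNCTIONNAME_MAP and FUNCTIONNAME_REDUCE placeholders are substituted with the user function's names and element type, and the user-function definitions are prepended. A minimal sketch of that substitution step, using a hypothetical replaceAll helper and illustrative function names rather than SkePU's actual internals:

#include <string>

// Hypothetical helper: replace every occurrence of 'from' in 'src' with 'to'.
static std::string replaceAll(std::string src, const std::string& from, const std::string& to)
{
    for (std::string::size_type pos = 0; (pos = src.find(from, pos)) != std::string::npos; pos += to.size())
        src.replace(pos, from.size(), to);
    return src;
}

// Sketch only: specialize the unary kernel for float with user functions
// "square" (map) and "plus" (reduce); all names here are illustrative.
std::string buildUnaryMapReduceSource()
{
    std::string src = UnaryMapReduceKernel_CL;
    src = replaceAll(src, "KERNELNAME", "square_plus");
    src = replaceAll(src, "FUNCTIONNAME_MAP", "square");
    src = replaceAll(src, "FUNCTIONNAME_REDUCE", "plus");
    src = replaceAll(src, "TYPE", "float");
    return src; // the generated user-function code is prepended, then the result
                // goes to clCreateProgramWithSource/clBuildProgram
}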

/*!
 * OpenCL source of the MapReduce kernel for a binary (two-input) user map function;
 * the placeholders are substituted as for the unary variant.
 */
static std::string BinaryMapReduceKernel_CL
(
"__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
"{\n"
" unsigned int blockSize = get_local_size(0);\n"
" unsigned int tid = get_local_id(0);\n"
" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
" unsigned int gridSize = blockSize*get_num_groups(0);\n"
" TYPE result = 0;\n"
" if(i < n)\n"
" {\n"
" result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
" i += gridSize;\n"
" }\n"
" while(i < n)\n"
" {\n"
" TYPE tempMap;\n"
" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
" i += gridSize;\n"
" }\n"
" sdata[tid] = result;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(tid == 0)\n"
" {\n"
" output[get_group_id(0)] = sdata[tid];\n"
" }\n"
"}\n"
);

/*!
 * OpenCL source of the MapReduce kernel for a trinary (three-input) user map function;
 * the placeholders are substituted as for the unary variant.
 */
static std::string TrinaryMapReduceKernel_CL
(
"__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
"{\n"
" unsigned int blockSize = get_local_size(0);\n"
" unsigned int tid = get_local_id(0);\n"
" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
" unsigned int gridSize = blockSize*get_num_groups(0);\n"
" TYPE result = 0;\n"
" if(i < n)\n"
" {\n"
" result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
" i += gridSize;\n"
" }\n"
" while(i < n)\n"
" {\n"
" TYPE tempMap;\n"
" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
" i += gridSize;\n"
" }\n"
" sdata[tid] = result;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(tid == 0)\n"
" {\n"
" output[get_group_id(0)] = sdata[tid];\n"
" }\n"
"}\n"
);
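
All three kernel strings share the same launch interface: a __local scratch buffer holding one element per work-item, and one partial result written per work-group, which the host (or a follow-up reduce kernel) must still combine. A sketch of wiring up those arguments for a cl_kernel built from the unary source above; the queue, kernel and buffer handles are assumed to exist already, the launch sizes are illustrative, and error checking is omitted:

#include <CL/cl.h>

/* Sketch only: enqueue a MapReduce kernel specialized for float. 'd_partials'
   receives one value per work-group and still needs a final reduction. */
void enqueueUnaryMapReduce(cl_command_queue queue, cl_kernel kernel,
                           cl_mem d_input, cl_mem d_partials, cl_uint n)
{
    size_t localSize  = 256;                       /* work-group size (blockSize) */
    size_t numGroups  = 64;                        /* assumed group count         */
    size_t globalSize = localSize * numGroups;
    cl_float const1   = 0.0f;                      /* user constant (const1)      */

    clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_partials);
    clSetKernelArg(kernel, 2, sizeof(cl_uint), &n);
    clSetKernelArg(kernel, 3, localSize * sizeof(cl_float), NULL); /* __local TYPE* sdata */
    clSetKernelArg(kernel, 4, sizeof(cl_float), &const1);

    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
}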

}

#endif

#ifdef SKEPU_CUDA

namespace skepu
{

/*!
 * CUDA kernel for MapReduce with a unary user map function. Each thread maps and
 * reduces a strided range of the input, the block then reduces in shared memory,
 * and thread 0 writes one partial result per block to output[blockIdx.x].
 */
template <typename T, typename UnaryFunc, typename BinaryFunc>
__global__ void MapReduceKernel1_CU(UnaryFunc mapFunc, BinaryFunc reduceFunc, T* input, T* output, unsigned int n)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata);

    unsigned int blockSize = blockDim.x;
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize + tid;
    unsigned int gridSize = blockSize*gridDim.x;
    T result = 0;

    if(i < n)
    {
        result = mapFunc.CU(input[i]);
        i += gridSize;
    }

    while(i < n)
    {
        T tempMap;
        tempMap = mapFunc.CU(input[i]);
        result = reduceFunc.CU(result, tempMap);
        i += gridSize;
    }

    sdata[tid] = result;

    __syncthreads();

    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }

    if(tid == 0)
    {
        output[blockIdx.x] = sdata[tid];
    }
}
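
mapFunc and reduceFunc are user-function objects whose CU() members run on the device; each block writes only a partial result to output[blockIdx.x], so a second reduction pass is still needed. A minimal launch sketch with hand-written stand-in functors, an assumed block/grid configuration and a host-side final reduction (none of which is SkePU's actual scheduling):

#include <algorithm>
#include <vector>

// Hand-written stand-ins for the generated user-function objects: they only
// need to expose the CU() device member that the kernel calls.
struct Square { __device__ float CU(float x) { return x * x; } };
struct Plus   { __device__ float CU(float a, float b) { return a + b; } };

// Computes sum(x*x) over a device array; 'd_input' must already reside on the GPU.
float sumOfSquares(float* d_input, unsigned int n)
{
    const unsigned int blockSize = 256;
    const unsigned int numBlocks = std::min(64u, (n + blockSize - 1) / blockSize);

    float* d_partials;
    cudaMalloc(&d_partials, numBlocks * sizeof(float));

    // Third launch parameter: shared memory for blockSize elements of T.
    MapReduceKernel1_CU<<<numBlocks, blockSize, blockSize * sizeof(float)>>>(
        Square(), Plus(), d_input, d_partials, n);

    // Combine the per-block partials on the host (a second kernel pass works too).
    std::vector<float> partials(numBlocks);
    cudaMemcpy(partials.data(), d_partials, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_partials);

    float result = 0.0f;
    for (unsigned int b = 0; b < numBlocks; ++b)
        result += partials[b];
    return result;
}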

/*!
 * CUDA kernel for MapReduce with a binary (two-input) user map function;
 * structurally identical to MapReduceKernel1_CU.
 */
template <typename T, typename BinaryFunc1, typename BinaryFunc2>
__global__ void MapReduceKernel2_CU(BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T* input1, T* input2, T* output, unsigned int n)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata);

    unsigned int blockSize = blockDim.x;
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize + tid;
    unsigned int gridSize = blockSize*gridDim.x;
    T result = 0;

    if(i < n)
    {
        result = mapFunc.CU(input1[i], input2[i]);
        i += gridSize;
    }

    while(i < n)
    {
        T tempMap;
        tempMap = mapFunc.CU(input1[i], input2[i]);
        result = reduceFunc.CU(result, tempMap);
        i += gridSize;
    }

    sdata[tid] = result;

    __syncthreads();

    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }

    if(tid == 0)
    {
        output[blockIdx.x] = sdata[tid];
    }
}

/*!
 * CUDA kernel for MapReduce with a trinary (three-input) user map function;
 * structurally identical to MapReduceKernel1_CU.
 */
template <typename T, typename TrinaryFunc, typename BinaryFunc>
__global__ void MapReduceKernel3_CU(TrinaryFunc mapFunc, BinaryFunc reduceFunc, T* input1, T* input2, T* input3, T* output, unsigned int n)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata);

    unsigned int blockSize = blockDim.x;
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize + tid;
    unsigned int gridSize = blockSize*gridDim.x;
    T result = 0;

    if(i < n)
    {
        result = mapFunc.CU(input1[i], input2[i], input3[i]);
        i += gridSize;
    }

    while(i < n)
    {
        T tempMap;
        tempMap = mapFunc.CU(input1[i], input2[i], input3[i]);
        result = reduceFunc.CU(result, tempMap);
        i += gridSize;
    }

    sdata[tid] = result;

    __syncthreads();

    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }

    if(tid == 0)
    {
        output[blockIdx.x] = sdata[tid];
    }
}

}

#endif

#endif