SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
mapreduce_kernels.h
Go to the documentation of this file.
1 
5 #ifndef MAPREDUCE_KERNELS_H
6 #define MAPREDUCE_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 #include <string>
11 
12 namespace skepu
13 {
14 
31 static std::string UnaryMapReduceKernel_CL
32 (
33  "__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata, CONST_TYPE const1)\n"
34  "{\n"
35  " size_t blockSize = get_local_size(0);\n"
36  " size_t tid = get_local_id(0);\n"
37  " size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"
38  " size_t gridSize = blockSize*get_num_groups(0);\n"
39  " TYPE result = 0;\n"
40  " if(i < n)\n"
41  " {\n"
42  " result = FUNCTIONNAME_MAP(input[i], const1);\n"
43  " i += gridSize;\n"
44  " }\n"
45  " while(i < n)\n"
46  " {\n"
47  " TYPE tempMap;\n"
48  " tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"
49  " result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
50  " i += gridSize;\n"
51  " }\n"
52  " sdata[tid] = result;\n"
53  " barrier(CLK_LOCAL_MEM_FENCE);\n"
54  " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
55  " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
56  " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
57  " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
58  " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
59  " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
60  " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
61  " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
62  " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
63  " if(tid == 0)\n"
64  " {\n"
65  " output[get_group_id(0)] = sdata[tid];\n"
66  " }\n"
67  "}\n"
68 );
69 
75 static std::string BinaryMapReduceKernel_CL
76 (
77  "__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, size_t n, __local TYPE* sdata, CONST_TYPE const1)\n"
78  "{\n"
79  " size_t blockSize = get_local_size(0);\n"
80  " size_t tid = get_local_id(0);\n"
81  " size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"
82  " size_t gridSize = blockSize*get_num_groups(0);\n"
83  " TYPE result = 0;\n"
84  " if(i < n)\n"
85  " {\n"
86  " result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
87  " i += gridSize;\n"
88  " }\n"
89  " while(i < n)\n"
90  " {\n"
91  " TYPE tempMap;\n"
92  " tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
93  " result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
94  " i += gridSize;\n"
95  " }\n"
96  " sdata[tid] = result;\n"
97  " barrier(CLK_LOCAL_MEM_FENCE);\n"
98  " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
99  " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
100  " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
101  " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
102  " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
103  " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
104  " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
105  " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
106  " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
107  " if(tid == 0)\n"
108  " {\n"
109  " output[get_group_id(0)] = sdata[tid];\n"
110  " }\n"
111  "}\n"
112 );
113 
119 static std::string TrinaryMapReduceKernel_CL
120 (
121  "__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, size_t n, __local TYPE* sdata, CONST_TYPE const1)\n"
122  "{\n"
123  " size_t blockSize = get_local_size(0);\n"
124  " size_t tid = get_local_id(0);\n"
125  " size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"
126  " size_t gridSize = blockSize*get_num_groups(0);\n"
127  " TYPE result = 0;\n"
128  " if(i < n)\n"
129  " {\n"
130  " result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
131  " i += gridSize;\n"
132  " }\n"
133  " while(i < n)\n"
134  " {\n"
135  " TYPE tempMap;\n"
136  " tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
137  " result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
138  " i += gridSize;\n"
139  " }\n"
140  " sdata[tid] = result;\n"
141  " barrier(CLK_LOCAL_MEM_FENCE);\n"
142  " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
143  " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
144  " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
145  " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
146  " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
147  " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
148  " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
149  " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
150  " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
151  " if(tid == 0)\n"
152  " {\n"
153  " output[get_group_id(0)] = sdata[tid];\n"
154  " }\n"
155  "}\n"
156 );
157 
162 }
163 
164 #endif
165 
166 #ifdef SKEPU_CUDA
167 
168 namespace skepu
169 {
170 
187 template <typename T, typename UnaryFunc, typename BinaryFunc>
188 __global__ void MapReduceKernel1_CU(UnaryFunc mapFunc, BinaryFunc reduceFunc, T* input, T* output, size_t n)
189 {
190  extern __shared__ char _sdata[];
191  T* sdata = reinterpret_cast<T*>(_sdata);
192 
193  size_t blockSize = blockDim.x;
194  size_t tid = threadIdx.x;
195  size_t i = blockIdx.x * blockSize + tid;
196  size_t gridSize = blockSize*gridDim.x;
197  T result = 0;
198 
199  if(i < n)
200  {
201  result = mapFunc.CU(input[i]);
202  i += gridSize;
203  }
204 
205  while(i < n)
206  {
207  T tempMap;
208  tempMap = mapFunc.CU(input[i]);
209  result = reduceFunc.CU(result, tempMap);
210  i += gridSize;
211  }
212 
213  sdata[tid] = result;
214 
215  __syncthreads();
216 
217  if(blockSize >= 512)
218  {
219  if (tid < 256 && tid + 256 < n)
220  {
221  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]);
222  }
223  __syncthreads();
224  }
225  if(blockSize >= 256)
226  {
227  if (tid < 128 && tid + 128 < n)
228  {
229  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]);
230  }
231  __syncthreads();
232  }
233  if(blockSize >= 128)
234  {
235  if (tid < 64 && tid + 64 < n)
236  {
237  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]);
238  }
239  __syncthreads();
240  }
241  if(blockSize >= 64)
242  {
243  if (tid < 32 && tid + 32 < n)
244  {
245  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]);
246  }
247  __syncthreads();
248  }
249  if(blockSize >= 32)
250  {
251  if (tid < 16 && tid + 16 < n)
252  {
253  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]);
254  }
255  __syncthreads();
256  }
257  if(blockSize >= 16)
258  {
259  if (tid < 8 && tid + 8 < n)
260  {
261  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]);
262  }
263  __syncthreads();
264  }
265  if(blockSize >= 8)
266  {
267  if (tid < 4 && tid + 4 < n)
268  {
269  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]);
270  }
271  __syncthreads();
272  }
273  if(blockSize >= 4)
274  {
275  if (tid < 2 && tid + 2 < n)
276  {
277  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]);
278  }
279  __syncthreads();
280  }
281  if(blockSize >= 2)
282  {
283  if (tid < 1 && tid + 1 < n)
284  {
285  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]);
286  }
287  __syncthreads();
288  }
289 
290  if(tid == 0)
291  {
292  output[blockIdx.x] = sdata[tid];
293  }
294 }
295 
301 template <typename T, typename BinaryFunc1, typename BinaryFunc2>
302 __global__ void MapReduceKernel2_CU(BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T* input1, T* input2, T* output, size_t n)
303 {
304  extern __shared__ char _sdata[];
305  T* sdata = reinterpret_cast<T*>(_sdata);
306 
307  size_t blockSize = blockDim.x;
308  size_t tid = threadIdx.x;
309  size_t i = blockIdx.x * blockSize + tid;
310  size_t gridSize = blockSize*gridDim.x;
311  T result = 0;
312 
313  if(i < n)
314  {
315  result = mapFunc.CU(input1[i], input2[i]);
316  i += gridSize;
317  }
318 
319  while(i < n)
320  {
321  T tempMap;
322  tempMap = mapFunc.CU(input1[i], input2[i]);
323  result = reduceFunc.CU(result, tempMap);
324  i += gridSize;
325  }
326 
327  sdata[tid] = result;
328 
329  __syncthreads();
330 
331  if(blockSize >= 512)
332  {
333  if (tid < 256 && tid + 256 < n)
334  {
335  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]);
336  }
337  __syncthreads();
338  }
339  if(blockSize >= 256)
340  {
341  if (tid < 128 && tid + 128 < n)
342  {
343  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]);
344  }
345  __syncthreads();
346  }
347  if(blockSize >= 128)
348  {
349  if (tid < 64 && tid + 64 < n)
350  {
351  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]);
352  }
353  __syncthreads();
354  }
355  if(blockSize >= 64)
356  {
357  if (tid < 32 && tid + 32 < n)
358  {
359  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]);
360  }
361  __syncthreads();
362  }
363  if(blockSize >= 32)
364  {
365  if (tid < 16 && tid + 16 < n)
366  {
367  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]);
368  }
369  __syncthreads();
370  }
371  if(blockSize >= 16)
372  {
373  if (tid < 8 && tid + 8 < n)
374  {
375  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]);
376  }
377  __syncthreads();
378  }
379  if(blockSize >= 8)
380  {
381  if (tid < 4 && tid + 4 < n)
382  {
383  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]);
384  }
385  __syncthreads();
386  }
387  if(blockSize >= 4)
388  {
389  if (tid < 2 && tid + 2 < n)
390  {
391  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]);
392  }
393  __syncthreads();
394  }
395  if(blockSize >= 2)
396  {
397  if (tid < 1 && tid + 1 < n)
398  {
399  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]);
400  }
401  __syncthreads();
402  }
403 
404  if(tid == 0)
405  {
406  output[blockIdx.x] = sdata[tid];
407  }
408 }
409 
415 template <typename T, typename TrinaryFunc, typename BinaryFunc>
416 __global__ void MapReduceKernel3_CU(TrinaryFunc mapFunc, BinaryFunc reduceFunc, T* input1, T* input2, T* input3, T* output, size_t n)
417 {
418  extern __shared__ char _sdata[];
419  T* sdata = reinterpret_cast<T*>(_sdata);
420 
421  size_t blockSize = blockDim.x;
422  size_t tid = threadIdx.x;
423  size_t i = blockIdx.x * blockSize + tid;
424  size_t gridSize = blockSize*gridDim.x;
425  T result = 0;
426 
427  if(i < n)
428  {
429  result = mapFunc.CU(input1[i], input2[i], input3[i]);
430  i += gridSize;
431  }
432 
433  while(i < n)
434  {
435  T tempMap;
436  tempMap = mapFunc.CU(input1[i], input2[i], input3[i]);
437  result = reduceFunc.CU(result, tempMap);
438  i += gridSize;
439  }
440 
441  sdata[tid] = result;
442 
443  __syncthreads();
444 
445  if(blockSize >= 512)
446  {
447  if (tid < 256 && tid + 256 < n)
448  {
449  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]);
450  }
451  __syncthreads();
452  }
453  if(blockSize >= 256)
454  {
455  if (tid < 128 && tid + 128 < n)
456  {
457  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]);
458  }
459  __syncthreads();
460  }
461  if(blockSize >= 128)
462  {
463  if (tid < 64 && tid + 64 < n)
464  {
465  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]);
466  }
467  __syncthreads();
468  }
469  if(blockSize >= 64)
470  {
471  if (tid < 32 && tid + 32 < n)
472  {
473  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]);
474  }
475  __syncthreads();
476  }
477  if(blockSize >= 32)
478  {
479  if (tid < 16 && tid + 16 < n)
480  {
481  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]);
482  }
483  __syncthreads();
484  }
485  if(blockSize >= 16)
486  {
487  if (tid < 8 && tid + 8 < n)
488  {
489  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]);
490  }
491  __syncthreads();
492  }
493  if(blockSize >= 8)
494  {
495  if (tid < 4 && tid + 4 < n)
496  {
497  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]);
498  }
499  __syncthreads();
500  }
501  if(blockSize >= 4)
502  {
503  if (tid < 2 && tid + 2 < n)
504  {
505  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]);
506  }
507  __syncthreads();
508  }
509  if(blockSize >= 2)
510  {
511  if (tid < 1 && tid + 1 < n)
512  {
513  sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]);
514  }
515  __syncthreads();
516  }
517 
518  if(tid == 0)
519  {
520  output[blockIdx.x] = sdata[tid];
521  }
522 }
523 
528 }
529 
530 #endif
531 
532 #endif
533 
534 
__global__ void MapReduceKernel1_CU(UnaryFunc mapFunc, BinaryFunc reduceFunc, T *input, T *output, size_t n)
Definition: mapreduce_kernels.h:188
static std::string TrinaryMapReduceKernel_CL("__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, size_t n, __local TYPE* sdata, CONST_TYPE const1)\n""{\n"" size_t blockSize = get_local_size(0);\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"" size_t gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
static std::string UnaryMapReduceKernel_CL("__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata, CONST_TYPE const1)\n""{\n"" size_t blockSize = get_local_size(0);\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"" size_t gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
__global__ void MapReduceKernel3_CU(TrinaryFunc mapFunc, BinaryFunc reduceFunc, T *input1, T *input2, T *input3, T *output, size_t n)
Definition: mapreduce_kernels.h:416
static std::string BinaryMapReduceKernel_CL("__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, size_t n, __local TYPE* sdata, CONST_TYPE const1)\n""{\n"" size_t blockSize = get_local_size(0);\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"" size_t gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
__global__ void MapReduceKernel2_CU(BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T *input1, T *input2, T *output, size_t n)
Definition: mapreduce_kernels.h:302