SkePU 1.2
reduce_kernels.h
#ifndef REDUCE_KERNELS_H
#define REDUCE_KERNELS_H

#ifdef SKEPU_OPENCL

#include <string>

namespace skepu
{

static std::string ReduceKernel_CL(
    "__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t n, __local TYPE* sdata)\n"
    "{\n"
    "    size_t blockSize = get_local_size(0);\n"
    "    size_t tid = get_local_id(0);\n"
    "    size_t i = get_group_id(0)*blockSize + get_local_id(0);\n"
    "    size_t gridSize = blockSize*get_num_groups(0);\n"
    "    TYPE result = 0;\n"
    "    if(i < n)\n"
    "    {\n"
    "        result = input[i];\n"
    "        i += gridSize;\n"
    "    }\n"
    "    while(i < n)\n"
    "    {\n"
    "        result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"
    "        i += gridSize;\n"
    "    }\n"
    "    sdata[tid] = result;\n"
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
    "    if(tid == 0)\n"
    "    {\n"
    "        output[get_group_id(0)] = sdata[tid];\n"
    "    }\n"
    "}\n"
);
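
// The TYPE, KERNELNAME and FUNCTIONNAME tokens in the string above are placeholders
// that are presumably substituted with the concrete element type, a unique kernel name
// and the user function's name when the skeleton is instantiated, before the source is
// handed to the OpenCL compiler. A minimal sketch of that substitution step, assuming a
// hypothetical replaceAll helper and example values (not the actual SkePU code path):
//
//     std::string src = ReduceKernel_CL;
//     auto replaceAll = [](std::string &s, const std::string &from, const std::string &to)
//     {
//         for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size())
//             s.replace(pos, from.size(), to);
//     };
//     replaceAll(src, "TYPE", "float");
//     replaceAll(src, "KERNELNAME", "plus_float");
//     replaceAll(src, "FUNCTIONNAME", "plus");
//     // src is then compiled with clCreateProgramWithSource / clBuildProgram.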


/*!
 * A helper that returns the nearest power of 2 greater than or equal to \p x.
 */
size_t nextPow2( size_t x )
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
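
// For example, nextPow2(33) decrements to 32, smears the high bit down to produce 63,
// and returns 64, while nextPow2(32) decrements to 31 first and therefore returns 32
// itself. Note that the shifts only cover 32 bits, so inputs above 2^32 are not handled.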


/*!
 * Computes the number of blocks and threads to use for a reduction of \p n elements,
 * capped by \p maxBlocks and \p maxThreads.
 */
void getNumBlocksAndThreads(size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads)
{
    threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
    blocks = (n + (threads * 2 - 1)) / (threads * 2);

    blocks = MIN(maxBlocks, blocks);
}
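
// Worked example: with n = 1000, maxThreads = 256 and maxBlocks = 64, each thread
// handles two elements per pass, so threads = 256 and blocks = (1000 + 511) / 512 = 2;
// with n = 100 the thread count drops to nextPow2((100 + 1) / 2) = 64 and a single
// block is used.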

}

#endif

#ifdef SKEPU_CUDA


// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory
{
    __device__ inline operator T*()
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }

    __device__ inline operator const T*() const
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }
};

// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double>
{
    __device__ inline operator double*()
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }

    __device__ inline operator const double*() const
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }
};
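
// A kernel obtains its dynamically sized shared-memory buffer by converting this
// helper to a pointer, e.g. "T* sdata = SharedMemory<T>();" as in the reduction
// kernels below; the buffer size itself is the third argument of the kernel launch
// configuration (smemSize in the launch helpers further down).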

namespace skepu
{

template<typename T, typename BinaryFunc>
__global__ void ReduceKernel_CU_oldAndIncorrect(BinaryFunc reduceFunc, T* input, T* output, size_t n)
{
    T* sdata = SharedMemory<T>();

    size_t blockSize = blockDim.x;
    size_t tid = threadIdx.x;
    size_t i = blockIdx.x * blockSize + tid;
    size_t gridSize = blockSize*gridDim.x;
    T result = 0;

    if(i < n)
    {
        result = input[i];
        i += gridSize;
    }

    while(i < n)
    {
        result = reduceFunc.CU(result, input[i]);
        i += gridSize;
    }

    sdata[tid] = result;

    __syncthreads();

    if(blockSize >= 512)
    {
        if (tid < 256 && tid + 256 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]);
        }
        __syncthreads();
    }
    if(blockSize >= 256)
    {
        if (tid < 128 && tid + 128 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]);
        }
        __syncthreads();
    }
    if(blockSize >= 128)
    {
        if (tid < 64 && tid + 64 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]);
        }
        __syncthreads();
    }
    if(blockSize >= 64)
    {
        if (tid < 32 && tid + 32 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]);
        }
        __syncthreads();
    }
    if(blockSize >= 32)
    {
        if (tid < 16 && tid + 16 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]);
        }
        __syncthreads();
    }
    if(blockSize >= 16)
    {
        if (tid < 8 && tid + 8 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]);
        }
        __syncthreads();
    }
    if(blockSize >= 8)
    {
        if (tid < 4 && tid + 4 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]);
        }
        __syncthreads();
    }
    if(blockSize >= 4)
    {
        if (tid < 2 && tid + 2 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]);
        }
        __syncthreads();
    }
    if(blockSize >= 2)
    {
        if (tid < 1 && tid + 1 < n)
        {
            sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]);
        }
        __syncthreads();
    }

    if(tid == 0)
    {
        output[blockIdx.x] = sdata[tid];
    }
}


template<typename T, typename BinaryFunc, size_t blockSize, bool nIsPow2>
__global__ void ReduceKernel_CU(BinaryFunc reduceFunc, T *input, T *output, size_t n)
{
    T *sdata = SharedMemory<T>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    size_t tid = threadIdx.x;
    size_t i = blockIdx.x*blockSize*2 + threadIdx.x;
    size_t gridSize = blockSize*2*gridDim.x;

    T result = 0;

    if(i < n)
    {
        result = input[i];
        // ensure we don't read out of bounds -- this is optimized away for power-of-2 sized arrays.
        // The nIsPow2 optimization is not valid when this kernel is also used for sparse matrices,
        // where the exact number of elements is not known when computing the thread and block sizes;
        // the nIsPow2 assumption can then become invalid and cause severe problems, so in that case
        // we always pass false.
        if (nIsPow2 || i + blockSize < n)
            result = reduceFunc.CU(result, input[i+blockSize]);
        i += gridSize;
    }

    // we reduce multiple elements per thread. The number is determined by the
    // number of active thread blocks (via gridDim). More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while(i < n)
    {
        result = reduceFunc.CU(result, input[i]);
        // ensure we don't read out of bounds -- this is optimized away for power-of-2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            result = reduceFunc.CU(result, input[i+blockSize]);
        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = result;

    __syncthreads();


    // do reduction in shared mem
    if (blockSize >= 512)
    {
        if (tid < 256)
        {
            sdata[tid] = result = reduceFunc.CU(result, sdata[tid + 256]);
        }
        __syncthreads();
    }
    if (blockSize >= 256)
    {
        if (tid < 128)
        {
            sdata[tid] = result = reduceFunc.CU(result, sdata[tid + 128]);
        }
        __syncthreads();
    }
    if (blockSize >= 128)
    {
        if (tid < 64)
        {
            sdata[tid] = result = reduceFunc.CU(result, sdata[tid + 64]);
        }
        __syncthreads();
    }

    if (tid < 32)
    {
        // now that we are using warp-synchronous programming (below)
        // we need to declare our shared memory volatile so that the compiler
        // doesn't reorder stores to it and induce incorrect behavior.
        volatile T* smem = sdata;
        if (blockSize >= 64)
        {
            smem[tid] = result = reduceFunc.CU(result, smem[tid + 32]);
        }
        if (blockSize >= 32)
        {
            smem[tid] = result = reduceFunc.CU(result, smem[tid + 16]);
        }
        if (blockSize >= 16)
        {
            smem[tid] = result = reduceFunc.CU(result, smem[tid + 8]);
        }
        if (blockSize >= 8)
        {
            smem[tid] = result = reduceFunc.CU(result, smem[tid + 4]);
        }
        if (blockSize >= 4)
        {
            smem[tid] = result = reduceFunc.CU(result, smem[tid + 2]);
        }
        if (blockSize >= 2)
        {
            smem[tid] = result = reduceFunc.CU(result, smem[tid + 1]);
        }
    }

    // write result for this block to global mem
    if (tid == 0)
        output[blockIdx.x] = sdata[0];
}
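
// Note that the blockSize template parameter must equal the actual block size
// (blockDim.x) used at launch, since the unrolled shared-memory reduction above
// depends on it; because each thread starts by combining two elements, one grid
// pass covers 2*blockSize*gridDim.x input elements.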




// ********************************************************************************************************************
// --------------------------------------------------------------------------------------------------------------------
// ********************************************************************************************************************
// --------------------------------------------------------------------------------------------------------------------


/*!
 * A small helper to determine whether a number is a power of 2.
 */
bool isPow2(size_t x)
{
    return ((x&(x-1))==0);
}

template <typename ReduceFunc, typename T>
void CallReduceKernel(ReduceFunc *reduceFunc, size_t size, size_t numThreads, size_t numBlocks, T *d_idata, T *d_odata, bool enableIsPow2=true)
{
    dim3 dimBlock(numThreads, 1, 1);
    dim3 dimGrid(numBlocks, 1, 1);

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    size_t smemSize = (numThreads <= 32) ? 2 * numThreads * sizeof(T) : numThreads * sizeof(T);

    // choose which of the optimized versions of reduction to launch
    if (isPow2(size) && enableIsPow2)
    {
        switch (numThreads)
        {
        case 512:
            ReduceKernel_CU<T, ReduceFunc, 512, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 256:
            ReduceKernel_CU<T, ReduceFunc, 256, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 128:
            ReduceKernel_CU<T, ReduceFunc, 128, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 64:
            ReduceKernel_CU<T, ReduceFunc, 64, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 32:
            ReduceKernel_CU<T, ReduceFunc, 32, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 16:
            ReduceKernel_CU<T, ReduceFunc, 16, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 8:
            ReduceKernel_CU<T, ReduceFunc, 8, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 4:
            ReduceKernel_CU<T, ReduceFunc, 4, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 2:
            ReduceKernel_CU<T, ReduceFunc, 2, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 1:
            ReduceKernel_CU<T, ReduceFunc, 1, true><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        }
    }
    else
    {
        switch (numThreads)
        {
        case 512:
            ReduceKernel_CU<T, ReduceFunc, 512, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 256:
            ReduceKernel_CU<T, ReduceFunc, 256, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 128:
            ReduceKernel_CU<T, ReduceFunc, 128, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 64:
            ReduceKernel_CU<T, ReduceFunc, 64, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 32:
            ReduceKernel_CU<T, ReduceFunc, 32, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 16:
            ReduceKernel_CU<T, ReduceFunc, 16, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 8:
            ReduceKernel_CU<T, ReduceFunc, 8, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 4:
            ReduceKernel_CU<T, ReduceFunc, 4, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 2:
            ReduceKernel_CU<T, ReduceFunc, 2, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 1:
            ReduceKernel_CU<T, ReduceFunc, 1, false><<< dimGrid, dimBlock, smemSize >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        }
    }
}
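
// The switch above only instantiates power-of-2 block sizes from 1 to 512; any other
// numThreads value falls through and launches nothing. Callers are expected to pick the
// thread count via getNumBlocksAndThreads, which, provided maxThreads itself is a power
// of 2 no larger than 512, always yields a matching value.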




#ifdef USE_PINNED_MEMORY

template <typename ReduceFunc, typename T>
void CallReduceKernel_WithStream(ReduceFunc *reduceFunc, size_t size, size_t numThreads, size_t numBlocks, T *d_idata, T *d_odata, cudaStream_t &stream, bool enableIsPow2=true)
{
    dim3 dimBlock(numThreads, 1, 1);
    dim3 dimGrid(numBlocks, 1, 1);

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    size_t smemSize = (numThreads <= 32) ? 2 * numThreads * sizeof(T) : numThreads * sizeof(T);

    // choose which of the optimized versions of reduction to launch
    if (isPow2(size) && enableIsPow2)
    {
        switch (numThreads)
        {
        case 512:
            ReduceKernel_CU<T, ReduceFunc, 512, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 256:
            ReduceKernel_CU<T, ReduceFunc, 256, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 128:
            ReduceKernel_CU<T, ReduceFunc, 128, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 64:
            ReduceKernel_CU<T, ReduceFunc, 64, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 32:
            ReduceKernel_CU<T, ReduceFunc, 32, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 16:
            ReduceKernel_CU<T, ReduceFunc, 16, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 8:
            ReduceKernel_CU<T, ReduceFunc, 8, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 4:
            ReduceKernel_CU<T, ReduceFunc, 4, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 2:
            ReduceKernel_CU<T, ReduceFunc, 2, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 1:
            ReduceKernel_CU<T, ReduceFunc, 1, true><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        }
    }
    else
    {
        switch (numThreads)
        {
        case 512:
            ReduceKernel_CU<T, ReduceFunc, 512, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 256:
            ReduceKernel_CU<T, ReduceFunc, 256, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 128:
            ReduceKernel_CU<T, ReduceFunc, 128, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 64:
            ReduceKernel_CU<T, ReduceFunc, 64, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 32:
            ReduceKernel_CU<T, ReduceFunc, 32, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 16:
            ReduceKernel_CU<T, ReduceFunc, 16, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 8:
            ReduceKernel_CU<T, ReduceFunc, 8, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 4:
            ReduceKernel_CU<T, ReduceFunc, 4, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 2:
            ReduceKernel_CU<T, ReduceFunc, 2, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        case 1:
            ReduceKernel_CU<T, ReduceFunc, 1, false><<< dimGrid, dimBlock, smemSize, stream >>>(*reduceFunc, d_idata, d_odata, size);
            break;
        }
    }
}
#endif




/*!
 * A helper that returns the nearest power of 2 greater than or equal to \p x.
 */
size_t nextPow2( size_t x )
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}


/*!
 * Computes the number of blocks and threads to use for a reduction of \p n elements,
 * capped by \p maxBlocks and \p maxThreads.
 */
void getNumBlocksAndThreads(size_t n, size_t maxBlocks, size_t maxThreads, size_t &blocks, size_t &threads)
{
    threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
    blocks = (n + (threads * 2 - 1)) / (threads * 2);

    blocks = MIN(maxBlocks, blocks);
}




#ifdef USE_PINNED_MEMORY

template <typename ReduceFunc, typename T>
void ExecuteReduceOnADevice(ReduceFunc *reduceFunc, size_t n, size_t numThreads, size_t numBlocks, size_t maxThreads, size_t maxBlocks, T* d_idata, T* d_odata, unsigned int deviceID, cudaStream_t &stream, bool enableIsPow2=true)
#else

template <typename ReduceFunc, typename T>
void ExecuteReduceOnADevice(ReduceFunc *reduceFunc, size_t n, size_t numThreads, size_t numBlocks, size_t maxThreads, size_t maxBlocks, T* d_idata, T* d_odata, unsigned int deviceID, bool enableIsPow2=true)
#endif
{
    // execute the kernel
#ifdef USE_PINNED_MEMORY
    CallReduceKernel_WithStream<ReduceFunc, T>(reduceFunc, n, numThreads, numBlocks, d_idata, d_odata, stream, enableIsPow2);
#else
    CallReduceKernel<ReduceFunc, T>(reduceFunc, n, numThreads, numBlocks, d_idata, d_odata, enableIsPow2);
#endif

    // sum partial block sums on GPU
    size_t s=numBlocks;

    while(s > 1)
    {
        size_t threads = 0, blocks = 0;
        getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);

#ifdef USE_PINNED_MEMORY
        CallReduceKernel_WithStream<ReduceFunc, T>(reduceFunc, s, threads, blocks, d_odata, d_odata, stream, enableIsPow2);
#else
        CallReduceKernel<ReduceFunc, T>(reduceFunc, s, threads, blocks, d_odata, d_odata, enableIsPow2);
#endif

        s = (s + (threads*2-1)) / (threads*2);
    }
}
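
// A rough host-side usage sketch (hypothetical names -- in SkePU the call is made
// internally by the Reduce skeleton). Assumes d_input and d_partials are device
// buffers of at least n and numBlocks elements, and that plusFunctor exposes a
// CU(a, b) member performing the binary reduction:
//
//     size_t n = 1 << 20, maxThreads = 256, maxBlocks = 64;
//     size_t numBlocks = 0, numThreads = 0;
//     getNumBlocksAndThreads(n, maxBlocks, maxThreads, numBlocks, numThreads);
//     ExecuteReduceOnADevice(&plusFunctor, n, numThreads, numBlocks,
//                            maxThreads, maxBlocks, d_input, d_partials, 0);
//     // after the call, d_partials[0] holds this device's reduction result;
//     // copy it back with cudaMemcpy to finish on the host.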



}

#endif



#endif