SkePU(integratedwithStarPU)  0.8.1
 All Classes Namespaces Files Functions Enumerations Friends Macros Groups Pages
reduce_kernels.h
Go to the documentation of this file.
1 
5 #ifndef REDUCE_KERNELS_H
6 #define REDUCE_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 #include <string>
11 
12 namespace skepu
13 {
14 
31 static std::string ReduceKernel_CL(
32 "__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata)\n"
33 "{\n"
34 " unsigned int blockSize = get_local_size(0);\n"
35 " unsigned int tid = get_local_id(0);\n"
36 " unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
37 " unsigned int gridSize = blockSize*get_num_groups(0);\n"
38 " TYPE result = 0;\n"
39 " if(i < n)\n"
40 " {\n"
41 " result = input[i];\n"
42 " i += gridSize;\n"
43 " }\n"
44 " while(i < n)\n"
45 " {\n"
46 " result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"
47 " i += gridSize;\n"
48 " }\n"
49 " sdata[tid] = result;\n"
50 " barrier(CLK_LOCAL_MEM_FENCE);\n"
51 " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
52 " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
53 " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
54 " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
55 " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
56 " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
57 " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
58 " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
59 " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
60 " if(tid == 0)\n"
61 " {\n"
62 " output[get_group_id(0)] = sdata[tid];\n"
63 " }\n"
64 "}\n"
65 );
66 
71 }
72 
73 #endif
74 
75 #ifdef SKEPU_CUDA
76 
77 namespace skepu
78 {
79 
96 template<typename T, typename BinaryFunc>
97 __global__ void ReduceKernel_CU(BinaryFunc reduceFunc, T* input, T* output, unsigned int n)
98 {
99  //A bit ugly
100  extern __shared__ char _sdata[];
101  T* sdata = reinterpret_cast<T*>(_sdata);
102 
103  unsigned int blockSize = blockDim.x;
104  unsigned int tid = threadIdx.x;
105  unsigned int i = blockIdx.x * blockSize + tid;
106  unsigned int gridSize = blockSize*gridDim.x;
107  T result = 0;
108 
109  if(i < n)
110  {
111  result = input[i];
112  i += gridSize;
113  }
114 
115  while(i < n)
116  {
117  result = reduceFunc.CU(result, input[i]);
118  i += gridSize;
119  }
120 
121  sdata[tid] = result;
122 
123  __syncthreads();
124 
125  if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
126  if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
127  if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
128  if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
129  if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
130  if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
131  if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
132  if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
133  if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
134 
135  if(tid == 0)
136  {
137  output[blockIdx.x] = sdata[tid];
138  }
139 }
140 
145 }
146 
147 #endif
148 
149 #endif
150 
__global__ void ReduceKernel_CU(BinaryFunc reduceFunc, T *input, T *output, unsigned int n)
Definition: reduce_kernels.h:97
static std::string ReduceKernel_CL("__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = input[i];\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")