SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
scan_kernels.h
Go to the documentation of this file.
1 
5 #ifndef SCAN_KERNELS_H
6 #define SCAN_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 #include <string>
11 
12 namespace skepu
13 {
14 
32 static std::string ScanKernel_CL(
33  "__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, size_t n, size_t numElements, __local TYPE* sdata)\n"
34  "{\n"
35  " size_t threadIdx = get_local_id(0);\n"
36  " size_t blockDim = get_local_size(0);\n"
37  " size_t blockIdx = get_group_id(0);\n"
38  " size_t gridDim = get_num_groups(0);\n"
39  " size_t thid = threadIdx;\n"
40  " unsigned int pout = 0;\n"
41  " unsigned int pin = 1;\n"
42  " size_t mem = get_global_id(0);\n"
43  " size_t blockNr = blockIdx;\n"
44  " size_t gridSize = blockDim*gridDim;\n"
45  " size_t numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"
46  " size_t offset;\n"
47  " while(blockNr < numBlocks)\n"
48  " {\n"
49  " sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"
50  " barrier(CLK_LOCAL_MEM_FENCE);\n"
51  " for(offset = 1; offset < n; offset *=2)\n"
52  " {\n"
53  " pout = 1-pout;\n"
54  " pin = 1-pout;\n"
55  " if(thid >= offset)\n"
56  " sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"
57  " else\n"
58  " sdata[pout*n+thid] = sdata[pin*n+thid];\n"
59  " barrier(CLK_LOCAL_MEM_FENCE);\n"
60  " }\n"
61  " if(thid == blockDim - 1)\n"
62  " blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"
63  " if(mem < numElements)\n"
64  " output[mem] = sdata[pout*n+thid];\n"
65  " mem += gridSize;\n"
66  " blockNr += gridDim;\n"
67  " barrier(CLK_LOCAL_MEM_FENCE);\n"
68  " }\n"
69  "}\n"
70 );
71 
77 static std::string ScanUpdate_CL(
78  "__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, size_t n, __global TYPE* ret, __local TYPE* sdata)\n"
79  "{\n"
80  " __local TYPE offset;\n"
81  " __local TYPE inc_offset;\n"
82  " size_t threadIdx = get_local_id(0);\n"
83  " size_t blockDim = get_local_size(0);\n"
84  " size_t blockIdx = get_group_id(0);\n"
85  " size_t gridDim = get_num_groups(0);\n"
86  " size_t thid = threadIdx;\n"
87  " size_t blockNr = blockIdx;\n"
88  " size_t gridSize = blockDim*gridDim;\n"
89  " size_t mem = get_global_id(0);\n"
90  " size_t numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"
91  " while(blockNr < numBlocks)\n"
92  " {\n"
93  " if(thid == 0)\n"
94  " {\n"
95  " if(isInclusive == 0)\n"
96  " {\n"
97  " offset = init;\n"
98  " if(blockNr > 0)\n"
99  " {\n"
100  " offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"
101  " inc_offset = sums[blockNr-1];\n"
102  " }\n"
103  " }\n"
104  " else\n"
105  " {\n"
106  " if(blockNr > 0)\n"
107  " offset = sums[blockNr-1];\n"
108  " }\n"
109  " }\n"
110  " barrier(CLK_LOCAL_MEM_FENCE);\n"
111  " if(isInclusive == 1)\n"
112  " {\n"
113  " if(blockNr > 0)\n"
114  " sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"
115  " else\n"
116  " sdata[thid] = (mem < n) ? data[mem] : 0;\n"
117  " if(mem == n-1)\n"
118  " *ret = sdata[thid];\n"
119  " }\n"
120  " else\n"
121  " {\n"
122  " if(mem == n-1)\n"
123  " *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"
124  " if(thid == 0)\n"
125  " sdata[thid] = offset;\n"
126  " else\n"
127  " sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"
128  " }\n"
129  " barrier(CLK_LOCAL_MEM_FENCE);\n"
130  " if(mem < n)\n"
131  " data[mem] = sdata[thid];\n"
132  " mem += gridSize;\n"
133  " blockNr += gridDim;\n"
134  " barrier(CLK_LOCAL_MEM_FENCE);\n"
135  " }\n"
136  "}\n"
137 );
138 
143 static std::string ScanAdd_CL(
144  "__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, size_t n)\n"
145  "{\n"
146  " size_t i = get_global_id(0);\n"
147  " size_t gridSize = get_local_size(0)*get_num_groups(0);\n"
148  " while(i < n)\n"
149  " {\n"
150  " data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"
151  " i += gridSize;\n"
152  " }\n"
153  "}\n"
154 );
155 
160 }
161 
162 #endif
163 
164 #ifdef SKEPU_CUDA
165 
166 namespace skepu
167 {
168 
186 template<typename T, typename BinaryFunc>
187 __global__ void ScanKernel_CU(BinaryFunc scanFunc, T* input, T* output, T* blockSums, size_t n, size_t numElements)
188 {
189  extern __shared__ char _sdata[];
190  T* sdata = reinterpret_cast<T*>(_sdata);
191 
192  size_t thid = threadIdx.x;
193  unsigned int pout = 0;
194  unsigned int pin = 1;
195  size_t mem = blockIdx.x * blockDim.x + threadIdx.x;
196  size_t blockNr = blockIdx.x;
197  size_t gridSize = blockDim.x*gridDim.x;
198  size_t numBlocks = numElements/(blockDim.x) + (numElements%(blockDim.x) == 0 ? 0:1);
199  size_t offset;
200 
201  while(blockNr < numBlocks)
202  {
203  sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;
204 
205  __syncthreads();
206 
207  for(offset = 1; offset < n; offset *=2)
208  {
209  pout = 1-pout;
210  pin = 1-pout;
211  if(thid >= offset)
212  sdata[pout*n+thid] = scanFunc.CU(sdata[pin*n+thid], sdata[pin*n+thid-offset]);
213  else
214  sdata[pout*n+thid] = sdata[pin*n+thid];
215 
216  __syncthreads();
217  }
218  if(thid == blockDim.x - 1)
219  blockSums[blockNr] = sdata[pout*n+blockDim.x-1];
220 
221  if(mem < numElements)
222  output[mem] = sdata[pout*n+thid];
223 
224  mem += gridSize;
225  blockNr += gridDim.x;
226 
227  __syncthreads();
228  }
229 }
230 
236 template <typename T, typename BinaryFunc>
237 __global__ void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, size_t n, T *ret)
238 {
239  extern __shared__ char _sdata[];
240  T* sdata = reinterpret_cast<T*>(_sdata);
241 
242  __shared__ T offset;
243  __shared__ T inc_offset;
244 
245  size_t thid = threadIdx.x;
246  size_t blockNr = blockIdx.x;
247  size_t gridSize = blockDim.x*gridDim.x;
248  size_t mem = blockIdx.x * blockDim.x + threadIdx.x;
249  size_t numBlocks = n/(blockDim.x) + (n%(blockDim.x) == 0 ? 0:1);
250 
251  while(blockNr < numBlocks)
252  {
253  if(thid == 0)
254  {
255  if(isInclusive == 0)
256  {
257  offset = init;
258  if(blockNr > 0)
259  {
260  offset = scanFunc.CU(offset, sums[blockNr-1]);
261  inc_offset = sums[blockNr-1];
262  }
263  }
264  else
265  {
266  if(blockNr > 0)
267  offset = sums[blockNr-1];
268  }
269  }
270 
271  __syncthreads();
272 
273  if(isInclusive == 1)
274  {
275  if(blockNr > 0)
276  sdata[thid] = (mem < n) ? scanFunc.CU(offset, data[mem]) : 0;
277  else
278  sdata[thid] = (mem < n) ? data[mem] : 0;
279  if(mem == n-1)
280  *ret = sdata[thid];
281  }
282  else
283  {
284  if(mem == n-1)
285  *ret = scanFunc.CU(inc_offset, data[mem]);
286  if(thid == 0)
287  sdata[thid] = offset;
288  else
289  sdata[thid] = (mem-1 < n) ? scanFunc.CU(offset, data[mem-1]) : 0;
290  }
291 
292  __syncthreads();
293 
294  if(mem < n)
295  data[mem] = sdata[thid];
296 
297  mem += gridSize;
298  blockNr += gridDim.x;
299 
300  __syncthreads();
301  }
302 }
303 
308 template <typename T, typename BinaryFunc>
309 __global__ void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum, size_t n)
310 {
311  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
312  size_t gridSize = blockDim.x*gridDim.x;
313 
314  while(i < n)
315  {
316  data[i] = scanFunc.CU(data[i], sum);
317  i += gridSize;
318  }
319 }
320 
325 }
326 
327 #endif
328 
329 #endif
330 
331 
static std::string ScanAdd_CL("__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, size_t n)\n""{\n"" size_t i = get_global_id(0);\n"" size_t gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"" i += gridSize;\n"" }\n""}\n")
static std::string ScanUpdate_CL("__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, size_t n, __global TYPE* ret, __local TYPE* sdata)\n""{\n"" __local TYPE offset;\n"" __local TYPE inc_offset;\n"" size_t threadIdx = get_local_id(0);\n"" size_t blockDim = get_local_size(0);\n"" size_t blockIdx = get_group_id(0);\n"" size_t gridDim = get_num_groups(0);\n"" size_t thid = threadIdx;\n"" size_t blockNr = blockIdx;\n"" size_t gridSize = blockDim*gridDim;\n"" size_t mem = get_global_id(0);\n"" size_t numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"" while(blockNr < numBlocks)\n"" {\n"" if(thid == 0)\n"" {\n"" if(isInclusive == 0)\n"" {\n"" offset = init;\n"" if(blockNr > 0)\n"" {\n"" offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"" inc_offset = sums[blockNr-1];\n"" }\n"" }\n"" else\n"" {\n"" if(blockNr > 0)\n"" offset = sums[blockNr-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(isInclusive == 1)\n"" {\n"" if(blockNr > 0)\n"" sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"" else\n"" sdata[thid] = (mem < n) ? data[mem] : 0;\n"" if(mem == n-1)\n"" *ret = sdata[thid];\n"" }\n"" else\n"" {\n"" if(mem == n-1)\n"" *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"" if(thid == 0)\n"" sdata[thid] = offset;\n"" else\n"" sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(mem < n)\n"" data[mem] = sdata[thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
__global__ void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum, size_t n)
Definition: scan_kernels.h:309
__global__ void ScanKernel_CU(BinaryFunc scanFunc, T *input, T *output, T *blockSums, size_t n, size_t numElements)
Definition: scan_kernels.h:187
__global__ void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, size_t n, T *ret)
Definition: scan_kernels.h:237
static std::string ScanKernel_CL("__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, size_t n, size_t numElements, __local TYPE* sdata)\n""{\n"" size_t threadIdx = get_local_id(0);\n"" size_t blockDim = get_local_size(0);\n"" size_t blockIdx = get_group_id(0);\n"" size_t gridDim = get_num_groups(0);\n"" size_t thid = threadIdx;\n"" unsigned int pout = 0;\n"" unsigned int pin = 1;\n"" size_t mem = get_global_id(0);\n"" size_t blockNr = blockIdx;\n"" size_t gridSize = blockDim*gridDim;\n"" size_t numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"" size_t offset;\n"" while(blockNr < numBlocks)\n"" {\n"" sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" for(offset = 1; offset < n; offset *=2)\n"" {\n"" pout = 1-pout;\n"" pin = 1-pout;\n"" if(thid >= offset)\n"" sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"" else\n"" sdata[pout*n+thid] = sdata[pin*n+thid];\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n"" if(thid == blockDim - 1)\n"" blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"" if(mem < numElements)\n"" output[mem] = sdata[pout*n+thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")