SkePU(integratedwithStarPU)  0.8.1
 All Classes Namespaces Files Functions Enumerations Friends Macros Groups Pages
scan_kernels.h
Go to the documentation of this file.
1 
5 #ifndef SCAN_KERNELS_H
6 #define SCAN_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 #include <string>
11 
12 namespace skepu
13 {
14 
32 static std::string ScanKernel_CL(
33 "__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n"
34 "{\n"
35 " unsigned int threadIdx = get_local_id(0);\n"
36 " unsigned int blockDim = get_local_size(0);\n"
37 " unsigned int blockIdx = get_group_id(0);\n"
38 " unsigned int gridDim = get_num_groups(0);\n"
39 " int thid = threadIdx;\n"
40 " int pout = 0;\n"
41 " int pin = 1;\n"
42 " int mem = get_global_id(0);\n"
43 " int blockNr = blockIdx;\n"
44 " unsigned int gridSize = blockDim*gridDim;\n"
45 " unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"
46 " int offset;\n"
47 " while(blockNr < numBlocks)\n"
48 " {\n"
49 " sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"
50 " barrier(CLK_LOCAL_MEM_FENCE);\n"
51 " for(offset = 1; offset < n; offset *=2)\n"
52 " {\n"
53 " pout = 1-pout;\n"
54 " pin = 1-pout;\n"
55 " if(thid >= offset)\n"
56 " sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"
57 " else\n"
58 " sdata[pout*n+thid] = sdata[pin*n+thid];\n"
59 " barrier(CLK_LOCAL_MEM_FENCE);\n"
60 " }\n"
61 " if(thid == blockDim - 1)\n"
62 " blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"
63 " if(mem < numElements)\n"
64 " output[mem] = sdata[pout*n+thid];\n"
65 " mem += gridSize;\n"
66 " blockNr += gridDim;\n"
67 " barrier(CLK_LOCAL_MEM_FENCE);\n"
68 " }\n"
69 "}\n"
70 );
71 
77 static std::string ScanUpdate_CL(
78 "__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n"
79 "{\n"
80 " __local TYPE offset;\n"
81 " __local TYPE inc_offset;\n"
82 " unsigned int threadIdx = get_local_id(0);\n"
83 " unsigned int blockDim = get_local_size(0);\n"
84 " unsigned int blockIdx = get_group_id(0);\n"
85 " unsigned int gridDim = get_num_groups(0);\n"
86 " int thid = threadIdx;\n"
87 " int blockNr = blockIdx;\n"
88 " unsigned int gridSize = blockDim*gridDim;\n"
89 " int mem = get_global_id(0);\n"
90 " unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"
91 " while(blockNr < numBlocks)\n"
92 " {\n"
93 " if(thid == 0)\n"
94 " {\n"
95 " if(isInclusive == 0)\n"
96 " {\n"
97 " offset = init;\n"
98 " if(blockNr > 0)\n"
99 " {\n"
100 " offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"
101 " inc_offset = sums[blockNr-1];\n"
102 " }\n"
103 " }\n"
104 " else\n"
105 " {\n"
106 " if(blockNr > 0)\n"
107 " offset = sums[blockNr-1];\n"
108 " }\n"
109 " }\n"
110 " barrier(CLK_LOCAL_MEM_FENCE);\n"
111 " if(isInclusive == 1)\n"
112 " {\n"
113 " if(blockNr > 0)\n"
114 " sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"
115 " else\n"
116 " sdata[thid] = (mem < n) ? data[mem] : 0;\n"
117 " if(mem == n-1)\n"
118 " *ret = sdata[thid];\n"
119 " }\n"
120 " else\n"
121 " {\n"
122 " if(mem == n-1)\n"
123 " *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"
124 " if(thid == 0)\n"
125 " sdata[thid] = offset;\n"
126 " else\n"
127 " sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"
128 " }\n"
129 " barrier(CLK_LOCAL_MEM_FENCE);\n"
130 " if(mem < n)\n"
131 " data[mem] = sdata[thid];\n"
132 " mem += gridSize;\n"
133 " blockNr += gridDim;\n"
134 " barrier(CLK_LOCAL_MEM_FENCE);\n"
135 " }\n"
136 "}\n"
137 );
138 
143 static std::string ScanAdd_CL(
144 "__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n"
145 "{\n"
146 " int i = get_global_id(0);\n"
147 " unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"
148 " while(i < n)\n"
149 " {\n"
150 " data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"
151 " i += gridSize;\n"
152 " }\n"
153 "}\n"
154 );
155 
160 }
161 
162 #endif
163 
164 #ifdef SKEPU_CUDA
165 
166 namespace skepu
167 {
168 
186 template<typename T, typename BinaryFunc>
187 __global__ void ScanKernel_CU(BinaryFunc scanFunc, T* input, T* output, T* blockSums, unsigned int n, unsigned int numElements)
188 {
189  extern __shared__ char _sdata[];
190  T* sdata = reinterpret_cast<T*>(_sdata);
191 
192  int thid = threadIdx.x;
193  int pout = 0;
194  int pin = 1;
195  int mem = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
196  int blockNr = blockIdx.x;
197  unsigned int gridSize = blockDim.x*gridDim.x;
198  unsigned int numBlocks = numElements/(blockDim.x) + (numElements%(blockDim.x) == 0 ? 0:1);
199  int offset;
200 
201  while(blockNr < numBlocks)
202  {
203  sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;
204 
205  __syncthreads();
206 
207  for(offset = 1; offset < n; offset *=2)
208  {
209  pout = 1-pout;
210  pin = 1-pout;
211  if(thid >= offset)
212  sdata[pout*n+thid] = scanFunc.CU(sdata[pin*n+thid], sdata[pin*n+thid-offset]);
213  else
214  sdata[pout*n+thid] = sdata[pin*n+thid];
215 
216  __syncthreads();
217  }
218  if(thid == blockDim.x - 1)
219  blockSums[blockNr] = sdata[pout*n+blockDim.x-1];
220 
221  if(mem < numElements)
222  output[mem] = sdata[pout*n+thid];
223 
224  mem += gridSize;
225  blockNr += gridDim.x;
226 
227  __syncthreads();
228  }
229 }
230 
236 template <typename T, typename BinaryFunc>
237 __global__ void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, int n, T *ret)
238 {
239  extern __shared__ char _sdata[];
240  T* sdata = reinterpret_cast<T*>(_sdata);
241 
242  __shared__ T offset;
243  __shared__ T inc_offset;
244 
245  int thid = threadIdx.x;
246  int blockNr = blockIdx.x;
247  unsigned int gridSize = blockDim.x*gridDim.x;
248  int mem = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
249  unsigned int numBlocks = n/(blockDim.x) + (n%(blockDim.x) == 0 ? 0:1);
250 
251  while(blockNr < numBlocks)
252  {
253  if(thid == 0)
254  {
255  if(isInclusive == 0)
256  {
257  offset = init;
258  if(blockNr > 0)
259  {
260  offset = scanFunc.CU(offset, sums[blockNr-1]);
261  inc_offset = sums[blockNr-1];
262  }
263  }
264  else
265  {
266  if(blockNr > 0)
267  offset = sums[blockNr-1];
268  }
269  }
270 
271  __syncthreads();
272 
273  if(isInclusive == 1)
274  {
275  if(blockNr > 0)
276  sdata[thid] = (mem < n) ? scanFunc.CU(offset, data[mem]) : 0;
277  else
278  sdata[thid] = (mem < n) ? data[mem] : 0;
279  if(mem == n-1)
280  *ret = sdata[thid];
281  }
282  else
283  {
284  if(mem == n-1)
285  *ret = scanFunc.CU(inc_offset, data[mem]);
286  if(thid == 0)
287  sdata[thid] = offset;
288  else
289  sdata[thid] = (mem-1 < n) ? scanFunc.CU(offset, data[mem-1]) : 0;
290  }
291 
292  __syncthreads();
293 
294  if(mem < n)
295  data[mem] = sdata[thid];
296 
297  mem += gridSize;
298  blockNr += gridDim.x;
299 
300  __syncthreads();
301  }
302 }
303 
308 template <typename T, typename BinaryFunc>
309 __global__ void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum, int n)
310 {
311  int i = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
312  unsigned int gridSize = blockDim.x*gridDim.x;
313 
314  while(i < n)
315  {
316  data[i] = scanFunc.CU(data[i], sum);
317  i += gridSize;
318  }
319 }
320 
325 }
326 
327 #endif
328 
329 #endif
330 
__global__ void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum, int n)
Definition: scan_kernels.h:309
__global__ void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, int n, T *ret)
Definition: scan_kernels.h:237
static std::string ScanAdd_CL("__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"" i += gridSize;\n"" }\n""}\n")
static std::string ScanKernel_CL("__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n""{\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int pout = 0;\n"" int pin = 1;\n"" int mem = get_global_id(0);\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"" int offset;\n"" while(blockNr < numBlocks)\n"" {\n"" sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" for(offset = 1; offset < n; offset *=2)\n"" {\n"" pout = 1-pout;\n"" pin = 1-pout;\n"" if(thid >= offset)\n"" sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"" else\n"" sdata[pout*n+thid] = sdata[pin*n+thid];\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n"" if(thid == blockDim - 1)\n"" blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"" if(mem < numElements)\n"" output[mem] = sdata[pout*n+thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
static std::string ScanUpdate_CL("__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n""{\n"" __local TYPE offset;\n"" __local TYPE inc_offset;\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" int mem = get_global_id(0);\n"" unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"" while(blockNr < numBlocks)\n"" {\n"" if(thid == 0)\n"" {\n"" if(isInclusive == 0)\n"" {\n"" offset = init;\n"" if(blockNr > 0)\n"" {\n"" offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"" inc_offset = sums[blockNr-1];\n"" }\n"" }\n"" else\n"" {\n"" if(blockNr > 0)\n"" offset = sums[blockNr-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(isInclusive == 1)\n"" {\n"" if(blockNr > 0)\n"" sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"" else\n"" sdata[thid] = (mem < n) ? data[mem] : 0;\n"" if(mem == n-1)\n"" *ret = sdata[thid];\n"" }\n"" else\n"" {\n"" if(mem == n-1)\n"" *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"" if(thid == 0)\n"" sdata[thid] = offset;\n"" else\n"" sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(mem < n)\n"" data[mem] = sdata[thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
__global__ void ScanKernel_CU(BinaryFunc scanFunc, T *input, T *output, T *blockSums, unsigned int n, unsigned int numElements)
Definition: scan_kernels.h:187