SkePU(integratedwithStarPU)  0.8.1
 All Classes Namespaces Files Functions Enumerations Friends Macros Groups Pages
mapoverlap_kernels.h
Go to the documentation of this file.
1 
5 #ifndef MAPOVERLAP_KERNELS_H
6 #define MAPOVERLAP_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 #include <string>
11 
12 namespace skepu
13 {
14 
27 static std::string MatrixTranspose_CL(
28 "__kernel void matrix_transpose_KERNELNAME(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)\n"
29 "{\n"
30 " unsigned int xIndex = get_global_id(0);\n"
31 " unsigned int yIndex = get_global_id(1);\n"
32 " if((xIndex + offset < width) && (yIndex < height))\n"
33 " {\n"
34 " unsigned int index_in = yIndex * width + xIndex + offset;\n"
35 " block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];\n"
36 " }\n"
37 " barrier(CLK_LOCAL_MEM_FENCE);\n"
38 " xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);\n"
39 " yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);\n"
40 " if((xIndex < height) && (yIndex + offset < width))\n"
41 " {\n"
42 " unsigned int index_out = yIndex * height + xIndex;\n"
43 " odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];\n"
44 " }\n"
45 "}\n"
46 );
47 
48 
49 
57 static std::string MapOverlapKernel_CL(
58 "__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n"
59 "{\n"
60 " int tid = get_local_id(0);\n"
61 " int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
62 " if(poly == 0)\n"
63 " {\n"
64 " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
65 " if(tid < overlap)\n"
66 " {\n"
67 " sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"
68 " }\n"
69 " if(tid >= (get_local_size(0)-overlap))\n"
70 " {\n"
71 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"
72 " }\n"
73 " }\n"
74 " else if(poly == 1)\n"
75 " {\n"
76 " if(i < n)\n"
77 " {\n"
78 " sdata[overlap+tid] = input[i];\n"
79 " }\n"
80 " else if(i-n < overlap)\n"
81 " {\n"
82 " sdata[overlap+tid] = wrap[overlap+(i-n)];\n"
83 " }\n"
84 " else\n"
85 " {\n"
86 " sdata[overlap+tid] = pad;\n"
87 " }\n"
88 " if(tid < overlap)\n"
89 " {\n"
90 " sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"
91 " }\n"
92 " if(tid >= (get_local_size(0)-overlap))\n"
93 " {\n"
94 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"
95 " }\n"
96 " }\n"
97 " else if(poly == 2)\n"
98 " {\n"
99 " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
100 " if(tid < overlap)\n"
101 " {\n"
102 " sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"
103 " }\n"
104 " if(tid >= (get_local_size(0)-overlap))\n"
105 " {\n"
106 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"
107 " }\n"
108 " }\n"
109 " barrier(CLK_LOCAL_MEM_FENCE);\n"
110 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
111 " {\n"
112 " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
113 " }\n"
114 "}\n"
115 );
116 
117 
128 static std::string MapOverlapKernel_CL_Matrix_Row(
129 "__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerRow, int rowWidth, __local TYPE* sdata)\n"
130 "{\n"
131 " int tid = get_local_id(0);\n"
132 " int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
133 " int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"
134 " int tmp= (get_group_id(0) % blocksPerRow);\n"
135 " int tmp2= (get_group_id(0) / blocksPerRow);\n"
136 " if(poly == 0)\n"
137 " {\n"
138 " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
139 " if(tid < overlap)\n"
140 " {\n"
141 " sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"
142 " }\n"
143 " if(tid >= (get_local_size(0)-overlap))\n"
144 " {\n"
145 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"
146 " }\n"
147 " }\n"
148 " else if(poly == 1)\n"
149 " {\n"
150 " if(i < n)\n"
151 " {\n"
152 " sdata[overlap+tid] = input[i];\n"
153 " }\n"
154 " else if(i-n < overlap)\n"
155 " {\n"
156 " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
157 " }\n"
158 " else\n"
159 " {\n"
160 " sdata[overlap+tid] = pad;\n"
161 " }\n"
162 " if(tid < overlap)\n"
163 " {\n"
164 " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"
165 " }\n"
166 " if(tid >= (get_local_size(0)-overlap))\n"
167 " {\n"
168 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
169 " }\n"
170 " }\n"
171 " else if(poly == 2)\n"
172 " {\n"
173 " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
174 " if(tid < overlap)\n"
175 " {\n"
176 " sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"
177 " }\n"
178 " if(tid >= (get_local_size(0)-overlap))\n"
179 " {\n"
180 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"
181 " }\n"
182 " }\n"
183 " barrier(CLK_LOCAL_MEM_FENCE);\n"
184 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
185 " {\n"
186 " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
187 " }\n"
188 "}\n"
189 );
190 
191 
192 
197 static std::string MapOverlapKernel_CL_Matrix_Col(
198 "__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n"
199 "{\n"
200 " int tid = get_local_id(0);\n"
201 " int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
202 " int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"
203 " int tmp= (get_group_id(0) % blocksPerCol);\n"
204 " int tmp2= (get_group_id(0) / blocksPerCol);\n"
205 " int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"
206 " if(poly == 0)\n"
207 " {\n"
208 " sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"
209 " if(tid < overlap)\n"
210 " {\n"
211 " sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"
212 " }\n"
213 " if(tid >= (get_local_size(0)-overlap))\n"
214 " {\n"
215 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"
216 " }\n"
217 " }\n"
218 " else if(poly == 1)\n"
219 " {\n"
220 " if(i < n)\n"
221 " {\n"
222 " sdata[overlap+tid] = input[arrInd];\n"
223 " }\n"
224 " else if(i-n < overlap)\n"
225 " {\n"
226 " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
227 " }\n"
228 " else\n"
229 " {\n"
230 " sdata[overlap+tid] = pad;\n"
231 " }\n"
232 " if(tid < overlap)\n"
233 " {\n"
234 " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"
235 " }\n"
236 " if(tid >= (get_local_size(0)-overlap))\n"
237 " {\n"
238 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
239 " }\n"
240 " }\n"
241 " else if(poly == 2)\n"
242 " {\n"
243 " sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"
244 " if(tid < overlap)\n"
245 " {\n"
246 " sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"
247 " }\n"
248 " if(tid >= (get_local_size(0)-overlap))\n"
249 " {\n"
250 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"
251 " }\n"
252 " }\n"
253 " barrier(CLK_LOCAL_MEM_FENCE);\n"
254 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
255 " {\n"
256 " output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
257 " }\n"
258 "}\n"
259 );
260 
261 
262 
263 
264 
265 
266 
267 
268 
269 
270 
271 
272 
273 }
274 
275 #endif
276 
277 #ifdef SKEPU_CUDA
278 
279 namespace skepu
280 {
281 
293 // #define BLOCK_DIM 16
294 
295 template <typename T>
296 __global__ void transpose(T *odata, T *idata, int width, int height)
297 {
298 // __shared__ T block[BLOCK_DIM][BLOCK_DIM+1];
299  extern __shared__ char _sdata[];
300  T* sdata = reinterpret_cast<T*>(_sdata);
301 
302  int block_dim= blockDim.x;
303  int block_dimY= blockDim.y;
304  // read the matrix tile into shared memory
305  unsigned int xIndex = blockIdx.x * block_dim + threadIdx.x;
306  unsigned int yIndex = blockIdx.y * block_dimY + threadIdx.y;
307  if((xIndex < width) && (yIndex < height))
308  {
309  unsigned int index_in = yIndex * width + xIndex;
310  sdata[threadIdx.y][threadIdx.x] = idata[index_in];
311  }
312 
313  __syncthreads();
314 
315  // write the transposed matrix tile to global memory
316  xIndex = blockIdx.y * block_dim + threadIdx.x;
317  yIndex = blockIdx.x * block_dimY + threadIdx.y;
318  if((xIndex < height) && (yIndex < width))
319  {
320  unsigned int index_out = yIndex * height + xIndex;
321  odata[index_out] = sdata[threadIdx.x][threadIdx.y];
322  }
323 }
324 
331 template <int poly, typename T, typename OverlapFunc>
332 __global__ void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad)
333 {
334  extern __shared__ char _sdata[];
335  T* sdata = reinterpret_cast<T*>(_sdata);
336 
337  unsigned int tid = threadIdx.x;
338  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
339  int overlap = mapOverlapFunc.overlap;
340 
341  //Copy data to shared memory
342  if(poly == 0)
343  {
344  sdata[overlap+tid] = (i < n) ? input[i] : pad;
345 
346  if(tid < overlap)
347  {
348  sdata[tid] = (blockIdx.x == 0) ? pad : input[i-overlap];
349  }
350 
351  if(tid >= (blockDim.x-overlap))
352  {
353  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n) ? input[i+overlap] : pad;
354  }
355  }
356  else if(poly == 1)
357  {
358  if(i < n)
359  {
360  sdata[overlap+tid] = input[i];
361  }
362  else if(i-n < overlap)
363  {
364  sdata[overlap+tid] = wrap[overlap+(i-n)];
365  }
366  else
367  {
368  sdata[overlap+tid] = pad;
369  }
370 
371  if(tid < overlap)
372  {
373  sdata[tid] = (blockIdx.x == 0) ? wrap[tid] : input[i-overlap];
374  }
375 
376  if(tid >= (blockDim.x-overlap))
377  {
378  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];
379  }
380  }
381  else if(poly == 2)
382  {
383  sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
384 
385  if(tid < overlap)
386  {
387  sdata[tid] = (blockIdx.x == 0) ? input[0] : input[i-overlap];
388  }
389 
390  if(tid >= (blockDim.x-overlap))
391  {
392  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n) ? input[i+overlap] : input[n-1];
393  }
394  }
395 
396  __syncthreads();
397 
398  //Compute and store data
399  if( (i >= out_offset) && (i < out_offset+out_numelements) )
400  {
401  output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
402  }
403 }
404 
405 
406 
407 
408 
409 
414 template <int poly, typename T, typename OverlapFunc>
415 __global__ void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerRow, unsigned int rowWidth)
416 {
417  extern __shared__ char _sdata[];
418  T* sdata = reinterpret_cast<T*>(_sdata);
419 
420  unsigned int tid = threadIdx.x;
421  unsigned int i = blockIdx.x * blockDim.x + tid;
422  int overlap = mapOverlapFunc.overlap;
423 
424  unsigned wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerRow);
425  int tmp= (blockIdx.x % blocksPerRow);
426  int tmp2= (blockIdx.x / blocksPerRow);
427 
428 
429  //Copy data to shared memory
430  if(poly == 0)
431  {
432  sdata[overlap+tid] = (i < n) ? input[i] : pad;
433 
434  if(tid < overlap)
435  {
436  sdata[tid] = (tmp==0) ? pad : input[i-overlap];
437  }
438 
439  if(tid >= (blockDim.x-overlap))
440  {
441  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;
442  }
443  }
444  else if(poly == 1)
445  {
446  if(i < n)
447  {
448  sdata[overlap+tid] = input[i];
449  }
450  else if(i-n < overlap)
451  {
452  sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
453  }
454  else
455  {
456  sdata[overlap+tid] = pad;
457  }
458 
459  if(tid < overlap)
460  {
461  sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];
462  }
463 
464  if(tid >= (blockDim.x-overlap))
465  {
466  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
467  }
468  }
469  else if(poly == 2)
470  {
471  sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
472 
473  if(tid < overlap)
474  {
475  sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];
476  }
477 
478  if(tid >= (blockDim.x-overlap))
479  {
480  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];
481  }
482  }
483 
484  __syncthreads();
485 
486  //Compute and store data
487  if( (i >= out_offset) && (i < out_offset+out_numelements) )
488  {
489  output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
490  }
491 }
492 
493 
494 
495 
496 
497 
502 template <int poly, typename T, typename OverlapFunc>
503 __global__ void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth)
504 {
505  extern __shared__ char _sdata[];
506  T* sdata = reinterpret_cast<T*>(_sdata);
507 
508  unsigned int tid = threadIdx.x;
509  unsigned int i = blockIdx.x * blockDim.x + tid;
510  int overlap = mapOverlapFunc.overlap;
511 
512  unsigned wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerCol);
513  int tmp= (blockIdx.x % blocksPerCol);
514  int tmp2= (blockIdx.x / blocksPerCol);
515 
516  unsigned int arrInd = (threadIdx.x + tmp*blockDim.x)*rowWidth + ((blockIdx.x)/blocksPerCol);
517 
518  //Copy data to shared memory
519  if(poly == 0)
520  {
521  sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;
522 
523  if(tid < overlap)
524  {
525  sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];
526  }
527 
528  if(tid >= (blockDim.x-overlap))
529  {
530  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;
531  }
532  }
533  else if(poly == 1)
534  {
535  if(i < n)
536  {
537  sdata[overlap+tid] = input[arrInd];
538  }
539  else if(i-n < overlap)
540  {
541  sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
542  }
543  else
544  {
545  sdata[overlap+tid] = pad;
546  }
547 
548  if(tid < overlap)
549  {
550  sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];
551  }
552 
553  if(tid >= (blockDim.x-overlap))
554  {
555  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
556  }
557  }
558  else if(poly == 2)
559  {
560  sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];
561 
562  if(tid < overlap)
563  {
564  sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];
565  }
566 
567  if(tid >= (blockDim.x-overlap))
568  {
569  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];
570  }
571  }
572 
573  __syncthreads();
574 
575  //Compute and store data
576  if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )
577  {
578  output[arrInd-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
579  }
580 }
581 
582 
583 
584 
585 
586 
591 }
592 
593 #endif
594 
595 #endif
596 
__global__ void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerRow, unsigned int rowWidth)
Definition: mapoverlap_kernels.h:415
__global__ void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth)
Definition: mapoverlap_kernels.h:503
static std::string MapOverlapKernel_CL("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
__global__ void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad)
Definition: mapoverlap_kernels.h:332
static std::string MapOverlapKernel_CL_Matrix_Row("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerRow, int rowWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" int tmp= (get_group_id(0) % blocksPerRow);\n"" int tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string MapOverlapKernel_CL_Matrix_Col("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" int tmp= (get_group_id(0) % blocksPerCol);\n"" int tmp2= (get_group_id(0) / blocksPerCol);\n"" int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")