SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
mapoverlap_kernels.h
Go to the documentation of this file.
1 
5 #ifndef MAPOVERLAP_KERNELS_H
6 #define MAPOVERLAP_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 #include <string>
11 
12 namespace skepu
13 {
14 
35 static std::string MapOverlapKernel_CL(
36  "__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n"
37  "{\n"
38  " size_t tid = get_local_id(0);\n"
39  " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
40  " if(poly == 0)\n"
41  " {\n"
42  " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
43  " if(tid < overlap)\n"
44  " {\n"
45  " sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"
46  " }\n"
47  " if(tid >= (get_local_size(0)-overlap))\n"
48  " {\n"
49  " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"
50  " }\n"
51  " }\n"
52  " else if(poly == 1)\n"
53  " {\n"
54  " if(i < n)\n"
55  " {\n"
56  " sdata[overlap+tid] = input[i];\n"
57  " }\n"
58  " else if(i-n < overlap)\n"
59  " {\n"
60  " sdata[overlap+tid] = wrap[overlap+(i-n)];\n"
61  " }\n"
62  " else\n"
63  " {\n"
64  " sdata[overlap+tid] = pad;\n"
65  " }\n"
66  " if(tid < overlap)\n"
67  " {\n"
68  " sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"
69  " }\n"
70  " if(tid >= (get_local_size(0)-overlap))\n"
71  " {\n"
72  " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"
73  " }\n"
74  " }\n"
75  " else if(poly == 2)\n"
76  " {\n"
77  " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
78  " if(tid < overlap)\n"
79  " {\n"
80  " sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"
81  " }\n"
82  " if(tid >= (get_local_size(0)-overlap))\n"
83  " {\n"
84  " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"
85  " }\n"
86  " }\n"
87  " barrier(CLK_LOCAL_MEM_FENCE);\n"
88  " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
89  " {\n"
90  " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
91  " }\n"
92  "}\n"
93 );
94 
95 
104 static std::string MapOverlapKernel_CL_Matrix_Row(
105  "__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerRow, size_t rowWidth, __local TYPE* sdata)\n"
106  "{\n"
107  " size_t tid = get_local_id(0);\n"
108  " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
109  " size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"
110  " size_t tmp= (get_group_id(0) % blocksPerRow);\n"
111  " size_t tmp2= (get_group_id(0) / blocksPerRow);\n"
112  " if(poly == 0)\n"
113  " {\n"
114  " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
115  " if(tid < overlap)\n"
116  " {\n"
117  " sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"
118  " }\n"
119  " if(tid >= (get_local_size(0)-overlap))\n"
120  " {\n"
121  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"
122  " }\n"
123  " }\n"
124  " else if(poly == 1)\n"
125  " {\n"
126  " if(i < n)\n"
127  " {\n"
128  " sdata[overlap+tid] = input[i];\n"
129  " }\n"
130  " else if(i-n < overlap)\n"
131  " {\n"
132  " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
133  " }\n"
134  " else\n"
135  " {\n"
136  " sdata[overlap+tid] = pad;\n"
137  " }\n"
138  " if(tid < overlap)\n"
139  " {\n"
140  " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"
141  " }\n"
142  " if(tid >= (get_local_size(0)-overlap))\n"
143  " {\n"
144  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
145  " }\n"
146  " }\n"
147  " else if(poly == 2)\n"
148  " {\n"
149  " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
150  " if(tid < overlap)\n"
151  " {\n"
152  " sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"
153  " }\n"
154  " if(tid >= (get_local_size(0)-overlap))\n"
155  " {\n"
156  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"
157  " }\n"
158  " }\n"
159  " barrier(CLK_LOCAL_MEM_FENCE);\n"
160  " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
161  " {\n"
162  " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
163  " }\n"
164  "}\n"
165 );
166 
167 
168 
177 static std::string MapOverlapKernel_CL_Matrix_Col(
178  "__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n"
179  "{\n"
180  " size_t tid = get_local_id(0);\n"
181  " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
182  " size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"
183  " size_t tmp= (get_group_id(0) % blocksPerCol);\n"
184  " size_t tmp2= (get_group_id(0) / blocksPerCol);\n"
185  " size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"
186  " if(poly == 0)\n"
187  " {\n"
188  " sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"
189  " if(tid < overlap)\n"
190  " {\n"
191  " sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"
192  " }\n"
193  " if(tid >= (get_local_size(0)-overlap))\n"
194  " {\n"
195  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"
196  " }\n"
197  " }\n"
198  " else if(poly == 1)\n"
199  " {\n"
200  " if(i < n)\n"
201  " {\n"
202  " sdata[overlap+tid] = input[arrInd];\n"
203  " }\n"
204  " else if(i-n < overlap)\n"
205  " {\n"
206  " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
207  " }\n"
208  " else\n"
209  " {\n"
210  " sdata[overlap+tid] = pad;\n"
211  " }\n"
212  " if(tid < overlap)\n"
213  " {\n"
214  " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"
215  " }\n"
216  " if(tid >= (get_local_size(0)-overlap))\n"
217  " {\n"
218  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
219  " }\n"
220  " }\n"
221  " else if(poly == 2)\n"
222  " {\n"
223  " sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"
224  " if(tid < overlap)\n"
225  " {\n"
226  " sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"
227  " }\n"
228  " if(tid >= (get_local_size(0)-overlap))\n"
229  " {\n"
230  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"
231  " }\n"
232  " }\n"
233  " barrier(CLK_LOCAL_MEM_FENCE);\n"
234  " if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"
235  " {\n"
236  " output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
237  " }\n"
238  "}\n"
239 );
240 
241 
242 
251 static std::string MapOverlapKernel_CL_Matrix_ColMulti(
252  "__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t in_offset, size_t out_numelements, int poly, int deviceType, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n"
253  "{\n"
254  " size_t tid = get_local_id(0);\n"
255  " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
256  " size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"
257  " size_t tmp= (get_group_id(0) % blocksPerCol);\n"
258  " size_t tmp2= (get_group_id(0) / blocksPerCol);\n"
259  " size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"
260  " if(poly == 0)\n"
261  " {\n"
262  " sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"
263  " if(deviceType == -1)\n"
264  " {\n"
265  " if(tid < overlap)\n"
266  " {\n"
267  " sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"
268  " }\n"
269  " \n"
270  " if(tid >= (get_local_size(0)-overlap))\n"
271  " {\n"
272  " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
273  " }\n"
274  " }\n"
275  " else if(deviceType == 0) \n"
276  " {\n"
277  " if(tid < overlap)\n"
278  " {\n"
279  " sdata[tid] = input[arrInd];\n"
280  " }\n"
281  " if(tid >= (get_local_size(0)-overlap))\n"
282  " {\n"
283  " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
284  " }\n"
285  " }\n"
286  " else if(deviceType == 1)\n"
287  " {\n"
288  " if(tid < overlap)\n"
289  " {\n"
290  " sdata[tid] = input[arrInd];\n"
291  " }\n"
292  " if(tid >= (get_local_size(0)-overlap))\n"
293  " {\n"
294  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"
295  " }\n"
296  " }\n"
297  " }\n"
298  " else if(poly == 1)\n"
299  " {\n"
300  " sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"
301  " if(deviceType == -1)\n"
302  " {\n"
303  " if(tid < overlap)\n"
304  " {\n"
305  " sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"
306  " }\n"
307  " if(tid >= (get_local_size(0)-overlap))\n"
308  " {\n"
309  " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
310  " }\n"
311  " }\n"
312  " else if(deviceType == 0)\n"
313  " {\n"
314  " if(tid < overlap)\n"
315  " {\n"
316  " sdata[tid] = input[arrInd];\n"
317  " }\n"
318  " if(tid >= (get_local_size(0)-overlap))\n"
319  " {\n"
320  " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
321  " }\n"
322  " }\n"
323  " else if(deviceType == 1)\n"
324  " {\n"
325  " if(tid < overlap)\n"
326  " {\n"
327  " sdata[tid] = input[arrInd];\n"
328  " }\n"
329  " if(tid >= (get_local_size(0)-overlap))\n"
330  " {\n"
331  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"
332  " }\n"
333  " }\n"
334  " }\n"
335  " else if(poly == 2)\n"
336  " {\n"
337  " sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"
338  " if(deviceType == -1)\n"
339  " {\n"
340  " if(tid < overlap)\n"
341  " {\n"
342  " sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"
343  " }\n"
344  " if(tid >= (get_local_size(0)-overlap))\n"
345  " {\n"
346  " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
347  " }\n"
348  " }\n"
349  " else if(deviceType == 0)\n"
350  " {\n"
351  " if(tid < overlap)\n"
352  " {\n"
353  " sdata[tid] = input[arrInd];\n"
354  " }\n"
355  " if(tid >= (get_local_size(0)-overlap))\n"
356  " {\n"
357  " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
358  " }\n"
359  " }\n"
360  " else if(deviceType == 1)\n"
361  " {\n"
362  " if(tid < overlap)\n"
363  " {\n"
364  " sdata[tid] = input[arrInd];\n"
365  " }\n"
366  " if(tid >= (get_local_size(0)-overlap))\n"
367  " {\n"
368  " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"
369  " }\n"
370  " }\n"
371  " }\n"
372  " barrier(CLK_LOCAL_MEM_FENCE);\n"
373  " if( arrInd < out_numelements )\n"
374  " {\n"
375  " output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
376  " }\n"
377  "}\n"
378 );
379 
380 
384 }
385 
386 #endif
387 
388 #ifdef SKEPU_CUDA
389 
390 namespace skepu
391 {
392 
407 template <typename T>
408 __global__ void transpose(T *odata, T *idata, size_t width, size_t height)
409 {
410  extern __shared__ char _sdata[];
411  T* sdata = reinterpret_cast<T*>(_sdata);
412 
413  size_t block_dim= blockDim.x;
414  size_t block_dimY= blockDim.y;
415  // read the matrix tile into shared memory
416  size_t xIndex = blockIdx.x * block_dim + threadIdx.x;
417  size_t yIndex = blockIdx.y * block_dimY + threadIdx.y;
418  if((xIndex < width) && (yIndex < height))
419  {
420  size_t index_in = yIndex * width + xIndex;
421  sdata[threadIdx.y][threadIdx.x] = idata[index_in];
422  }
423 
424  __syncthreads();
425 
426  // write the transposed matrix tile to global memory
427  xIndex = blockIdx.y * block_dim + threadIdx.x;
428  yIndex = blockIdx.x * block_dimY + threadIdx.y;
429  if((xIndex < height) && (yIndex < width))
430  {
431  size_t index_out = yIndex * height + xIndex;
432  odata[index_out] = sdata[threadIdx.x][threadIdx.y];
433  }
434 }
435 
442 template <int poly, typename T, typename OverlapFunc>
443 __global__ void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, size_t n, size_t out_offset, size_t out_numelements, T pad)
444 {
445  extern __shared__ char _sdata[];
446  T* sdata = reinterpret_cast<T*>(_sdata);
447 
448  size_t tid = threadIdx.x;
449  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
450  size_t gridSize = blockDim.x*gridDim.x;
451 
452  size_t overlap = mapOverlapFunc.overlap;
453 
454  while(i<(n+overlap-1))
455  {
456  //Copy data to shared memory
457  if(poly == 0) // constant policy
458  {
459  sdata[overlap+tid] = (i < n) ? input[i] : pad;
460 
461  if(tid < overlap)
462  {
463  sdata[tid] = (i<overlap) ? pad : input[i-overlap];
464  }
465 
466  if(tid >= (blockDim.x-overlap))
467  {
468  sdata[tid+2*overlap] = (i+overlap < n) ? input[i+overlap] : pad;
469  }
470  }
471  else if(poly == 1)
472  {
473  if(i < n)
474  {
475  sdata[overlap+tid] = input[i];
476  }
477  else
478  {
479  sdata[overlap+tid] = wrap[overlap+(i-n)];
480  }
481 
482  if(tid < overlap)
483  {
484  sdata[tid] = (i<overlap) ? wrap[tid] : input[i-overlap];
485  }
486 
487  if(tid >= (blockDim.x-overlap))
488  {
489  sdata[tid+2*overlap] = (i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];
490  }
491  }
492  else if(poly == 2) // DUPLICATE
493  {
494  sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
495 
496  if(tid < overlap)
497  {
498  sdata[tid] = (i<overlap) ? input[0] : input[i-overlap];
499  }
500 
501  if(tid >= (blockDim.x-overlap))
502  {
503  sdata[tid+2*overlap] = (i+overlap < n) ? input[i+overlap] : input[n-1];
504  }
505  }
506 
507  __syncthreads();
508 
509  //Compute and store data
510  if( (i >= out_offset) && (i < out_offset+out_numelements) )
511  {
512  output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
513  }
514 
515  i += gridSize;
516 
517  __syncthreads();
518  }
519 }
520 
521 
522 
531 template <int poly, typename T, typename OverlapFunc>
532 __global__ void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerRow, size_t rowWidth)
533 {
534  extern __shared__ char _sdata[];
535  T* sdata = reinterpret_cast<T*>(_sdata);
536 
537  size_t tid = threadIdx.x;
538  size_t i = blockIdx.x * blockDim.x + tid;
539  size_t overlap = mapOverlapFunc.overlap;
540 
541  size_t wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerRow);
542  size_t tmp= (blockIdx.x % blocksPerRow);
543  size_t tmp2= (blockIdx.x / blocksPerRow);
544 
545 
546  //Copy data to shared memory
547  if(poly == 0)
548  {
549  sdata[overlap+tid] = (i < n) ? input[i] : pad;
550 
551  if(tid < overlap)
552  {
553  sdata[tid] = (tmp==0) ? pad : input[i-overlap];
554  }
555 
556  if(tid >= (blockDim.x-overlap))
557  {
558  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;
559  }
560  }
561  else if(poly == 1)
562  {
563  if(i < n)
564  {
565  sdata[overlap+tid] = input[i];
566  }
567  else if(i-n < overlap)
568  {
569  sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
570  }
571  else
572  {
573  sdata[overlap+tid] = pad;
574  }
575 
576  if(tid < overlap)
577  {
578  sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];
579  }
580 
581  if(tid >= (blockDim.x-overlap))
582  {
583  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
584  }
585  }
586  else if(poly == 2)
587  {
588  sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
589 
590  if(tid < overlap)
591  {
592  sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];
593  }
594 
595  if(tid >= (blockDim.x-overlap))
596  {
597  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];
598  }
599  }
600 
601  __syncthreads();
602 
603  //Compute and store data
604  if( (i >= out_offset) && (i < out_offset+out_numelements) )
605  {
606  output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
607  }
608 }
609 
610 
611 
620 template <int poly, typename T, typename OverlapFunc>
621 __global__ void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
622 {
623  extern __shared__ char _sdata[];
624  T* sdata = reinterpret_cast<T*>(_sdata);
625 
626  size_t tid = threadIdx.x;
627  size_t i = blockIdx.x * blockDim.x + tid;
628  size_t overlap = mapOverlapFunc.overlap;
629 
630  size_t wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerCol);
631  size_t tmp= (blockIdx.x % blocksPerCol);
632  size_t tmp2= (blockIdx.x / blocksPerCol);
633 
634  size_t arrInd = (threadIdx.x + tmp*blockDim.x)*rowWidth + ((blockIdx.x)/blocksPerCol);
635 
636  //Copy data to shared memory
637  if(poly == 0)
638  {
639  sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;
640 
641  if(tid < overlap)
642  {
643  sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];
644  }
645 
646  if(tid >= (blockDim.x-overlap))
647  {
648  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;
649  }
650  }
651  else if(poly == 1)
652  {
653  if(i < n)
654  {
655  sdata[overlap+tid] = input[arrInd];
656  }
657  else if(i-n < overlap)
658  {
659  sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
660  }
661  else
662  {
663  sdata[overlap+tid] = pad;
664  }
665 
666  if(tid < overlap)
667  {
668  sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];
669  }
670 
671  if(tid >= (blockDim.x-overlap))
672  {
673  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
674  }
675  }
676  else if(poly == 2)
677  {
678  sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];
679 
680  if(tid < overlap)
681  {
682  sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];
683  }
684 
685  if(tid >= (blockDim.x-overlap))
686  {
687  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];
688  }
689  }
690 
691  __syncthreads();
692 
693  //Compute and store data
694  if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )
695  {
696  output[arrInd-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
697  }
698 }
699 
700 
701 
702 
703 
704 
705 
715 template <int poly, int deviceType, typename T, typename OverlapFunc>
716 __global__ void MapOverlapKernel_CU_Matrix_ColMulti(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap, size_t n, size_t in_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
717 {
718  extern __shared__ char _sdata[];
719  T* sdata = reinterpret_cast<T*>(_sdata);
720 
721  size_t tid = threadIdx.x;
722  size_t i = blockIdx.x * blockDim.x + tid;
723  size_t overlap = mapOverlapFunc.overlap;
724 
725  size_t tmp= (blockIdx.x % blocksPerCol);
726  size_t tmp2= (blockIdx.x / blocksPerCol);
727 
728  size_t arrInd = (threadIdx.x + tmp*blockDim.x)*rowWidth + tmp2; //((blockIdx.x)/blocksPerCol);
729 
730  if(poly == 0) //IF overlap policy is CONSTANT
731  {
732  sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad; // in_offset
733 
734  if(deviceType == -1) // first device, i.e. in_offset=0
735  {
736  if(tid < overlap)
737  {
738  sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];
739  }
740 
741  if(tid >= (blockDim.x-overlap))
742  {
743  sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
744  }
745  }
746  else if(deviceType == 0) // middle device
747  {
748  if(tid < overlap)
749  {
750  sdata[tid] = input[arrInd];
751  }
752 
753  if(tid >= (blockDim.x-overlap))
754  {
755  sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
756  }
757  }
758  else if(deviceType == 1) // last device
759  {
760  if(tid < overlap)
761  {
762  sdata[tid] = input[arrInd];
763  }
764 
765  if(tid >= (blockDim.x-overlap))
766  {
767  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+in_offset+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;
768  }
769  }
770  }
771  else if(poly == 1) //IF overlap policy is CYCLIC
772  {
773  sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);
774 
775  if(deviceType == -1) // first device, i.e. in_offset=0
776  {
777  if(tid < overlap)
778  {
779  sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];
780  }
781 
782  if(tid >= (blockDim.x-overlap))
783  {
784  sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
785  }
786  }
787  else if(deviceType == 0) // middle device
788  {
789  if(tid < overlap)
790  {
791  sdata[tid] = input[arrInd];
792  }
793 
794  if(tid >= (blockDim.x-overlap))
795  {
796  sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
797  }
798  }
799  else if(deviceType == 1) // last device
800  {
801  if(tid < overlap)
802  {
803  sdata[tid] = input[arrInd];
804  }
805 
806  if(tid >= (blockDim.x-overlap))
807  {
808  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-blockDim.x)];
809  }
810  }
811  }
812  else if(poly == 2) //IF overlap policy is DUPLICATE
813  {
814  sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];
815 
816  if(deviceType == -1) // first device, i.e. in_offset=0
817  {
818  if(tid < overlap)
819  {
820  sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];
821  }
822 
823  if(tid >= (blockDim.x-overlap))
824  {
825  sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
826  }
827  }
828  else if(deviceType == 0) // middle device
829  {
830  if(tid < overlap)
831  {
832  sdata[tid] = input[arrInd];
833  }
834 
835  if(tid >= (blockDim.x-overlap))
836  {
837  sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
838  }
839  }
840  else if(deviceType == 1) // last device
841  {
842  if(tid < overlap)
843  {
844  sdata[tid] = input[arrInd];
845  }
846 
847  if(tid >= (blockDim.x-overlap))
848  {
849  sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+in_offset+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];
850  }
851  }
852  }
853 
854  __syncthreads();
855 
856  //Compute and store data
857  if( arrInd < out_numelements )
858  {
859  output[arrInd] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
860  }
861 }
862 
863 
868 }
869 
870 #endif
871 
872 #endif
873 
874 
__global__ void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerRow, size_t rowWidth)
Definition: mapoverlap_kernels.h:532
__global__ void transpose(T *odata, T *idata, size_t width, size_t height)
Definition: mapoverlap_kernels.h:408
__global__ void MapOverlapKernel_CU_Matrix_ColMulti(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t in_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
Definition: mapoverlap_kernels.h:716
__global__ void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
Definition: mapoverlap_kernels.h:621
static std::string MapOverlapKernel_CL_Matrix_Row("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerRow, size_t rowWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" size_t tmp= (get_group_id(0) % blocksPerRow);\n"" size_t tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string MapOverlapKernel_CL_Matrix_ColMulti("__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t in_offset, size_t out_numelements, int poly, int deviceType, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" \n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0) \n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( arrInd < out_numelements )\n"" {\n"" output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
__global__ void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad)
Definition: mapoverlap_kernels.h:443
static std::string MapOverlapKernel_CL_Matrix_Col("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string MapOverlapKernel_CL("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")