5 #ifndef MAPOVERLAP_KERNELS_H
6 #define MAPOVERLAP_KERNELS_H
/**
 * OpenCL source text, kept in a std::string, for a tiled transpose of a float matrix.
 *
 * Kernel arguments as used by the visible code:
 *   odata  - destination buffer, written at yIndex * height + xIndex
 *   idata  - source buffer, read at yIndex * width + xIndex + offset
 *   offset - added to the column index on the read side and to the bounds checks
 *   width, height - bounds used for the read and the write guards
 *   block  - __local tile, addressed with a row stride of BLOCK_DIM + 1
 *
 * KERNELNAME and BLOCK_DIM are not defined in the visible text; presumably the host
 * code substitutes or defines them before the program is built - TODO confirm.
 * NOTE(review): the embedded listing numbers skip (28 to 30, 32 to 34, ...), so some
 * source lines, most likely braces, are not visible in this excerpt.
 */
27 static std::string MatrixTranspose_CL(
28 "__kernel void matrix_transpose_KERNELNAME(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)\n"
30 " unsigned int xIndex = get_global_id(0);\n"
31 " unsigned int yIndex = get_global_id(1);\n"
32 " if((xIndex + offset < width) && (yIndex < height))\n"
34 " unsigned int index_in = yIndex * width + xIndex + offset;\n"
35 " block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];\n"
/* Work-items of the group meet here before the tile is read back with swapped local ids. */
37 " barrier(CLK_LOCAL_MEM_FENCE);\n"
/* The group ids are swapped relative to the read side to form the output coordinates. */
38 " xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);\n"
39 " yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);\n"
40 " if((xIndex < height) && (yIndex + offset < width))\n"
42 " unsigned int index_out = yIndex * height + xIndex;\n"
43 " odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];\n"
/*
 * OpenCL source text for the one-dimensional MapOverlap kernel.
 *
 * Each work-item with global index i stores one centre element at sdata[overlap+tid].
 * Work-items with tid < overlap also fill the left halo sdata[tid], and work-items
 * with tid >= local_size - overlap fill the right halo sdata[tid+2*overlap].
 * Three variants differ only in what is stored where the input has no neighbour:
 *   first variant - the constant pad
 *   poly == 1     - values from the wrap buffer
 *   poly == 2     - the edge values input[0] and input[n-1]
 * After the local barrier, work-items with i in [out_offset, out_offset+out_numelements)
 * write FUNCTIONNAME(&sdata[tid+overlap]) to output[i-out_offset].
 *
 * TYPE, KERNELNAME and FUNCTIONNAME are not defined in the visible text; presumably
 * they are replaced by the host code before compilation - TODO confirm.
 * NOTE(review): the declaration line that opens this string and several brace and
 * condition lines are not visible in this excerpt (the listing numbers skip).
 */
58 "__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n"
60 " int tid = get_local_id(0);\n"
61 " int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
/* Pad-filled variant: anything outside [0, n) or outside the first/last group becomes pad. */
64 " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
65 " if(tid < overlap)\n"
67 " sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"
69 " if(tid >= (get_local_size(0)-overlap))\n"
71 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"
/* Wrap variant: the left halo of group 0 reads wrap[tid]; the right side reads wrap[overlap + ...]. */
74 " else if(poly == 1)\n"
78 " sdata[overlap+tid] = input[i];\n"
80 " else if(i-n < overlap)\n"
82 " sdata[overlap+tid] = wrap[overlap+(i-n)];\n"
86 " sdata[overlap+tid] = pad;\n"
88 " if(tid < overlap)\n"
90 " sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"
92 " if(tid >= (get_local_size(0)-overlap))\n"
94 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"
/* Edge-replicating variant: out-of-range reads use input[0] on the left and input[n-1] on the right. */
97 " else if(poly == 2)\n"
99 " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
100 " if(tid < overlap)\n"
102 " sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"
104 " if(tid >= (get_local_size(0)-overlap))\n"
106 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"
/* All local stores must be complete before any work-item reads its neighbourhood. */
109 " barrier(CLK_LOCAL_MEM_FENCE);\n"
110 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
112 " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
/*
 * OpenCL source text for the row-wise matrix MapOverlap kernel.
 *
 * Work-groups are numbered so that blocksPerRow consecutive groups share one value of
 *   tmp2 = group_id / blocksPerRow, while
 *   tmp  = group_id % blocksPerRow is the position of the group inside that run.
 * wrapIndex = 2 * overlap * tmp2 selects a span of 2*overlap entries of the wrap buffer.
 * The loads mirror the one-dimensional kernel, except that the halo substitutions are
 * applied at tmp == 0 (left) and tmp == blocksPerRow-1 (right):
 *   first variant - pad
 *   poly == 1     - wrap[tid+wrapIndex] on the left, wrap[overlap+wrapIndex+...] on the right
 *   poly == 2     - input[tmp2*rowWidth] on the left, input[(tmp2+1)*rowWidth-1] on the right
 * After the barrier, work-items with i in [out_offset, out_offset+out_numelements)
 * write FUNCTIONNAME(&sdata[tid+overlap]) to output[i-out_offset].
 *
 * NOTE(review): presumably tmp2 is the matrix row and the work-group size divides the
 * row into blocksPerRow pieces - confirm against the host launch code. The declaration
 * line that opens this string and several brace and condition lines are not visible here.
 */
129 "__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerRow, int rowWidth, __local TYPE* sdata)\n"
131 " int tid = get_local_id(0);\n"
132 " int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
133 " int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"
134 " int tmp= (get_group_id(0) % blocksPerRow);\n"
135 " int tmp2= (get_group_id(0) / blocksPerRow);\n"
/* Pad-filled variant. */
138 " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
139 " if(tid < overlap)\n"
141 " sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"
143 " if(tid >= (get_local_size(0)-overlap))\n"
145 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"
/* Wrap variant: halo values come from the wrap span selected by wrapIndex. */
148 " else if(poly == 1)\n"
152 " sdata[overlap+tid] = input[i];\n"
154 " else if(i-n < overlap)\n"
156 " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
160 " sdata[overlap+tid] = pad;\n"
162 " if(tid < overlap)\n"
164 " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"
166 " if(tid >= (get_local_size(0)-overlap))\n"
168 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
/* Edge-replicating variant: uses the elements at tmp2*rowWidth and (tmp2+1)*rowWidth-1. */
171 " else if(poly == 2)\n"
173 " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
174 " if(tid < overlap)\n"
176 " sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"
178 " if(tid >= (get_local_size(0)-overlap))\n"
180 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"
/* All local stores must be complete before any work-item reads its neighbourhood. */
183 " barrier(CLK_LOCAL_MEM_FENCE);\n"
184 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
186 " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
/*
 * OpenCL source text for the column-wise matrix MapOverlap kernel.
 *
 * Work-groups are numbered so that blocksPerCol consecutive groups share one value of
 *   tmp2 = group_id / blocksPerCol, while
 *   tmp  = group_id % blocksPerCol is the position of the group inside that run.
 * Elements are addressed with a stride of rowWidth:
 *   arrInd = (tid + tmp*local_size) * rowWidth + tmp2
 * and halo elements are read overlap*rowWidth entries before or after arrInd.
 * Halo substitutions at tmp == 0 and tmp == blocksPerCol-1:
 *   first variant - pad
 *   poly == 1     - wrap[tid+wrapIndex] and wrap[overlap+wrapIndex+...]
 *   poly == 2     - input[tmp2] and input[tmp2+(colWidth-1)*rowWidth]
 * The result is stored at output[arrInd-out_offset].
 *
 * NOTE(review): the final guard tests the linear index i, but the store index is
 * arrInd - out_offset; the CUDA kernel MapOverlapKernel_CU_Matrix_Col in this file
 * guards on arrInd instead. If the groups of one column cover more than colWidth
 * elements, i and arrInd can fall on different sides of the bound - confirm which
 * guard is intended.
 * NOTE(review): the centre load is likewise selected by i < n while reading
 * input[arrInd]. The declaration line that opens this string and several brace and
 * condition lines are not visible in this excerpt.
 */
198 "__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n"
200 " int tid = get_local_id(0);\n"
201 " int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
202 " int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"
203 " int tmp= (get_group_id(0) % blocksPerCol);\n"
204 " int tmp2= (get_group_id(0) / blocksPerCol);\n"
205 " int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"
/* Pad-filled variant. */
208 " sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"
209 " if(tid < overlap)\n"
211 " sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"
213 " if(tid >= (get_local_size(0)-overlap))\n"
215 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"
/* Wrap variant: halo values come from the wrap span selected by wrapIndex. */
218 " else if(poly == 1)\n"
222 " sdata[overlap+tid] = input[arrInd];\n"
224 " else if(i-n < overlap)\n"
226 " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
230 " sdata[overlap+tid] = pad;\n"
232 " if(tid < overlap)\n"
234 " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"
236 " if(tid >= (get_local_size(0)-overlap))\n"
238 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
/* Edge-replicating variant: uses input[tmp2] and input[tmp2+(colWidth-1)*rowWidth]. */
241 " else if(poly == 2)\n"
243 " sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"
244 " if(tid < overlap)\n"
246 " sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"
248 " if(tid >= (get_local_size(0)-overlap))\n"
250 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"
/* All local stores must be complete before any work-item reads its neighbourhood. */
253 " barrier(CLK_LOCAL_MEM_FENCE);\n"
254 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
256 " output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
/**
 * CUDA kernel template that transposes a matrix through a shared-memory tile.
 *
 * @tparam T      element type of the matrix
 * @param odata   destination buffer, written at yIndex * height + xIndex
 * @param idata   source buffer, read at yIndex * width + xIndex
 * @param width   bound for the column index on the read side
 * @param height  bound for the row index on the read side
 *
 * The tile lives in dynamically sized shared memory (extern __shared__ char array
 * reinterpreted as T*); its size is supplied at launch and is not visible here.
 *
 * NOTE(review): sdata is declared as T* but is indexed with two subscripts
 * (sdata[threadIdx.y][threadIdx.x]); for a scalar T that expression is ill-formed,
 * so this template presumably is never instantiated with one - TODO confirm.
 * NOTE(review): no synchronization between the tile store and the tile load is
 * visible in this excerpt; listing lines 311 to 315 are not shown, so it may exist.
 */
295 template <
typename T>
296 __global__
void transpose(T *odata, T *idata,
int width,
int height)
299 extern __shared__
char _sdata[];
300 T* sdata =
reinterpret_cast<T*
>(_sdata);
302 int block_dim= blockDim.x;
303 int block_dimY= blockDim.y;
// Read side: one element per thread, guarded by the matrix bounds.
305 unsigned int xIndex = blockIdx.x * block_dim + threadIdx.x;
306 unsigned int yIndex = blockIdx.y * block_dimY + threadIdx.y;
307 if((xIndex < width) && (yIndex < height))
309 unsigned int index_in = yIndex * width + xIndex;
310 sdata[threadIdx.y][threadIdx.x] = idata[index_in];
// Write side: block indices are swapped to form the output coordinates.
316 xIndex = blockIdx.y * block_dim + threadIdx.x;
317 yIndex = blockIdx.x * block_dimY + threadIdx.y;
318 if((xIndex < height) && (yIndex < width))
320 unsigned int index_out = yIndex * height + xIndex;
321 odata[index_out] = sdata[threadIdx.x][threadIdx.y];
/**
 * CUDA kernel template for the one-dimensional MapOverlap operation.
 *
 * @tparam poly         integer template parameter; the lines that test it are not
 *                      visible in this excerpt, presumably it selects one of the three
 *                      load variants below - TODO confirm
 * @tparam T            element type
 * @tparam OverlapFunc  functor type providing a member overlap and a member function CU
 * @param mapOverlapFunc   functor; overlap is read from it and CU is applied per element
 * @param input            source elements
 * @param output           destination, written at i - out_offset
 * @param wrap             buffer read by the wrap variant
 * @param n                bound used for the input index checks
 * @param out_offset       first linear index that is written
 * @param out_numelements  number of linear indices that are written
 * @param pad              constant stored where no input value is used
 *
 * Each thread stores one centre element at sdata[overlap+tid]; the last overlap
 * threads of the block also store sdata[tid+2*overlap]. The stores to sdata[tid]
 * are visible, but their guarding condition lines are not shown in this excerpt.
 * sdata is dynamically sized shared memory whose size is supplied at launch.
 */
331 template <
int poly,
typename T,
typename OverlapFunc>
332 __global__
void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
unsigned int n,
unsigned int out_offset,
unsigned int out_numelements, T pad)
334 extern __shared__
char _sdata[];
335 T* sdata =
reinterpret_cast<T*
>(_sdata);
337 unsigned int tid = threadIdx.x;
338 unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
339 int overlap = mapOverlapFunc.overlap;
// Pad-filled variant: out-of-range positions receive pad.
344 sdata[overlap+tid] = (i < n) ? input[i] : pad;
348 sdata[tid] = (blockIdx.x == 0) ? pad : input[i-overlap];
351 if(tid >= (blockDim.x-overlap))
353 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n) ? input[i+overlap] : pad;
// Wrap variant: out-of-range positions are read from the wrap buffer.
360 sdata[overlap+tid] = input[i];
362 else if(i-n < overlap)
364 sdata[overlap+tid] = wrap[overlap+(i-n)];
368 sdata[overlap+tid] = pad;
373 sdata[tid] = (blockIdx.x == 0) ? wrap[tid] : input[i-overlap];
376 if(tid >= (blockDim.x-overlap))
378 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];
// Edge-replicating variant: out-of-range positions use input[0] or input[n-1].
383 sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
387 sdata[tid] = (blockIdx.x == 0) ? input[0] : input[i-overlap];
390 if(tid >= (blockDim.x-overlap))
392 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n) ? input[i+overlap] : input[n-1];
// Only linear indices in [out_offset, out_offset+out_numelements) produce output.
399 if( (i >= out_offset) && (i < out_offset+out_numelements) )
401 output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
/**
 * CUDA kernel template for the row-wise matrix MapOverlap operation.
 *
 * @tparam poly         integer template parameter; the lines that test it are not
 *                      visible in this excerpt, presumably it selects one of the three
 *                      load variants below - TODO confirm
 * @tparam T            element type
 * @tparam OverlapFunc  functor type providing a member overlap and a member function CU
 * @param mapOverlapFunc   functor; overlap is read from it and CU is applied per element
 * @param input            source elements
 * @param output           destination, written at i - out_offset
 * @param wrap             buffer read by the wrap variant, offset by wrapIndex
 * @param n                bound used for the input index checks
 * @param out_offset       first linear index that is written
 * @param out_numelements  number of linear indices that are written
 * @param pad              constant stored where no input value is used
 * @param blocksPerRow     number of consecutive blocks that share one value of tmp2
 * @param rowWidth         used to locate the edge elements tmp2*rowWidth and
 *                         (tmp2+1)*rowWidth-1
 *
 * tmp is the position of the block inside its run of blocksPerRow blocks and tmp2 is
 * the index of that run; halo substitutions are applied at tmp == 0 and at
 * tmp == blocksPerRow-1. wrapIndex = 2 * overlap * tmp2 selects the wrap span.
 */
414 template <
int poly,
typename T,
typename OverlapFunc>
415 __global__
void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
unsigned int n,
unsigned int out_offset,
unsigned int out_numelements, T pad,
unsigned int blocksPerRow,
unsigned int rowWidth)
417 extern __shared__
char _sdata[];
418 T* sdata =
reinterpret_cast<T*
>(_sdata);
420 unsigned int tid = threadIdx.x;
421 unsigned int i = blockIdx.x * blockDim.x + tid;
422 int overlap = mapOverlapFunc.overlap;
424 unsigned wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerRow);
425 int tmp= (blockIdx.x % blocksPerRow);
426 int tmp2= (blockIdx.x / blocksPerRow);
// Pad-filled variant.
432 sdata[overlap+tid] = (i < n) ? input[i] : pad;
436 sdata[tid] = (tmp==0) ? pad : input[i-overlap];
439 if(tid >= (blockDim.x-overlap))
441 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;
// Wrap variant: halo values come from the wrap span selected by wrapIndex.
448 sdata[overlap+tid] = input[i];
450 else if(i-n < overlap)
452 sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
456 sdata[overlap+tid] = pad;
461 sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];
464 if(tid >= (blockDim.x-overlap))
466 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
// Edge-replicating variant: uses the elements at tmp2*rowWidth and (tmp2+1)*rowWidth-1.
471 sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
475 sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];
478 if(tid >= (blockDim.x-overlap))
480 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];
// Only linear indices in [out_offset, out_offset+out_numelements) produce output.
487 if( (i >= out_offset) && (i < out_offset+out_numelements) )
489 output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
/**
 * CUDA kernel template for the column-wise matrix MapOverlap operation.
 *
 * @tparam poly         integer template parameter; the lines that test it are not
 *                      visible in this excerpt, presumably it selects one of the three
 *                      load variants below - TODO confirm
 * @tparam T            element type
 * @tparam OverlapFunc  functor type providing a member overlap and a member function CU
 * @param mapOverlapFunc   functor; overlap is read from it and CU is applied per element
 * @param input            source elements, addressed through arrInd
 * @param output           destination, written at arrInd - out_offset
 * @param wrap             buffer read by the wrap variant, offset by wrapIndex
 * @param n                bound used for the input index checks
 * @param out_offset       first strided index that is written
 * @param out_numelements  number of strided indices that are written
 * @param pad              constant stored where no input value is used
 * @param blocksPerCol     number of consecutive blocks that share one value of tmp2
 * @param rowWidth         stride between consecutive elements handled by one block run
 * @param colWidth         used to locate the last edge element tmp2+(colWidth-1)*rowWidth
 *
 * arrInd = (threadIdx.x + tmp*blockDim.x) * rowWidth + blockIdx.x/blocksPerCol is the
 * strided element index; halo elements lie overlap*rowWidth entries before or after it.
 * Unlike the other two CUDA kernels in this file, the final guard tests arrInd rather
 * than the linear index i.
 * NOTE(review): the centre loads are still selected by i < n while reading
 * input[arrInd] - confirm that the two indices cannot disagree about being in range.
 */
502 template <
int poly,
typename T,
typename OverlapFunc>
503 __global__
void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
unsigned int n,
unsigned int out_offset,
unsigned int out_numelements, T pad,
unsigned int blocksPerCol,
unsigned int rowWidth,
unsigned int colWidth)
505 extern __shared__
char _sdata[];
506 T* sdata =
reinterpret_cast<T*
>(_sdata);
508 unsigned int tid = threadIdx.x;
509 unsigned int i = blockIdx.x * blockDim.x + tid;
510 int overlap = mapOverlapFunc.overlap;
512 unsigned wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerCol);
513 int tmp= (blockIdx.x % blocksPerCol);
514 int tmp2= (blockIdx.x / blocksPerCol);
516 unsigned int arrInd = (threadIdx.x + tmp*blockDim.x)*rowWidth + ((blockIdx.x)/blocksPerCol);
// Pad-filled variant.
521 sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;
525 sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];
528 if(tid >= (blockDim.x-overlap))
530 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;
// Wrap variant: halo values come from the wrap span selected by wrapIndex.
537 sdata[overlap+tid] = input[arrInd];
539 else if(i-n < overlap)
541 sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
545 sdata[overlap+tid] = pad;
550 sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];
553 if(tid >= (blockDim.x-overlap))
555 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
// Edge-replicating variant: uses input[tmp2] and input[tmp2+(colWidth-1)*rowWidth].
560 sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];
564 sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];
567 if(tid >= (blockDim.x-overlap))
569 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];
// The guard is on the strided index, which is also the index used for the store.
576 if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )
578 output[arrInd-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
__global__ void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerRow, unsigned int rowWidth)
Definition: mapoverlap_kernels.h:415
__global__ void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth)
Definition: mapoverlap_kernels.h:503
static std::string MapOverlapKernel_CL("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
__global__ void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad)
Definition: mapoverlap_kernels.h:332
static std::string MapOverlapKernel_CL_Matrix_Row("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerRow, int rowWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" int tmp= (get_group_id(0) % blocksPerRow);\n"" int tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? 
input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string MapOverlapKernel_CL_Matrix_Col("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" int tmp= (get_group_id(0) % blocksPerCol);\n"" int tmp2= (get_group_id(0) / blocksPerCol);\n"" int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? 
input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")