5 #ifndef MAPOVERLAP_KERNELS_H
6 #define MAPOVERLAP_KERNELS_H
36 "__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n"
38 " size_t tid = get_local_id(0);\n"
39 " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
42 " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
43 " if(tid < overlap)\n"
45 " sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"
47 " if(tid >= (get_local_size(0)-overlap))\n"
49 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"
52 " else if(poly == 1)\n"
56 " sdata[overlap+tid] = input[i];\n"
58 " else if(i-n < overlap)\n"
60 " sdata[overlap+tid] = wrap[overlap+(i-n)];\n"
64 " sdata[overlap+tid] = pad;\n"
66 " if(tid < overlap)\n"
68 " sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"
70 " if(tid >= (get_local_size(0)-overlap))\n"
72 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"
75 " else if(poly == 2)\n"
77 " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
78 " if(tid < overlap)\n"
80 " sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"
82 " if(tid >= (get_local_size(0)-overlap))\n"
84 " sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"
87 " barrier(CLK_LOCAL_MEM_FENCE);\n"
88 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
90 " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
105 "__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerRow, size_t rowWidth, __local TYPE* sdata)\n"
107 " size_t tid = get_local_id(0);\n"
108 " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
109 " size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"
110 " size_t tmp= (get_group_id(0) % blocksPerRow);\n"
111 " size_t tmp2= (get_group_id(0) / blocksPerRow);\n"
114 " sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"
115 " if(tid < overlap)\n"
117 " sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"
119 " if(tid >= (get_local_size(0)-overlap))\n"
121 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"
124 " else if(poly == 1)\n"
128 " sdata[overlap+tid] = input[i];\n"
130 " else if(i-n < overlap)\n"
132 " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
136 " sdata[overlap+tid] = pad;\n"
138 " if(tid < overlap)\n"
140 " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"
142 " if(tid >= (get_local_size(0)-overlap))\n"
144 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
147 " else if(poly == 2)\n"
149 " sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"
150 " if(tid < overlap)\n"
152 " sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"
154 " if(tid >= (get_local_size(0)-overlap))\n"
156 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"
159 " barrier(CLK_LOCAL_MEM_FENCE);\n"
160 " if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"
162 " output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
178 "__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n"
180 " size_t tid = get_local_id(0);\n"
181 " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
182 " size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"
183 " size_t tmp= (get_group_id(0) % blocksPerCol);\n"
184 " size_t tmp2= (get_group_id(0) / blocksPerCol);\n"
185 " size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"
188 " sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"
189 " if(tid < overlap)\n"
191 " sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"
193 " if(tid >= (get_local_size(0)-overlap))\n"
195 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"
198 " else if(poly == 1)\n"
202 " sdata[overlap+tid] = input[arrInd];\n"
204 " else if(i-n < overlap)\n"
206 " sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"
210 " sdata[overlap+tid] = pad;\n"
212 " if(tid < overlap)\n"
214 " sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"
216 " if(tid >= (get_local_size(0)-overlap))\n"
218 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"
221 " else if(poly == 2)\n"
223 " sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"
224 " if(tid < overlap)\n"
226 " sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"
228 " if(tid >= (get_local_size(0)-overlap))\n"
230 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"
233 " barrier(CLK_LOCAL_MEM_FENCE);\n"
234 " if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"
236 " output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
252 "__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t in_offset, size_t out_numelements, int poly, int deviceType, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n"
254 " size_t tid = get_local_id(0);\n"
255 " size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
256 " size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"
257 " size_t tmp= (get_group_id(0) % blocksPerCol);\n"
258 " size_t tmp2= (get_group_id(0) / blocksPerCol);\n"
259 " size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"
262 " sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"
263 " if(deviceType == -1)\n"
265 " if(tid < overlap)\n"
267 " sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"
270 " if(tid >= (get_local_size(0)-overlap))\n"
272 " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
275 " else if(deviceType == 0) \n"
277 " if(tid < overlap)\n"
279 " sdata[tid] = input[arrInd];\n"
281 " if(tid >= (get_local_size(0)-overlap))\n"
283 " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
286 " else if(deviceType == 1)\n"
288 " if(tid < overlap)\n"
290 " sdata[tid] = input[arrInd];\n"
292 " if(tid >= (get_local_size(0)-overlap))\n"
294 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"
298 " else if(poly == 1)\n"
300 " sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"
301 " if(deviceType == -1)\n"
303 " if(tid < overlap)\n"
305 " sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"
307 " if(tid >= (get_local_size(0)-overlap))\n"
309 " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
312 " else if(deviceType == 0)\n"
314 " if(tid < overlap)\n"
316 " sdata[tid] = input[arrInd];\n"
318 " if(tid >= (get_local_size(0)-overlap))\n"
320 " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
323 " else if(deviceType == 1)\n"
325 " if(tid < overlap)\n"
327 " sdata[tid] = input[arrInd];\n"
329 " if(tid >= (get_local_size(0)-overlap))\n"
331 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"
335 " else if(poly == 2)\n"
337 " sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"
338 " if(deviceType == -1)\n"
340 " if(tid < overlap)\n"
342 " sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"
344 " if(tid >= (get_local_size(0)-overlap))\n"
346 " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
349 " else if(deviceType == 0)\n"
351 " if(tid < overlap)\n"
353 " sdata[tid] = input[arrInd];\n"
355 " if(tid >= (get_local_size(0)-overlap))\n"
357 " sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"
360 " else if(deviceType == 1)\n"
362 " if(tid < overlap)\n"
364 " sdata[tid] = input[arrInd];\n"
366 " if(tid >= (get_local_size(0)-overlap))\n"
368 " sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"
372 " barrier(CLK_LOCAL_MEM_FENCE);\n"
373 " if( arrInd < out_numelements )\n"
375 " output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"
407 template <
typename T>
408 __global__
void transpose(T *odata, T *idata,
size_t width,
size_t height)
410 extern __shared__
char _sdata[];
411 T* sdata =
reinterpret_cast<T*
>(_sdata);
413 size_t block_dim= blockDim.x;
414 size_t block_dimY= blockDim.y;
416 size_t xIndex = blockIdx.x * block_dim + threadIdx.x;
417 size_t yIndex = blockIdx.y * block_dimY + threadIdx.y;
418 if((xIndex < width) && (yIndex < height))
420 size_t index_in = yIndex * width + xIndex;
421 sdata[threadIdx.y][threadIdx.x] = idata[index_in];
427 xIndex = blockIdx.y * block_dim + threadIdx.x;
428 yIndex = blockIdx.x * block_dimY + threadIdx.y;
429 if((xIndex < height) && (yIndex < width))
431 size_t index_out = yIndex * height + xIndex;
432 odata[index_out] = sdata[threadIdx.x][threadIdx.y];
442 template <
int poly,
typename T,
typename OverlapFunc>
443 __global__
void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
size_t n,
size_t out_offset,
size_t out_numelements, T pad)
445 extern __shared__
char _sdata[];
446 T* sdata =
reinterpret_cast<T*
>(_sdata);
448 size_t tid = threadIdx.x;
449 size_t i = blockIdx.x * blockDim.x + threadIdx.x;
450 size_t gridSize = blockDim.x*gridDim.x;
452 size_t overlap = mapOverlapFunc.overlap;
454 while(i<(n+overlap-1))
459 sdata[overlap+tid] = (i < n) ? input[i] : pad;
463 sdata[tid] = (i<overlap) ? pad : input[i-overlap];
466 if(tid >= (blockDim.x-overlap))
468 sdata[tid+2*overlap] = (i+overlap < n) ? input[i+overlap] : pad;
475 sdata[overlap+tid] = input[i];
479 sdata[overlap+tid] = wrap[overlap+(i-n)];
484 sdata[tid] = (i<overlap) ? wrap[tid] : input[i-overlap];
487 if(tid >= (blockDim.x-overlap))
489 sdata[tid+2*overlap] = (i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];
494 sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
498 sdata[tid] = (i<overlap) ? input[0] : input[i-overlap];
501 if(tid >= (blockDim.x-overlap))
503 sdata[tid+2*overlap] = (i+overlap < n) ? input[i+overlap] : input[n-1];
510 if( (i >= out_offset) && (i < out_offset+out_numelements) )
512 output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
531 template <
int poly,
typename T,
typename OverlapFunc>
532 __global__
void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
size_t n,
size_t out_offset,
size_t out_numelements, T pad,
size_t blocksPerRow,
size_t rowWidth)
534 extern __shared__
char _sdata[];
535 T* sdata =
reinterpret_cast<T*
>(_sdata);
537 size_t tid = threadIdx.x;
538 size_t i = blockIdx.x * blockDim.x + tid;
539 size_t overlap = mapOverlapFunc.overlap;
541 size_t wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerRow);
542 size_t tmp= (blockIdx.x % blocksPerRow);
543 size_t tmp2= (blockIdx.x / blocksPerRow);
549 sdata[overlap+tid] = (i < n) ? input[i] : pad;
553 sdata[tid] = (tmp==0) ? pad : input[i-overlap];
556 if(tid >= (blockDim.x-overlap))
558 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;
565 sdata[overlap+tid] = input[i];
567 else if(i-n < overlap)
569 sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
573 sdata[overlap+tid] = pad;
578 sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];
581 if(tid >= (blockDim.x-overlap))
583 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
588 sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];
592 sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];
595 if(tid >= (blockDim.x-overlap))
597 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];
604 if( (i >= out_offset) && (i < out_offset+out_numelements) )
606 output[i-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
620 template <
int poly,
typename T,
typename OverlapFunc>
621 __global__
void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
size_t n,
size_t out_offset,
size_t out_numelements, T pad,
size_t blocksPerCol,
size_t rowWidth,
size_t colWidth)
623 extern __shared__
char _sdata[];
624 T* sdata =
reinterpret_cast<T*
>(_sdata);
626 size_t tid = threadIdx.x;
627 size_t i = blockIdx.x * blockDim.x + tid;
628 size_t overlap = mapOverlapFunc.overlap;
630 size_t wrapIndex= 2 * overlap * (int)(blockIdx.x/blocksPerCol);
631 size_t tmp= (blockIdx.x % blocksPerCol);
632 size_t tmp2= (blockIdx.x / blocksPerCol);
634 size_t arrInd = (threadIdx.x + tmp*blockDim.x)*rowWidth + ((blockIdx.x)/blocksPerCol);
639 sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;
643 sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];
646 if(tid >= (blockDim.x-overlap))
648 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;
655 sdata[overlap+tid] = input[arrInd];
657 else if(i-n < overlap)
659 sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];
663 sdata[overlap+tid] = pad;
668 sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];
671 if(tid >= (blockDim.x-overlap))
673 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-blockDim.x)];
678 sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];
682 sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];
685 if(tid >= (blockDim.x-overlap))
687 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];
694 if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )
696 output[arrInd-out_offset] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
715 template <
int poly,
int deviceType,
typename T,
typename OverlapFunc>
716 __global__
void MapOverlapKernel_CU_Matrix_ColMulti(OverlapFunc mapOverlapFunc, T* input, T* output, T* wrap,
size_t n,
size_t in_offset,
size_t out_numelements, T pad,
size_t blocksPerCol,
size_t rowWidth,
size_t colWidth)
718 extern __shared__
char _sdata[];
719 T* sdata =
reinterpret_cast<T*
>(_sdata);
721 size_t tid = threadIdx.x;
722 size_t i = blockIdx.x * blockDim.x + tid;
723 size_t overlap = mapOverlapFunc.overlap;
725 size_t tmp= (blockIdx.x % blocksPerCol);
726 size_t tmp2= (blockIdx.x / blocksPerCol);
728 size_t arrInd = (threadIdx.x + tmp*blockDim.x)*rowWidth + tmp2;
732 sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;
738 sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];
741 if(tid >= (blockDim.x-overlap))
743 sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
746 else if(deviceType == 0)
750 sdata[tid] = input[arrInd];
753 if(tid >= (blockDim.x-overlap))
755 sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
758 else if(deviceType == 1)
762 sdata[tid] = input[arrInd];
765 if(tid >= (blockDim.x-overlap))
767 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+in_offset+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;
773 sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);
779 sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];
782 if(tid >= (blockDim.x-overlap))
784 sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
787 else if(deviceType == 0)
791 sdata[tid] = input[arrInd];
794 if(tid >= (blockDim.x-overlap))
796 sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
799 else if(deviceType == 1)
803 sdata[tid] = input[arrInd];
806 if(tid >= (blockDim.x-overlap))
808 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-blockDim.x)];
814 sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];
820 sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];
823 if(tid >= (blockDim.x-overlap))
825 sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
828 else if(deviceType == 0)
832 sdata[tid] = input[arrInd];
835 if(tid >= (blockDim.x-overlap))
837 sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];
840 else if(deviceType == 1)
844 sdata[tid] = input[arrInd];
847 if(tid >= (blockDim.x-overlap))
849 sdata[tid+2*overlap] = (blockIdx.x != gridDim.x-1 && (arrInd+in_offset+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];
857 if( arrInd < out_numelements )
859 output[arrInd] = mapOverlapFunc.CU(&(sdata[tid+overlap]));
__global__ void MapOverlapKernel_CU_Matrix_Row(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerRow, size_t rowWidth)
Definition: mapoverlap_kernels.h:532
__global__ void transpose(T *odata, T *idata, size_t width, size_t height)
Definition: mapoverlap_kernels.h:408
__global__ void MapOverlapKernel_CU_Matrix_ColMulti(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t in_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
Definition: mapoverlap_kernels.h:716
__global__ void MapOverlapKernel_CU_Matrix_Col(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
Definition: mapoverlap_kernels.h:621
static std::string MapOverlapKernel_CL_Matrix_Row("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerRow, size_t rowWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" size_t tmp= (get_group_id(0) % blocksPerRow);\n"" size_t tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string MapOverlapKernel_CL_Matrix_ColMulti("__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t in_offset, size_t out_numelements, int poly, int deviceType, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" \n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0) \n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( arrInd < out_numelements )\n"" {\n"" output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
__global__ void MapOverlapKernel_CU(OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad)
Definition: mapoverlap_kernels.h:443
static std::string MapOverlapKernel_CL_Matrix_Col("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string MapOverlapKernel_CL("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")