5 #ifndef MAPOVERLAP_CONVOL_KERNELS_H
6 #define MAPOVERLAP_CONVOL_KERNELS_H
// OpenCL kernel source, emitted as a C++ string literal (the enclosing
// string-variable declaration is outside this view). TYPE and KERNELNAME are
// textual placeholders substituted before the kernel source is compiled.
// Computes a 2D convolution weighted by a user filter held in __constant
// memory: the work-group first stages a (sharedRows x sharedCols) input tile
// into __local sdata, then each work-item accumulates over the filter window
// and writes the window-size-normalized result.
32 "__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n"
// (xx, yy): global coordinates of this work-group's tile origin.
34 " size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
35 " size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
36 " size_t x = get_global_id(0);\n"
37 " size_t y = get_global_id(1);\n"
// Load guard: threads exist up to the output size plus the filter halo
// (out + filter - 1), i.e. the padded input extent this tile must read.
38 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
// Each work-item first loads its own element, then the nested while loops
// cooperatively fill the remaining halo columns/rows of the shared tile,
// striding by the work-group size in each dimension.
40 " size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
41 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
42 " size_t shared_x= get_local_id(0)+get_local_size(0);\n"
43 " size_t shared_y= get_local_id(1);\n"
44 " while(shared_y<sharedRows)\n"
46 " while(shared_x<sharedCols)\n"
48 " sharedIdx = shared_y * sharedCols + shared_x; \n"
49 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
50 " shared_x = shared_x + get_local_size(0);\n"
52 " shared_x = get_local_id(0);\n"
53 " shared_y = shared_y + get_local_size(1);\n"
// NOTE(review): brace lines are dropped from this view, so the barrier's
// position relative to the boundary 'if' cannot be confirmed here; OpenCL
// requires barriers to be reached by the entire work-group — verify in the
// full source.
56 " barrier(CLK_LOCAL_MEM_FENCE);\n"
// Compute guard: only threads mapping to a real output pixel accumulate.
57 " if(x<out_cols && y<out_rows)\n"
58 " for(size_t j=0;j<filter_rows;j++) \n"
60 " for(size_t j=0;j<filter_rows;j++) \n"
62 " for(size_t i=0;i<filter_cols;i++) \n"
// Filter-weighted accumulation over the window anchored at this thread's
// local position in the shared tile.
64 " sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"
// Result is normalized by the window size (filter_rows * filter_cols).
67 " output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
// OpenCL kernel source (C++ string literal; TYPE, KERNELNAME and FUNCTIONNAME
// are placeholders substituted before compilation). Unlike the filter/average
// variants, this kernel applies a user-supplied overlap function: after the
// shared-tile load it passes FUNCTIONNAME a pointer to the window's center
// element inside __local sdata plus the row stride, and stores the result.
78 "__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n"
// (xx, yy): global coordinates of this work-group's tile origin.
80 " size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
81 " size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
82 " size_t x = get_global_id(0);\n"
83 " size_t y = get_global_id(1);\n"
// Load guard: covers the output extent plus the filter halo.
84 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
86 " size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
87 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
// Cooperative halo fill of the shared tile, striding by work-group size.
88 " size_t shared_x= get_local_id(0)+get_local_size(0);\n"
89 " size_t shared_y= get_local_id(1);\n"
90 " while(shared_y<sharedRows)\n"
92 " while(shared_x<sharedCols)\n"
94 " sharedIdx = shared_y * sharedCols + shared_x; \n"
95 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
96 " shared_x = shared_x + get_local_size(0);\n"
98 " shared_x = get_local_id(0);\n"
99 " shared_y = shared_y + get_local_size(1);\n"
// NOTE(review): braces are dropped from this view — confirm in the full
// source that the barrier is reached by all work-items of the group.
102 " barrier(CLK_LOCAL_MEM_FENCE);\n"
103 " if(x<out_cols && y<out_rows)\n"
// FUNCTIONNAME receives the address of the window-center element
// (local position offset by half the filter size in each dimension).
105 " output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"
// OpenCL kernel source (C++ string literal; TYPE and KERNELNAME are
// placeholders). Plain-average variant of the convolution: identical
// shared-tile staging as the filter kernel above, but the accumulation sums
// the raw window elements without filter weights, then normalizes by the
// window size — i.e. a box (mean) filter.
115 "__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n"
// (xx, yy): global coordinates of this work-group's tile origin.
117 " size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
118 " size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
119 " size_t x = get_global_id(0);\n"
120 " size_t y = get_global_id(1);\n"
// Load guard: output extent plus filter halo.
121 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
123 " size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
124 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
// Cooperative halo fill of the shared tile.
125 " size_t shared_x= get_local_id(0)+get_local_size(0);\n"
126 " size_t shared_y= get_local_id(1);\n"
127 " while(shared_y<sharedRows)\n"
129 " while(shared_x<sharedCols)\n"
131 " sharedIdx = shared_y * sharedCols + shared_x; \n"
132 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
133 " shared_x = shared_x + get_local_size(0);\n"
135 " shared_x = get_local_id(0);\n"
136 " shared_y = shared_y + get_local_size(1);\n"
// NOTE(review): braces dropped from this view — confirm barrier placement
// satisfies the whole-work-group requirement in the full source.
139 " barrier(CLK_LOCAL_MEM_FENCE);\n"
140 " if(x<out_cols && y<out_rows)\n"
143 " for(size_t j=0;j<filter_rows;j++) \n"
145 " for(size_t i=0;i<filter_cols;i++) \n"
// Unweighted sum of the window elements.
147 " sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"
150 " output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
// CUDA launch-geometry and occupancy-model constants used by the tiling
// calculation below. The per-SM figures (48000 B shared memory, 32768
// registers, 48 resident warps, 8 thread blocks) describe the target GPU
// generation this model was tuned for — revisit for newer architectures.
189 #define BLOCK_SIZE_X 16
190 #define BLOCK_SIZE_Y 32
192 #define NUM_REGISTERS_PER_SP 32768
193 #define SHARED_MEM_SIZE_BYTES 48000
194 #define THREADS_PER_WARP 32
195 #define WARPS_PER_SP 48
196 #define THREAD_BLOCK_PER_SP 8
202 template <
typename T>
211 template <
typename T>
// Occupancy model: computes how many horizontal tiles (multiple output pixels
// per thread) a convolution thread block can process, limited by per-SM
// register-file and shared-memory budgets, and constrained so the tiling
// divides the input width evenly. Used to pick among the fixed-tiling
// kernels below.
// NOTE(review): this view is missing several interior lines of the original
// (braces, return statements, the shared-memory negative check's 'if', and
// the loop body after line 263) — code below is byte-identical to what is
// visible.
223 template <
typename T>
224 size_t calculateTiling(
size_t regCountPerThread,
size_t filterSizeX,
size_t filterSizeY,
size_t inputSizeX,
bool maximizeTiling=
false)
226 size_t numThreadsPerTB = (BLOCK_SIZE_X * BLOCK_SIZE_Y);
// Warps per thread block, rounded up.
// NOTE(review): uses WARP_SIZE, but only THREADS_PER_WARP is visible among
// the defines above — WARP_SIZE is presumably defined on a line dropped from
// this view; confirm.
228 size_t numWarpsPerTB = (numThreadsPerTB+WARP_SIZE-1) / WARP_SIZE;
// Resident blocks per SM: bounded by warp budget and the hardware block cap.
230 size_t maxTBPerSP =
min( (WARPS_PER_SP / numWarpsPerTB), (
size_t)THREAD_BLOCK_PER_SP);
// Registers left after the kernel's baseline usage at full residency;
// negative means register pressure already forbids tiling > 1.
238 long long remRegPerThreads = NUM_REGISTERS_PER_SP - (regCountPerThread * numWarpsPerTB * WARP_SIZE * maxTBPerSP);
240 if(remRegPerThreads <0)
242 std::cerr <<
"Error! Limited by Register usage, tiling cannot be more than 1\n";
// Spare registers expressed per thread.
246 remRegPerThreads = remRegPerThreads / (numWarpsPerTB * WARP_SIZE * maxTBPerSP);
// Shared memory left after one baseline (block + halo) tile per resident
// block.
248 long long sharedMem = SHARED_MEM_SIZE_BYTES - ((BLOCK_SIZE_X + filterSizeX - 1) * (BLOCK_SIZE_Y + filterSizeY - 1) *
sizeof(T) * maxTBPerSP);
252 std::cerr <<
"Error! Limited by shared memory usage, tiling cannot be more than 1\n"; 
// Tiling bounded by how many block-widths fit in the input and how many
// extra tile columns the spare shared memory can hold.
256 size_t tilingSM =
min( (
size_t)(inputSizeX/BLOCK_SIZE_X), (
size_t)(sharedMem / (BLOCK_SIZE_X * (BLOCK_SIZE_Y + filterSizeY - 1) *
sizeof (T) * maxTBPerSP)) );
// Further bounded by the spare register budget.
258 tilingSM =
min(tilingSM, (
size_t)remRegPerThreads);
260 inputSizeX = inputSizeX / BLOCK_SIZE_X;
// Shrink (body not visible here) until the tiling evenly divides the
// input's width in block units.
263 while( (inputSizeX%tilingSM) != 0)
// Constant-memory staging buffer for the user's filter weights; the CUDA
// kernels below reinterpret_cast it to T* (d_Filter). Sized in bytes, so it
// bounds filter size as 16386/sizeof(T) elements.
// NOTE(review): 16386 is an odd capacity — 16384 (16 KB) was presumably
// intended; confirm against the host-side copy that fills this buffer.
277 __device__ __constant__
char deviceFilter[16386];
// CUDA counterpart of the OpenCL 2D kernel: stages a (sharedRows x
// sharedCols) input tile in dynamically-sized shared memory, then applies the
// user functor's CU() operator to a pointer at the window's center element
// and writes the result to output.
// NOTE(review): brace lines and the synchronization between the load and
// compute phases are dropped from this view; code below is byte-identical to
// what is visible.
286 template<
typename T,
typename OverlapFunc>
287 __global__
void conv_cuda_2D_kernel(OverlapFunc mapOverlapFunc, T* input, T* output,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory is declared as char[] and retyped to T.
289 extern __shared__
char _sdata[];
290 T* sdata =
reinterpret_cast<T*
>(_sdata);
// (xx, yy): global coordinates of this block's tile origin.
292 size_t xx = blockIdx.x * blockDim.x;
293 size_t yy = blockIdx.y * blockDim.y;
295 size_t x = xx + threadIdx.x;
296 size_t y = yy + threadIdx.y;
// Load guard: output extent plus filter halo.
298 if( x<(out_cols+(filter_cols-1)) && y<(out_rows+(filter_rows-1)) )
300 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
302 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill, striding by block dimensions.
304 size_t shared_x= threadIdx.x+blockDim.x;
305 size_t shared_y= threadIdx.y;
308 while(shared_y<sharedRows)
310 while(shared_x<sharedCols)
312 sharedIdx = shared_y * sharedCols + shared_x;
313 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
314 shared_x = shared_x + blockDim.x;
316 shared_x = threadIdx.x;
317 shared_y = shared_y + blockDim.y;
// Compute guard: only real output pixels are written.
322 if(x<out_cols && y<out_rows)
// The functor receives the address of the window-center element in sdata.
324 output[y*out_pitch+x] = mapOverlapFunc.CU(&(sdata[(threadIdx.y+(filter_rows/2)) * sharedCols + (threadIdx.x+(filter_cols/2))]));
// CUDA convolution with shared-memory tiling, no horizontal tiling factor.
// The useFilter template flag selects between two accumulation paths visible
// below: filter-weighted (reading d_Filter from the deviceFilter constant
// buffer) and plain unweighted sum; both normalize by the window size.
// NOTE(review): the branch tokens selecting between the two paths, the sum
// declaration, braces and the load/compute synchronization are dropped from
// this view; code below is byte-identical to what is visible.
334 template<
bool useFilter,
typename T>
335 __global__
void conv_cuda_shared_kernel(T* input, T* output,
const size_t in_rows,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
337 extern __shared__
char _sdata[];
338 T* sdata =
reinterpret_cast<T*
>(_sdata);
// (xx, yy): global coordinates of this block's tile origin.
340 size_t xx = blockIdx.x * blockDim.x;
341 size_t yy = blockIdx.y * blockDim.y;
343 size_t x = xx + threadIdx.x;
344 size_t y = yy + threadIdx.y;
// Load guard: output extent plus filter halo.
346 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
348 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
350 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill, striding by block dimensions.
352 size_t shared_x= threadIdx.x+blockDim.x;
353 size_t shared_y= threadIdx.y;
356 while(shared_y<sharedRows)
358 while(shared_x<sharedCols)
360 sharedIdx = shared_y * sharedCols + shared_x;
361 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
362 shared_x = shared_x + blockDim.x;
364 shared_x = threadIdx.x;
365 shared_y = shared_y + blockDim.y;
// Compute guard: only real output pixels are written.
370 if(x<out_cols && y<out_rows)
// Filter-weighted path: filter weights live in constant memory.
376 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
377 for(
size_t j=0; j<filter_rows; j++)
379 for(
size_t i=0; i<filter_cols; i++)
381 sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ] * d_Filter[j*filter_cols+i];
// Unweighted path: plain sum of the window (box filter).
387 for(
size_t j=0; j<filter_rows; j++)
389 for(
size_t i=0; i<filter_cols; i++)
391 sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ];
// Normalize by window size in both paths.
395 output[y*out_pitch+x] = sum / (filter_rows * filter_cols);
// Generic-tiling variant: each block covers numTiles horizontal tiles
// (xx advance = blockDim.x * blockDim.x-strided numTiles) and loops over the
// tiles, reusing one shared-memory load. useFilter selects filter-weighted
// vs. unweighted accumulation.
// NOTE(review): shared_x — declared size_t at line 419 — is reused as the
// convolution ACCUMULATOR in this kernel (lines 458/468) and written to
// output (line 472). For floating-point T every addend is truncated to an
// integer, unlike the fixed-tiling kernels below which use T-typed sums.
// This looks like a genuine bug, but the tile-loop tail and branch tokens
// are missing from this view, so it is flagged rather than fixed here.
406 template<
bool useFilter,
typename T>
407 __global__
void conv_cuda_shared_tiling_kernel(T* input, T* output,
const size_t numTiles,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
409 extern __shared__
char _sdata[];
410 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans numTiles block-widths per block.
412 size_t xx = blockIdx.x * blockDim.x * numTiles;
413 size_t yy = blockIdx.y * blockDim.y;
415 size_t x = xx + threadIdx.x;
416 size_t y = yy + threadIdx.y;
418 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
419 size_t shared_x= threadIdx.x+blockDim.x;
420 size_t shared_y= threadIdx.y;
// Load guard: output extent plus filter halo.
422 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
424 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the (wide, numTiles-spanning) shared tile.
427 while(shared_y<sharedRows)
429 while(shared_x<sharedCols)
431 sharedIdx = shared_y * sharedCols + shared_x;
432 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
433 shared_x = shared_x + blockDim.x;
435 shared_x = threadIdx.x;
436 shared_y = shared_y + blockDim.y;
// From here sharedIdx is repurposed as the per-tile x-offset into sdata.
442 sharedIdx = threadIdx.x;
444 for(
size_t t=0; t<numTiles; t++)
446 if(x<out_cols && y<out_rows)
// Filter-weighted path (weights in constant memory).
453 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
454 for(
size_t j=0; j<filter_rows; j++)
456 for(
size_t i=0; i<filter_cols; i++)
// NOTE(review): size_t accumulator — see header note.
458 shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * d_Filter[j*filter_cols+i];
// Unweighted path.
464 for(
size_t j=0; j<filter_rows; j++)
466 for(
size_t i=0; i<filter_cols; i++)
468 shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
472 output[y*out_pitch+x] = shared_x / (filter_rows * filter_cols);
// Advance the shared-memory window to the next tile.
474 sharedIdx += blockDim.x;
// Fixed-tiling variant, factor 2: each block covers two horizontal tiles
// (xx advance = blockDim.x * 2) and each thread produces two output pixels
// (sum, sum2) from a single shared-memory load. useFilter selects the
// filter-weighted vs. unweighted accumulation path.
// NOTE(review): braces, the sum/sum2 declarations, the useFilter branch
// tokens and the load/compute synchronization are dropped from this view;
// code below is byte-identical to what is visible.
488 template<
bool useFilter,
typename T>
489 __global__
void conv_cuda_shared_tiling_2_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
491 extern __shared__
char _sdata[];
492 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 2 block-widths per block.
494 size_t xx = blockIdx.x * blockDim.x * 2;
495 size_t yy = blockIdx.y * blockDim.y;
497 size_t x = xx + threadIdx.x;
498 size_t y = yy + threadIdx.y;
500 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
// Load guard: output extent plus filter halo.
503 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
505 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the double-width shared tile.
507 size_t shared_x= threadIdx.x+blockDim.x;
508 size_t shared_y= threadIdx.y;
511 while(shared_y<sharedRows)
513 while(shared_x<sharedCols)
515 sharedIdx = shared_y * sharedCols + shared_x;
516 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
517 shared_x = shared_x + blockDim.x;
519 shared_x = threadIdx.x;
520 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
526 sharedIdx = threadIdx.x;
530 if(x<out_cols && y<out_rows)
// Filter-weighted path: second tile read blockDim.x further right.
537 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
538 for(
size_t j=0; j<filter_rows; j++)
540 for(
size_t i=0; i<filter_cols; i++)
542 sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * d_Filter[j*filter_cols+i];
543 sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ] * d_Filter[j*filter_cols+i];
// Unweighted path.
549 for(
size_t j=0; j<filter_rows; j++)
551 for(
size_t i=0; i<filter_cols; i++)
553 sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
554 sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ];
// Two normalized outputs, one block-width apart.
558 output[y*out_pitch+x] = sum / (filter_rows * filter_cols);
559 output[y*out_pitch+x+blockDim.x] = sum2 / (filter_rows * filter_cols);
// Fixed-tiling variant, factor 4: each block covers four horizontal tiles
// and each thread accumulates four output pixels (sum..sum4) from one shared
// load. Inside the accumulation loops, shared_x is repurposed as a walking
// shared-memory index, advancing by blockDim.x between the four tiles'
// reads; the same walking pattern is reused for the four output stores.
// NOTE(review): braces, the sum declarations, the useFilter branch tokens
// and the load/compute synchronization are dropped from this view; code
// below is byte-identical to what is visible.
572 template<
bool useFilter,
typename T>
573 __global__
void conv_cuda_shared_tiling_4_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
575 extern __shared__
char _sdata[];
576 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 4 block-widths per block.
578 size_t xx = blockIdx.x * blockDim.x * 4;
579 size_t yy = blockIdx.y * blockDim.y;
581 size_t x = xx + threadIdx.x;
583 size_t y = yy + threadIdx.y;
585 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
587 size_t shared_x= threadIdx.x+blockDim.x;
// Load guard: output extent plus filter halo.
590 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
592 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the quad-width shared tile.
594 size_t shared_y= threadIdx.y;
597 while(shared_y<sharedRows)
599 while(shared_x<sharedCols)
601 sharedIdx = shared_y * sharedCols + shared_x;
602 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
603 shared_x = shared_x + blockDim.x;
605 shared_x = threadIdx.x;
606 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
612 sharedIdx = threadIdx.x;
616 if(x<out_cols && y<out_rows)
// Filter-weighted path: one weight, four tile reads a block-width apart.
625 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
626 for(
size_t j=0; j<filter_rows; j++)
628 for(
size_t i=0; i<filter_cols; i++)
630 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
631 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
632 shared_x += blockDim.x;
633 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
634 shared_x += blockDim.x;
635 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
636 shared_x += blockDim.x;
637 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
// Unweighted path, same walking-index pattern.
643 for(
size_t j=0; j<filter_rows; j++)
645 for(
size_t i=0; i<filter_cols; i++)
647 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
648 sum += sdata[shared_x];
649 shared_x += blockDim.x;
650 sum2 += sdata[shared_x];
651 shared_x += blockDim.x;
652 sum3 += sdata[shared_x];
653 shared_x += blockDim.x;
654 sum4 += sdata[shared_x];
// Four normalized outputs, each a block-width apart; shared_x now walks
// the output index.
658 shared_x = y*out_pitch+x;
659 output[shared_x] = sum / (filter_rows * filter_cols);
660 shared_x += blockDim.x;
661 output[shared_x] = sum2 / (filter_rows * filter_cols);
662 shared_x += blockDim.x;
663 output[shared_x] = sum3 / (filter_rows * filter_cols);
664 shared_x += blockDim.x;
665 output[shared_x] = sum4 / (filter_rows * filter_cols);
// Fixed-tiling variant, factor 6: same structure as the factor-4 kernel
// (single shared load, walking shared_x index, normalized outputs one
// block-width apart), unrolled to six accumulators sum..sum6.
// NOTE(review): braces, sum declarations, useFilter branch tokens and the
// load/compute synchronization are dropped from this view; code below is
// byte-identical to what is visible.
675 template<
bool useFilter,
typename T>
676 __global__
void conv_cuda_shared_tiling_6_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
678 extern __shared__
char _sdata[];
679 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 6 block-widths per block.
681 size_t xx = blockIdx.x * blockDim.x * 6;
682 size_t yy = blockIdx.y * blockDim.y;
684 size_t x = xx + threadIdx.x;
686 size_t y = yy + threadIdx.y;
688 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
690 size_t shared_x= threadIdx.x+blockDim.x;
// Load guard: output extent plus filter halo.
693 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
695 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the six-wide shared tile.
697 size_t shared_y= threadIdx.y;
700 while(shared_y<sharedRows)
702 while(shared_x<sharedCols)
704 sharedIdx = shared_y * sharedCols + shared_x;
705 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
706 shared_x = shared_x + blockDim.x;
708 shared_x = threadIdx.x;
709 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
715 sharedIdx = threadIdx.x;
719 if(x<out_cols && y<out_rows)
// Filter-weighted path.
730 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
731 for(
size_t j=0; j<filter_rows; j++)
733 for(
size_t i=0; i<filter_cols; i++)
735 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
736 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
737 shared_x += blockDim.x;
738 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
739 shared_x += blockDim.x;
740 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
741 shared_x += blockDim.x;
742 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
743 shared_x += blockDim.x;
744 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
745 shared_x += blockDim.x;
746 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
// Unweighted path.
752 for(
size_t j=0; j<filter_rows; j++)
754 for(
size_t i=0; i<filter_cols; i++)
756 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
757 sum += sdata[shared_x];
758 shared_x += blockDim.x;
759 sum2 += sdata[shared_x];
760 shared_x += blockDim.x;
761 sum3 += sdata[shared_x];
762 shared_x += blockDim.x;
763 sum4 += sdata[shared_x];
764 shared_x += blockDim.x;
765 sum5 += sdata[shared_x];
766 shared_x += blockDim.x;
767 sum6 += sdata[shared_x];
// Six normalized outputs, each a block-width apart.
771 shared_x = y*out_pitch+x;
772 output[shared_x] = sum / (filter_rows * filter_cols);
773 shared_x += blockDim.x;
774 output[shared_x] = sum2 / (filter_rows * filter_cols);
775 shared_x += blockDim.x;
776 output[shared_x] = sum3 / (filter_rows * filter_cols);
777 shared_x += blockDim.x;
778 output[shared_x] = sum4 / (filter_rows * filter_cols);
779 shared_x += blockDim.x;
780 output[shared_x] = sum5 / (filter_rows * filter_cols);
781 shared_x += blockDim.x;
782 output[shared_x] = sum6 / (filter_rows * filter_cols);
// Fixed-tiling variant, factor 8: same structure as the factor-4/6 kernels,
// unrolled to eight accumulators sum..sum8.
// NOTE(review): braces, sum declarations, useFilter branch tokens and the
// load/compute synchronization are dropped from this view; code below is
// byte-identical to what is visible.
794 template<
bool useFilter,
typename T>
795 __global__
void conv_cuda_shared_tiling_8_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
797 extern __shared__
char _sdata[];
798 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 8 block-widths per block.
800 size_t xx = blockIdx.x * blockDim.x * 8;
801 size_t yy = blockIdx.y * blockDim.y;
803 size_t x = xx + threadIdx.x;
805 size_t y = yy + threadIdx.y;
807 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
809 size_t shared_x= threadIdx.x+blockDim.x;
// Load guard: output extent plus filter halo.
812 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
814 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the eight-wide shared tile.
816 size_t shared_y= threadIdx.y;
819 while(shared_y<sharedRows)
821 while(shared_x<sharedCols)
823 sharedIdx = shared_y * sharedCols + shared_x;
824 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
825 shared_x = shared_x + blockDim.x;
827 shared_x = threadIdx.x;
828 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
834 sharedIdx = threadIdx.x;
838 if(x<out_cols && y<out_rows)
// Filter-weighted path.
851 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
852 for(
size_t j=0; j<filter_rows; j++)
854 for(
size_t i=0; i<filter_cols; i++)
856 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
857 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
858 shared_x += blockDim.x;
859 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
860 shared_x += blockDim.x;
861 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
862 shared_x += blockDim.x;
863 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
864 shared_x += blockDim.x;
865 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
866 shared_x += blockDim.x;
867 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
868 shared_x += blockDim.x;
869 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
870 shared_x += blockDim.x;
871 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
// Unweighted path.
877 for(
size_t j=0; j<filter_rows; j++)
879 for(
size_t i=0; i<filter_cols; i++)
881 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
882 sum += sdata[shared_x];
883 shared_x += blockDim.x;
884 sum2 += sdata[shared_x];
885 shared_x += blockDim.x;
886 sum3 += sdata[shared_x];
887 shared_x += blockDim.x;
888 sum4 += sdata[shared_x];
889 shared_x += blockDim.x;
890 sum5 += sdata[shared_x];
891 shared_x += blockDim.x;
892 sum6 += sdata[shared_x];
893 shared_x += blockDim.x;
894 sum7 += sdata[shared_x];
895 shared_x += blockDim.x;
896 sum8 += sdata[shared_x];
// Eight normalized outputs, each a block-width apart.
900 shared_x = y*out_pitch+x;
901 output[shared_x] = sum / (filter_rows * filter_cols);
902 shared_x += blockDim.x;
903 output[shared_x] = sum2 / (filter_rows * filter_cols);
904 shared_x += blockDim.x;
905 output[shared_x] = sum3 / (filter_rows * filter_cols);
906 shared_x += blockDim.x;
907 output[shared_x] = sum4 / (filter_rows * filter_cols);
908 shared_x += blockDim.x;
909 output[shared_x] = sum5 / (filter_rows * filter_cols);
910 shared_x += blockDim.x;
911 output[shared_x] = sum6 / (filter_rows * filter_cols);
912 shared_x += blockDim.x;
913 output[shared_x] = sum7 / (filter_rows * filter_cols);
914 shared_x += blockDim.x;
915 output[shared_x] = sum8 / (filter_rows * filter_cols);
// Fixed-tiling variant, factor 10: same structure as the smaller fixed-
// tiling kernels, unrolled to ten accumulators sum..sum10.
// NOTE(review): braces, sum declarations, useFilter branch tokens and the
// load/compute synchronization are dropped from this view; code below is
// byte-identical to what is visible.
928 template<
bool useFilter,
typename T>
929 __global__
void conv_cuda_shared_tiling_10_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
931 extern __shared__
char _sdata[];
932 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 10 block-widths per block.
934 size_t xx = blockIdx.x * blockDim.x * 10;
935 size_t yy = blockIdx.y * blockDim.y;
937 size_t x = xx + threadIdx.x;
939 size_t y = yy + threadIdx.y;
941 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
943 size_t shared_x= threadIdx.x+blockDim.x;
// Load guard: output extent plus filter halo.
946 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
948 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the ten-wide shared tile.
950 size_t shared_y= threadIdx.y;
953 while(shared_y<sharedRows)
955 while(shared_x<sharedCols)
957 sharedIdx = shared_y * sharedCols + shared_x;
958 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
959 shared_x = shared_x + blockDim.x;
961 shared_x = threadIdx.x;
962 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
968 sharedIdx = threadIdx.x;
972 if(x<out_cols && y<out_rows)
// Filter-weighted path.
987 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
988 for(
size_t j=0; j<filter_rows; j++)
990 for(
size_t i=0; i<filter_cols; i++)
992 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
993 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
994 shared_x += blockDim.x;
995 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
996 shared_x += blockDim.x;
997 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
998 shared_x += blockDim.x;
999 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1000 shared_x += blockDim.x;
1001 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1002 shared_x += blockDim.x;
1003 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1004 shared_x += blockDim.x;
1005 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1006 shared_x += blockDim.x;
1007 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1008 shared_x += blockDim.x;
1009 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1010 shared_x += blockDim.x;
1011 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
// Unweighted path.
1017 for(
size_t j=0; j<filter_rows; j++)
1019 for(
size_t i=0; i<filter_cols; i++)
1021 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1022 sum += sdata[shared_x];
1023 shared_x += blockDim.x;
1024 sum2 += sdata[shared_x];
1025 shared_x += blockDim.x;
1026 sum3 += sdata[shared_x];
1027 shared_x += blockDim.x;
1028 sum4 += sdata[shared_x];
1029 shared_x += blockDim.x;
1030 sum5 += sdata[shared_x];
1031 shared_x += blockDim.x;
1032 sum6 += sdata[shared_x];
1033 shared_x += blockDim.x;
1034 sum7 += sdata[shared_x];
1035 shared_x += blockDim.x;
1036 sum8 += sdata[shared_x];
1037 shared_x += blockDim.x;
1038 sum9 += sdata[shared_x];
1039 shared_x += blockDim.x;
1040 sum10 += sdata[shared_x];
// Ten normalized outputs, each a block-width apart.
1044 shared_x = y*out_pitch+x;
1045 output[shared_x] = sum / (filter_rows * filter_cols);
1046 shared_x += blockDim.x;
1047 output[shared_x] = sum2 / (filter_rows * filter_cols);
1048 shared_x += blockDim.x;
1049 output[shared_x] = sum3 / (filter_rows * filter_cols);
1050 shared_x += blockDim.x;
1051 output[shared_x] = sum4 / (filter_rows * filter_cols);
1052 shared_x += blockDim.x;
1053 output[shared_x] = sum5 / (filter_rows * filter_cols);
1054 shared_x += blockDim.x;
1055 output[shared_x] = sum6 / (filter_rows * filter_cols);
1056 shared_x += blockDim.x;
1057 output[shared_x] = sum7 / (filter_rows * filter_cols);
1058 shared_x += blockDim.x;
1059 output[shared_x] = sum8 / (filter_rows * filter_cols);
1060 shared_x += blockDim.x;
1061 output[shared_x] = sum9 / (filter_rows * filter_cols);
1062 shared_x += blockDim.x;
1063 output[shared_x] = sum10 / (filter_rows * filter_cols);
// Fixed-tiling variant, factor 12: same structure as the smaller fixed-
// tiling kernels, unrolled to twelve accumulators sum..sum12.
// NOTE(review): braces, sum declarations, useFilter branch tokens and the
// load/compute synchronization are dropped from this view; code below is
// byte-identical to what is visible.
1075 template<
bool useFilter,
typename T>
1076 __global__
void conv_cuda_shared_tiling_12_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
1078 extern __shared__
char _sdata[];
1079 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 12 block-widths per block.
1081 size_t xx = blockIdx.x * blockDim.x * 12;
1082 size_t yy = blockIdx.y * blockDim.y;
1084 size_t x = xx + threadIdx.x;
1086 size_t y = yy + threadIdx.y;
1088 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1090 size_t shared_x= threadIdx.x+blockDim.x;
// Load guard: output extent plus filter halo.
1093 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1095 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the twelve-wide shared tile.
1097 size_t shared_y= threadIdx.y;
1100 while(shared_y<sharedRows)
1102 while(shared_x<sharedCols)
1104 sharedIdx = shared_y * sharedCols + shared_x;
1105 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1106 shared_x = shared_x + blockDim.x;
1108 shared_x = threadIdx.x;
1109 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
1115 sharedIdx = threadIdx.x;
1119 if(x<out_cols && y<out_rows)
// Filter-weighted path.
1136 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
1137 for(
size_t j=0; j<filter_rows; j++)
1139 for(
size_t i=0; i<filter_cols; i++)
1141 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1142 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1143 shared_x += blockDim.x;
1144 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1145 shared_x += blockDim.x;
1146 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1147 shared_x += blockDim.x;
1148 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1149 shared_x += blockDim.x;
1150 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1151 shared_x += blockDim.x;
1152 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1153 shared_x += blockDim.x;
1154 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1155 shared_x += blockDim.x;
1156 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1157 shared_x += blockDim.x;
1158 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1159 shared_x += blockDim.x;
1160 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1161 shared_x += blockDim.x;
1162 sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1163 shared_x += blockDim.x;
1164 sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
// Unweighted path.
1170 for(
size_t j=0; j<filter_rows; j++)
1172 for(
size_t i=0; i<filter_cols; i++)
1174 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1175 sum += sdata[shared_x];
1176 shared_x += blockDim.x;
1177 sum2 += sdata[shared_x];
1178 shared_x += blockDim.x;
1179 sum3 += sdata[shared_x];
1180 shared_x += blockDim.x;
1181 sum4 += sdata[shared_x];
1182 shared_x += blockDim.x;
1183 sum5 += sdata[shared_x];
1184 shared_x += blockDim.x;
1185 sum6 += sdata[shared_x];
1186 shared_x += blockDim.x;
1187 sum7 += sdata[shared_x];
1188 shared_x += blockDim.x;
1189 sum8 += sdata[shared_x];
1190 shared_x += blockDim.x;
1191 sum9 += sdata[shared_x];
1192 shared_x += blockDim.x;
1193 sum10 += sdata[shared_x];
1194 shared_x += blockDim.x;
1195 sum11 += sdata[shared_x];
1196 shared_x += blockDim.x;
1197 sum12 += sdata[shared_x];
// Twelve normalized outputs, each a block-width apart.
1201 shared_x = y*out_pitch+x;
1202 output[shared_x] = sum / (filter_rows * filter_cols);
1203 shared_x += blockDim.x;
1204 output[shared_x] = sum2 / (filter_rows * filter_cols);
1205 shared_x += blockDim.x;
1206 output[shared_x] = sum3 / (filter_rows * filter_cols);
1207 shared_x += blockDim.x;
1208 output[shared_x] = sum4 / (filter_rows * filter_cols);
1209 shared_x += blockDim.x;
1210 output[shared_x] = sum5 / (filter_rows * filter_cols);
1211 shared_x += blockDim.x;
1212 output[shared_x] = sum6 / (filter_rows * filter_cols);
1213 shared_x += blockDim.x;
1214 output[shared_x] = sum7 / (filter_rows * filter_cols);
1215 shared_x += blockDim.x;
1216 output[shared_x] = sum8 / (filter_rows * filter_cols);
1217 shared_x += blockDim.x;
1218 output[shared_x] = sum9 / (filter_rows * filter_cols);
1219 shared_x += blockDim.x;
1220 output[shared_x] = sum10 / (filter_rows * filter_cols);
1221 shared_x += blockDim.x;
1222 output[shared_x] = sum11 / (filter_rows * filter_cols);
1223 shared_x += blockDim.x;
1224 output[shared_x] = sum12 / (filter_rows * filter_cols);
// Fixed-tiling variant, factor 14: same structure as the smaller fixed-
// tiling kernels, unrolled to fourteen accumulators sum..sum14.
// NOTE(review): braces, sum declarations, useFilter branch tokens and the
// load/compute synchronization are dropped from this view; code below is
// byte-identical to what is visible.
1236 template<
bool useFilter,
typename T>
1237 __global__
void conv_cuda_shared_tiling_14_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
// Dynamic shared memory retyped from char[] to T*.
1239 extern __shared__
char _sdata[];
1240 T* sdata =
reinterpret_cast<T*
>(_sdata);
// Tile origin spans 14 block-widths per block.
1242 size_t xx = blockIdx.x * blockDim.x * 14;
1243 size_t yy = blockIdx.y * blockDim.y;
1245 size_t x = xx + threadIdx.x;
1247 size_t y = yy + threadIdx.y;
1249 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1251 size_t shared_x= threadIdx.x+blockDim.x;
// Load guard: output extent plus filter halo.
1254 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1256 sdata[sharedIdx]= input[y*in_pitch + x];
// Cooperative halo fill of the fourteen-wide shared tile.
1258 size_t shared_y= threadIdx.y;
1261 while(shared_y<sharedRows)
1263 while(shared_x<sharedCols)
1265 sharedIdx = shared_y * sharedCols + shared_x;
1266 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1267 shared_x = shared_x + blockDim.x;
1269 shared_x = threadIdx.x;
1270 shared_y = shared_y + blockDim.y;
// sharedIdx repurposed as this thread's x-offset into the shared tile.
1276 sharedIdx = threadIdx.x;
1280 if(x<out_cols && y<out_rows)
// Filter-weighted path.
1299 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
1300 for(
size_t j=0; j<filter_rows; j++)
1302 for(
size_t i=0; i<filter_cols; i++)
1304 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1305 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1306 shared_x += blockDim.x;
1307 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1308 shared_x += blockDim.x;
1309 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1310 shared_x += blockDim.x;
1311 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1312 shared_x += blockDim.x;
1313 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1314 shared_x += blockDim.x;
1315 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1316 shared_x += blockDim.x;
1317 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1318 shared_x += blockDim.x;
1319 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1320 shared_x += blockDim.x;
1321 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1322 shared_x += blockDim.x;
1323 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1324 shared_x += blockDim.x;
1325 sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1326 shared_x += blockDim.x;
1327 sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1328 shared_x += blockDim.x;
1329 sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1330 shared_x += blockDim.x;
1331 sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
// Unweighted path.
1337 for(
size_t j=0; j<filter_rows; j++)
1339 for(
size_t i=0; i<filter_cols; i++)
1341 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1342 sum += sdata[shared_x];
1343 shared_x += blockDim.x;
1344 sum2 += sdata[shared_x];
1345 shared_x += blockDim.x;
1346 sum3 += sdata[shared_x];
1347 shared_x += blockDim.x;
1348 sum4 += sdata[shared_x];
1349 shared_x += blockDim.x;
1350 sum5 += sdata[shared_x];
1351 shared_x += blockDim.x;
1352 sum6 += sdata[shared_x];
1353 shared_x += blockDim.x;
1354 sum7 += sdata[shared_x];
1355 shared_x += blockDim.x;
1356 sum8 += sdata[shared_x];
1357 shared_x += blockDim.x;
1358 sum9 += sdata[shared_x];
1359 shared_x += blockDim.x;
1360 sum10 += sdata[shared_x];
1361 shared_x += blockDim.x;
1362 sum11 += sdata[shared_x];
1363 shared_x += blockDim.x;
1364 sum12 += sdata[shared_x];
1365 shared_x += blockDim.x;
1366 sum13 += sdata[shared_x];
1367 shared_x += blockDim.x;
1368 sum14 += sdata[shared_x];
// Fourteen normalized outputs, each a block-width apart.
1372 shared_x = y*out_pitch+x;
1373 output[shared_x] = sum / (filter_rows * filter_cols);
1374 shared_x += blockDim.x;
1375 output[shared_x] = sum2 / (filter_rows * filter_cols);
1376 shared_x += blockDim.x;
1377 output[shared_x] = sum3 / (filter_rows * filter_cols);
1378 shared_x += blockDim.x;
1379 output[shared_x] = sum4 / (filter_rows * filter_cols);
1380 shared_x += blockDim.x;
1381 output[shared_x] = sum5 / (filter_rows * filter_cols);
1382 shared_x += blockDim.x;
1383 output[shared_x] = sum6 / (filter_rows * filter_cols);
1384 shared_x += blockDim.x;
1385 output[shared_x] = sum7 / (filter_rows * filter_cols);
1386 shared_x += blockDim.x;
1387 output[shared_x] = sum8 / (filter_rows * filter_cols);
1388 shared_x += blockDim.x;
1389 output[shared_x] = sum9 / (filter_rows * filter_cols);
1390 shared_x += blockDim.x;
1391 output[shared_x] = sum10 / (filter_rows * filter_cols);
1392 shared_x += blockDim.x;
1393 output[shared_x] = sum11 / (filter_rows * filter_cols);
1394 shared_x += blockDim.x;
1395 output[shared_x] = sum12 / (filter_rows * filter_cols);
1396 shared_x += blockDim.x;
1397 output[shared_x] = sum13 / (filter_rows * filter_cols);
1398 shared_x += blockDim.x;
1399 output[shared_x] = sum14 / (filter_rows * filter_cols);
1412 template<
bool useFilter,
typename T>
1413 __global__
void conv_cuda_shared_tiling_16_kernel(T* input, T* output,
const size_t in_cols,
const size_t out_rows,
const size_t out_cols,
const size_t filter_rows,
const size_t filter_cols,
size_t in_pitch,
size_t out_pitch,
const size_t sharedRows,
const size_t sharedCols)
1415 extern __shared__
char _sdata[];
1416 T* sdata =
reinterpret_cast<T*
>(_sdata);
1418 size_t xx = blockIdx.x * blockDim.x * 16;
1419 size_t yy = blockIdx.y * blockDim.y;
1421 size_t x = xx + threadIdx.x;
1423 size_t y = yy + threadIdx.y;
1425 size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1427 size_t shared_x= threadIdx.x+blockDim.x;
1430 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1432 sdata[sharedIdx]= input[y*in_pitch + x];
1434 size_t shared_y= threadIdx.y;
1437 while(shared_y<sharedRows)
1439 while(shared_x<sharedCols)
1441 sharedIdx = shared_y * sharedCols + shared_x;
1442 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1443 shared_x = shared_x + blockDim.x;
1445 shared_x = threadIdx.x;
1446 shared_y = shared_y + blockDim.y;
1452 sharedIdx = threadIdx.x;
1456 if(x<out_cols && y<out_rows)
1477 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
1478 for(
size_t j=0; j<filter_rows; j++)
1480 for(
size_t i=0; i<filter_cols; i++)
1482 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1483 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1484 shared_x += blockDim.x;
1485 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1486 shared_x += blockDim.x;
1487 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1488 shared_x += blockDim.x;
1489 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1490 shared_x += blockDim.x;
1491 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1492 shared_x += blockDim.x;
1493 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1494 shared_x += blockDim.x;
1495 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1496 shared_x += blockDim.x;
1497 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1498 shared_x += blockDim.x;
1499 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1500 shared_x += blockDim.x;
1501 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1502 shared_x += blockDim.x;
1503 sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1504 shared_x += blockDim.x;
1505 sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1506 shared_x += blockDim.x;
1507 sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1508 shared_x += blockDim.x;
1509 sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1510 shared_x += blockDim.x;
1511 sum15 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1512 shared_x += blockDim.x;
1513 sum16 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1519 for(
size_t j=0; j<filter_rows; j++)
1521 for(
size_t i=0; i<filter_cols; i++)
1523 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1524 sum += sdata[shared_x];
1525 shared_x += blockDim.x;
1526 sum2 += sdata[shared_x];
1527 shared_x += blockDim.x;
1528 sum3 += sdata[shared_x];
1529 shared_x += blockDim.x;
1530 sum4 += sdata[shared_x];
1531 shared_x += blockDim.x;
1532 sum5 += sdata[shared_x];
1533 shared_x += blockDim.x;
1534 sum6 += sdata[shared_x];
1535 shared_x += blockDim.x;
1536 sum7 += sdata[shared_x];
1537 shared_x += blockDim.x;
1538 sum8 += sdata[shared_x];
1539 shared_x += blockDim.x;
1540 sum9 += sdata[shared_x];
1541 shared_x += blockDim.x;
1542 sum10 += sdata[shared_x];
1543 shared_x += blockDim.x;
1544 sum11 += sdata[shared_x];
1545 shared_x += blockDim.x;
1546 sum12 += sdata[shared_x];
1547 shared_x += blockDim.x;
1548 sum13 += sdata[shared_x];
1549 shared_x += blockDim.x;
1550 sum14 += sdata[shared_x];
1551 shared_x += blockDim.x;
1552 sum15 += sdata[shared_x];
1553 shared_x += blockDim.x;
1554 sum16 += sdata[shared_x];
1558 shared_x = y*out_pitch+x;
1559 output[shared_x] = sum / (filter_rows * filter_cols);
1560 shared_x += blockDim.x;
1561 output[shared_x] = sum2 / (filter_rows * filter_cols);
1562 shared_x += blockDim.x;
1563 output[shared_x] = sum3 / (filter_rows * filter_cols);
1564 shared_x += blockDim.x;
1565 output[shared_x] = sum4 / (filter_rows * filter_cols);
1566 shared_x += blockDim.x;
1567 output[shared_x] = sum5 / (filter_rows * filter_cols);
1568 shared_x += blockDim.x;
1569 output[shared_x] = sum6 / (filter_rows * filter_cols);
1570 shared_x += blockDim.x;
1571 output[shared_x] = sum7 / (filter_rows * filter_cols);
1572 shared_x += blockDim.x;
1573 output[shared_x] = sum8 / (filter_rows * filter_cols);
1574 shared_x += blockDim.x;
1575 output[shared_x] = sum9 / (filter_rows * filter_cols);
1576 shared_x += blockDim.x;
1577 output[shared_x] = sum10 / (filter_rows * filter_cols);
1578 shared_x += blockDim.x;
1579 output[shared_x] = sum11 / (filter_rows * filter_cols);
1580 shared_x += blockDim.x;
1581 output[shared_x] = sum12 / (filter_rows * filter_cols);
1582 shared_x += blockDim.x;
1583 output[shared_x] = sum13 / (filter_rows * filter_cols);
1584 shared_x += blockDim.x;
1585 output[shared_x] = sum14 / (filter_rows * filter_cols);
1586 shared_x += blockDim.x;
1587 output[shared_x] = sum15 / (filter_rows * filter_cols);
1588 shared_x += blockDim.x;
1589 output[shared_x] = sum16 / (filter_rows * filter_cols);
__global__ void conv_cuda_shared_tiling_12_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:1076
__global__ void conv_cuda_shared_tiling_16_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:1413
__global__ void conv_cuda_shared_kernel(T *input, T *output, const size_t in_rows, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:335
__global__ void conv_cuda_shared_tiling_4_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:573
T min(T a, T b)
Definition: mapoverlap_convol_kernels.h:212
size_t calculateTiling(size_t regCountPerThread, size_t filterSizeX, size_t filterSizeY, size_t inputSizeX, bool maximizeTiling=false)
Definition: mapoverlap_convol_kernels.h:224
__global__ void conv_cuda_shared_tiling_14_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:1237
T max(T a, T b)
Definition: mapoverlap_convol_kernels.h:203
__global__ void conv_cuda_shared_tiling_8_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:795
__global__ void conv_cuda_shared_tiling_10_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:929
__global__ void conv_cuda_shared_tiling_6_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:676
// OpenCL source (as a host-side string) for user-function 2D convolution:
// stages the input tile into __local memory, barriers, then applies
// FUNCTIONNAME to the centered neighborhood. TYPE/KERNELNAME/FUNCTIONNAME are
// placeholders substituted at kernel-generation time.
// Fix: restored the statement-terminating semicolon dropped by extraction.
static std::string MatrixConvol2D_CL("__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}");
__global__ void conv_cuda_2D_kernel(OverlapFunc mapOverlapFunc, T *input, T *output, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:287
// OpenCL source (as a host-side string) for weighted 2D convolution with the
// filter in __constant memory: stages the input tile into __local memory,
// barriers, accumulates sdata*filter over the filter window, and writes the
// average. TYPE/KERNELNAME are placeholders substituted at generation time.
// Fix: restored the statement-terminating semicolon dropped by extraction.
static std::string MatrixConvolSharedFilter_CL("__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}");
__global__ void conv_cuda_shared_tiling_kernel(T *input, T *output, const size_t numTiles, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:407
__global__ void conv_cuda_shared_tiling_2_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:489
// OpenCL source (as a host-side string) for unweighted (box-average) 2D
// convolution: stages the input tile into __local memory, barriers, sums the
// filter-window neighborhood without weights, and writes the average.
// TYPE/KERNELNAME are placeholders substituted at kernel-generation time.
// Fix: restored the statement-terminating semicolon dropped by extraction.
static std::string MatrixConvolShared_CL("__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}");