5 #ifndef MAPOVERLAP_CONVOL_KERNELS_H
6 #define MAPOVERLAP_CONVOL_KERNELS_H
17 static std::string MatrixConvolSharedFilter_CL(
18 "__kernel void conv_cuda_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n"
20 " unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
21 " unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
22 " unsigned int x = get_global_id(0);\n"
23 " unsigned int y = get_global_id(1);\n"
24 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
26 " unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
27 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
28 " unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"
29 " unsigned int shared_y= get_local_id(1);\n"
30 " while(shared_y<sharedRows)\n"
32 " while(shared_x<sharedCols)\n"
34 " sharedIdx = shared_y * sharedCols + shared_x; \n"
35 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
36 " shared_x = shared_x + get_local_size(0);\n"
38 " shared_x = get_local_id(0);\n"
39 " shared_y = shared_y + get_local_size(1);\n"
42 " barrier(CLK_LOCAL_MEM_FENCE);\n"
43 " if(x<out_cols && y<out_rows)\n"
46 " for(int j=0;j<filter_rows;j++) \n"
48 " for(int i=0;i<filter_cols;i++) \n"
50 " sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"
53 " output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
62 static std::string MatrixConvolShared_CL(
63 "__kernel void conv_cuda_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n"
65 " unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
66 " unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
67 " unsigned int x = get_global_id(0);\n"
68 " unsigned int y = get_global_id(1);\n"
69 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
71 " unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
72 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
73 " unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"
74 " unsigned int shared_y= get_local_id(1);\n"
75 " while(shared_y<sharedRows)\n"
77 " while(shared_x<sharedCols)\n"
79 " sharedIdx = shared_y * sharedCols + shared_x; \n"
80 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
81 " shared_x = shared_x + get_local_size(0);\n"
83 " shared_x = get_local_id(0);\n"
84 " shared_y = shared_y + get_local_size(1);\n"
87 " barrier(CLK_LOCAL_MEM_FENCE);\n"
88 " if(x<out_cols && y<out_rows)\n"
91 " for(int j=0;j<filter_rows;j++) \n"
93 " for(int i=0;i<filter_cols;i++) \n"
95 " sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"
98 " output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
120 #define BLOCK_SIZE_X 16
121 #define BLOCK_SIZE_Y 32
123 #define NUM_REGISTERS_PER_SP 32768
124 #define SHARED_MEM_SIZE_BYTES 48000
125 #define THREADS_PER_WARP 32
126 #define WARPS_PER_SP 48
127 #define THREAD_BLOCK_PER_SP 8
129 template <
typename T>
135 template <
typename T>
141 template <
typename T>
142 int calculateTiling(
int regCountPerThread,
int filterSizeX,
int filterSizeY)
144 int numThreadsPerTB = (BLOCK_SIZE_X * BLOCK_SIZE_Y);
146 int numWarpsPerTB = (numThreadsPerTB+WARP_SIZE-1) / WARP_SIZE;
148 int maxTBPerSP = min( (WARPS_PER_SP / numWarpsPerTB), THREAD_BLOCK_PER_SP);
150 int remRegPerThreads = NUM_REGISTERS_PER_SP - (regCountPerThread * numWarpsPerTB * WARP_SIZE * maxTBPerSP);
152 if(remRegPerThreads <0)
154 std::cerr <<
"Error! Limited by Register usage, tiling cannot be more than 1\n";
157 remRegPerThreads = remRegPerThreads / (numWarpsPerTB * WARP_SIZE * maxTBPerSP);
159 int sharedMem = SHARED_MEM_SIZE_BYTES - ((BLOCK_SIZE_X + filterSizeX - 1) * (BLOCK_SIZE_Y + filterSizeY - 1) *
sizeof(T));
163 std::cerr <<
"Error! Limited by shared memory usage, tiling cannot be more than 1\n";
167 int tilingSM = sharedMem / (BLOCK_SIZE_X * BLOCK_SIZE_Y *
sizeof (T));
169 std::cerr<<
"tilingSM: "<<tilingSM<<
" , remRegPerThreads: "<<remRegPerThreads<<
"\n";
170 return min(tilingSM, remRegPerThreads);
175 __device__ __constant__
char deviceFilter[16386];
189 __global__
void conv_cuda_shared_kernel(T* input, T* output,
const int in_rows,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
191 extern __shared__
char _sdata[];
192 T* sdata =
reinterpret_cast<T*
>(_sdata);
194 unsigned int xx = blockIdx.x * blockDim.x;
195 unsigned int yy = blockIdx.y * blockDim.y;
197 unsigned int x = xx + threadIdx.x;
198 unsigned int y = yy + threadIdx.y;
200 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
202 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
204 sdata[sharedIdx]= input[y*in_pitch + x];
206 unsigned int shared_x= threadIdx.x+blockDim.x;
207 unsigned int shared_y= threadIdx.y;
210 while(shared_y<sharedRows)
212 while(shared_x<sharedCols)
214 sharedIdx = shared_y * sharedCols + shared_x;
215 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
216 shared_x = shared_x + blockDim.x;
218 shared_x = threadIdx.x;
219 shared_y = shared_y + blockDim.y;
225 if(x<out_cols && y<out_rows)
229 for(
int j=0;j<filter_rows;j++)
231 for(
int i=0;i<filter_cols;i++)
233 sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ];
236 output[y*out_pitch+x] = sum / (filter_rows * filter_cols);
257 __global__
void conv_cuda_shared_kernel_filter(T* input, T* output, T* filter,
const int in_rows,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
259 extern __shared__
char _sdata[];
260 T* sdata =
reinterpret_cast<T*
>(_sdata);
262 unsigned int xx = blockIdx.x * blockDim.x;
263 unsigned int yy = blockIdx.y * blockDim.y;
265 unsigned int x = xx + threadIdx.x;
266 unsigned int y = yy + threadIdx.y;
268 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
270 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
272 sdata[sharedIdx]= input[y*in_pitch + x];
274 unsigned int shared_x= threadIdx.x+blockDim.x;
275 unsigned int shared_y= threadIdx.y;
278 while(shared_y<sharedRows)
280 while(shared_x<sharedCols)
282 sharedIdx = shared_y * sharedCols + shared_x;
283 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
284 shared_x = shared_x + blockDim.x;
286 shared_x = threadIdx.x;
287 shared_y = shared_y + blockDim.y;
293 if(x<out_cols && y<out_rows)
298 for(
int j=0;j<filter_rows;j++)
300 for(
int i=0;i<filter_cols;i++)
302 sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ] * filter[j*filter_cols+i];
305 output[y*out_pitch+x] = sum / (filter_rows * filter_cols);
327 __global__
void conv_cuda_shared_tiling_kernel(T* input, T* output,
const int numTiles,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
329 extern __shared__
char _sdata[];
330 T* sdata =
reinterpret_cast<T*
>(_sdata);
332 unsigned int xx = blockIdx.x * blockDim.x * numTiles;
333 unsigned int yy = blockIdx.y * blockDim.y;
335 unsigned int x = xx + threadIdx.x;
336 unsigned int y = yy + threadIdx.y;
338 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
339 unsigned int shared_x= threadIdx.x+blockDim.x;
340 unsigned int shared_y= threadIdx.y;
342 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
344 sdata[sharedIdx]= input[y*in_pitch + x];
347 while(shared_y<sharedRows)
349 while(shared_x<sharedCols)
351 sharedIdx = shared_y * sharedCols + shared_x;
352 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
353 shared_x = shared_x + blockDim.x;
355 shared_x = threadIdx.x;
356 shared_y = shared_y + blockDim.y;
362 sharedIdx = threadIdx.x;
364 for(
int t=0;t<numTiles; t++)
366 if(x<out_cols && y<out_rows)
371 for(
int j=0;j<filter_rows;j++)
373 for(
int i=0;i<filter_cols;i++)
375 shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
378 output[y*out_pitch+x] = shared_x / (filter_rows * filter_cols);
380 sharedIdx += blockDim.x;
401 __global__
void conv_cuda_shared_tiling_kernel_filter(T* input, T* output, T* filter,
const int numTiles,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
403 extern __shared__
char _sdata[];
404 T* sdata =
reinterpret_cast<T*
>(_sdata);
406 unsigned int xx = blockIdx.x * blockDim.x * numTiles;
407 unsigned int yy = blockIdx.y * blockDim.y;
409 unsigned int x = xx + threadIdx.x;
410 unsigned int y = yy + threadIdx.y;
412 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
413 unsigned int shared_x= threadIdx.x+blockDim.x;
414 unsigned int shared_y= threadIdx.y;
416 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
418 sdata[sharedIdx]= input[y*in_pitch + x];
421 while(shared_y<sharedRows)
423 while(shared_x<sharedCols)
425 sharedIdx = shared_y * sharedCols + shared_x;
426 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
427 shared_x = shared_x + blockDim.x;
429 shared_x = threadIdx.x;
430 shared_y = shared_y + blockDim.y;
436 sharedIdx = threadIdx.x;
438 for(
int t=0;t<numTiles; t++)
440 if(x<out_cols && y<out_rows)
445 for(
int j=0;j<filter_rows;j++)
447 for(
int i=0;i<filter_cols;i++)
449 shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * filter[j*filter_cols+i];
452 output[y*out_pitch+x] = shared_x / (filter_rows * filter_cols);
454 sharedIdx += blockDim.x;
462 template<
bool useFilter,
typename T>
463 __global__
void conv_cuda_shared_tiling_2_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
465 extern __shared__
char _sdata[];
466 T* sdata =
reinterpret_cast<T*
>(_sdata);
468 unsigned int xx = blockIdx.x * blockDim.x * 2;
469 unsigned int yy = blockIdx.y * blockDim.y;
471 unsigned int x = xx + threadIdx.x;
473 unsigned int y = yy + threadIdx.y;
475 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
478 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
480 sdata[sharedIdx]= input[y*in_pitch + x];
482 unsigned int shared_x= threadIdx.x+blockDim.x;
483 unsigned int shared_y= threadIdx.y;
486 while(shared_y<sharedRows)
488 while(shared_x<sharedCols)
490 sharedIdx = shared_y * sharedCols + shared_x;
491 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
492 shared_x = shared_x + blockDim.x;
494 shared_x = threadIdx.x;
495 shared_y = shared_y + blockDim.y;
501 sharedIdx = threadIdx.x;
505 if(x<out_cols && y<out_rows)
512 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
513 for(
int j=0;j<filter_rows;j++)
515 for(
int i=0;i<filter_cols;i++)
517 sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * d_Filter[j*filter_cols+i];
518 sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ] * d_Filter[j*filter_cols+i];
524 for(
int j=0;j<filter_rows;j++)
526 for(
int i=0;i<filter_cols;i++)
528 sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
529 sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ];
533 output[y*out_pitch+x] = sum / (filter_rows * filter_cols);
534 output[y*out_pitch+x+blockDim.x] = sum2 / (filter_rows * filter_cols);
543 template<
bool useFilter,
typename T>
544 __global__
void conv_cuda_shared_tiling_4_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
546 extern __shared__
char _sdata[];
547 T* sdata =
reinterpret_cast<T*
>(_sdata);
549 unsigned int xx = blockIdx.x * blockDim.x * 4;
550 unsigned int yy = blockIdx.y * blockDim.y;
552 unsigned int x = xx + threadIdx.x;
554 unsigned int y = yy + threadIdx.y;
556 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
558 unsigned int shared_x= threadIdx.x+blockDim.x;
561 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
563 sdata[sharedIdx]= input[y*in_pitch + x];
565 unsigned int shared_y= threadIdx.y;
568 while(shared_y<sharedRows)
570 while(shared_x<sharedCols)
572 sharedIdx = shared_y * sharedCols + shared_x;
573 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
574 shared_x = shared_x + blockDim.x;
576 shared_x = threadIdx.x;
577 shared_y = shared_y + blockDim.y;
583 sharedIdx = threadIdx.x;
587 if(x<out_cols && y<out_rows)
596 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
597 for(
int j=0;j<filter_rows;j++)
599 for(
int i=0;i<filter_cols;i++)
601 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
602 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
603 shared_x += blockDim.x;
604 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
605 shared_x += blockDim.x;
606 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
607 shared_x += blockDim.x;
608 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
614 for(
int j=0;j<filter_rows;j++)
616 for(
int i=0;i<filter_cols;i++)
618 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
619 sum += sdata[shared_x];
620 shared_x += blockDim.x;
621 sum2 += sdata[shared_x];
622 shared_x += blockDim.x;
623 sum3 += sdata[shared_x];
624 shared_x += blockDim.x;
625 sum4 += sdata[shared_x];
629 shared_x = y*out_pitch+x;
630 output[shared_x] = sum / (filter_rows * filter_cols);
631 shared_x += blockDim.x;
632 output[shared_x] = sum2 / (filter_rows * filter_cols);
633 shared_x += blockDim.x;
634 output[shared_x] = sum3 / (filter_rows * filter_cols);
635 shared_x += blockDim.x;
636 output[shared_x] = sum4 / (filter_rows * filter_cols);
651 template<
bool useFilter,
typename T>
652 __global__
void conv_cuda_shared_tiling_6_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
654 extern __shared__
char _sdata[];
655 T* sdata =
reinterpret_cast<T*
>(_sdata);
657 unsigned int xx = blockIdx.x * blockDim.x * 6;
658 unsigned int yy = blockIdx.y * blockDim.y;
660 unsigned int x = xx + threadIdx.x;
662 unsigned int y = yy + threadIdx.y;
664 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
666 unsigned int shared_x= threadIdx.x+blockDim.x;
669 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
671 sdata[sharedIdx]= input[y*in_pitch + x];
673 unsigned int shared_y= threadIdx.y;
676 while(shared_y<sharedRows)
678 while(shared_x<sharedCols)
680 sharedIdx = shared_y * sharedCols + shared_x;
681 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
682 shared_x = shared_x + blockDim.x;
684 shared_x = threadIdx.x;
685 shared_y = shared_y + blockDim.y;
691 sharedIdx = threadIdx.x;
695 if(x<out_cols && y<out_rows)
706 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
707 for(
int j=0;j<filter_rows;j++)
709 for(
int i=0;i<filter_cols;i++)
711 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
712 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
713 shared_x += blockDim.x;
714 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
715 shared_x += blockDim.x;
716 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
717 shared_x += blockDim.x;
718 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
719 shared_x += blockDim.x;
720 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
721 shared_x += blockDim.x;
722 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
728 for(
int j=0;j<filter_rows;j++)
730 for(
int i=0;i<filter_cols;i++)
732 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
733 sum += sdata[shared_x];
734 shared_x += blockDim.x;
735 sum2 += sdata[shared_x];
736 shared_x += blockDim.x;
737 sum3 += sdata[shared_x];
738 shared_x += blockDim.x;
739 sum4 += sdata[shared_x];
740 shared_x += blockDim.x;
741 sum5 += sdata[shared_x];
742 shared_x += blockDim.x;
743 sum6 += sdata[shared_x];
747 shared_x = y*out_pitch+x;
748 output[shared_x] = sum / (filter_rows * filter_cols);
749 shared_x += blockDim.x;
750 output[shared_x] = sum2 / (filter_rows * filter_cols);
751 shared_x += blockDim.x;
752 output[shared_x] = sum3 / (filter_rows * filter_cols);
753 shared_x += blockDim.x;
754 output[shared_x] = sum4 / (filter_rows * filter_cols);
755 shared_x += blockDim.x;
756 output[shared_x] = sum5 / (filter_rows * filter_cols);
757 shared_x += blockDim.x;
758 output[shared_x] = sum6 / (filter_rows * filter_cols);
768 template<
bool useFilter,
typename T>
769 __global__
void conv_cuda_shared_tiling_8_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
771 extern __shared__
char _sdata[];
772 T* sdata =
reinterpret_cast<T*
>(_sdata);
774 unsigned int xx = blockIdx.x * blockDim.x * 8;
775 unsigned int yy = blockIdx.y * blockDim.y;
777 unsigned int x = xx + threadIdx.x;
779 unsigned int y = yy + threadIdx.y;
781 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
783 unsigned int shared_x= threadIdx.x+blockDim.x;
786 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
788 sdata[sharedIdx]= input[y*in_pitch + x];
790 unsigned int shared_y= threadIdx.y;
793 while(shared_y<sharedRows)
795 while(shared_x<sharedCols)
797 sharedIdx = shared_y * sharedCols + shared_x;
798 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
799 shared_x = shared_x + blockDim.x;
801 shared_x = threadIdx.x;
802 shared_y = shared_y + blockDim.y;
808 sharedIdx = threadIdx.x;
812 if(x<out_cols && y<out_rows)
825 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
826 for(
int j=0;j<filter_rows;j++)
828 for(
int i=0;i<filter_cols;i++)
830 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
831 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
832 shared_x += blockDim.x;
833 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
834 shared_x += blockDim.x;
835 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
836 shared_x += blockDim.x;
837 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
838 shared_x += blockDim.x;
839 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
840 shared_x += blockDim.x;
841 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
842 shared_x += blockDim.x;
843 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
844 shared_x += blockDim.x;
845 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
851 for(
int j=0;j<filter_rows;j++)
853 for(
int i=0;i<filter_cols;i++)
855 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
856 sum += sdata[shared_x];
857 shared_x += blockDim.x;
858 sum2 += sdata[shared_x];
859 shared_x += blockDim.x;
860 sum3 += sdata[shared_x];
861 shared_x += blockDim.x;
862 sum4 += sdata[shared_x];
863 shared_x += blockDim.x;
864 sum5 += sdata[shared_x];
865 shared_x += blockDim.x;
866 sum6 += sdata[shared_x];
867 shared_x += blockDim.x;
868 sum7 += sdata[shared_x];
869 shared_x += blockDim.x;
870 sum8 += sdata[shared_x];
874 shared_x = y*out_pitch+x;
875 output[shared_x] = sum / (filter_rows * filter_cols);
876 shared_x += blockDim.x;
877 output[shared_x] = sum2 / (filter_rows * filter_cols);
878 shared_x += blockDim.x;
879 output[shared_x] = sum3 / (filter_rows * filter_cols);
880 shared_x += blockDim.x;
881 output[shared_x] = sum4 / (filter_rows * filter_cols);
882 shared_x += blockDim.x;
883 output[shared_x] = sum5 / (filter_rows * filter_cols);
884 shared_x += blockDim.x;
885 output[shared_x] = sum6 / (filter_rows * filter_cols);
886 shared_x += blockDim.x;
887 output[shared_x] = sum7 / (filter_rows * filter_cols);
888 shared_x += blockDim.x;
889 output[shared_x] = sum8 / (filter_rows * filter_cols);
903 template<
bool useFilter,
typename T>
904 __global__
void conv_cuda_shared_tiling_10_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
906 extern __shared__
char _sdata[];
907 T* sdata =
reinterpret_cast<T*
>(_sdata);
909 unsigned int xx = blockIdx.x * blockDim.x * 10;
910 unsigned int yy = blockIdx.y * blockDim.y;
912 unsigned int x = xx + threadIdx.x;
914 unsigned int y = yy + threadIdx.y;
916 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
918 unsigned int shared_x= threadIdx.x+blockDim.x;
921 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
923 sdata[sharedIdx]= input[y*in_pitch + x];
925 unsigned int shared_y= threadIdx.y;
928 while(shared_y<sharedRows)
930 while(shared_x<sharedCols)
932 sharedIdx = shared_y * sharedCols + shared_x;
933 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
934 shared_x = shared_x + blockDim.x;
936 shared_x = threadIdx.x;
937 shared_y = shared_y + blockDim.y;
943 sharedIdx = threadIdx.x;
947 if(x<out_cols && y<out_rows)
962 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
963 for(
int j=0;j<filter_rows;j++)
965 for(
int i=0;i<filter_cols;i++)
967 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
968 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
969 shared_x += blockDim.x;
970 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
971 shared_x += blockDim.x;
972 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
973 shared_x += blockDim.x;
974 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
975 shared_x += blockDim.x;
976 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
977 shared_x += blockDim.x;
978 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
979 shared_x += blockDim.x;
980 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
981 shared_x += blockDim.x;
982 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
983 shared_x += blockDim.x;
984 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
985 shared_x += blockDim.x;
986 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
992 for(
int j=0;j<filter_rows;j++)
994 for(
int i=0;i<filter_cols;i++)
996 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
997 sum += sdata[shared_x];
998 shared_x += blockDim.x;
999 sum2 += sdata[shared_x];
1000 shared_x += blockDim.x;
1001 sum3 += sdata[shared_x];
1002 shared_x += blockDim.x;
1003 sum4 += sdata[shared_x];
1004 shared_x += blockDim.x;
1005 sum5 += sdata[shared_x];
1006 shared_x += blockDim.x;
1007 sum6 += sdata[shared_x];
1008 shared_x += blockDim.x;
1009 sum7 += sdata[shared_x];
1010 shared_x += blockDim.x;
1011 sum8 += sdata[shared_x];
1012 shared_x += blockDim.x;
1013 sum9 += sdata[shared_x];
1014 shared_x += blockDim.x;
1015 sum10 += sdata[shared_x];
1019 shared_x = y*out_pitch+x;
1020 output[shared_x] = sum / (filter_rows * filter_cols);
1021 shared_x += blockDim.x;
1022 output[shared_x] = sum2 / (filter_rows * filter_cols);
1023 shared_x += blockDim.x;
1024 output[shared_x] = sum3 / (filter_rows * filter_cols);
1025 shared_x += blockDim.x;
1026 output[shared_x] = sum4 / (filter_rows * filter_cols);
1027 shared_x += blockDim.x;
1028 output[shared_x] = sum5 / (filter_rows * filter_cols);
1029 shared_x += blockDim.x;
1030 output[shared_x] = sum6 / (filter_rows * filter_cols);
1031 shared_x += blockDim.x;
1032 output[shared_x] = sum7 / (filter_rows * filter_cols);
1033 shared_x += blockDim.x;
1034 output[shared_x] = sum8 / (filter_rows * filter_cols);
1035 shared_x += blockDim.x;
1036 output[shared_x] = sum9 / (filter_rows * filter_cols);
1037 shared_x += blockDim.x;
1038 output[shared_x] = sum10 / (filter_rows * filter_cols);
1045 template<
bool useFilter,
typename T>
1046 __global__
void conv_cuda_shared_tiling_12_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
1048 extern __shared__
char _sdata[];
1049 T* sdata =
reinterpret_cast<T*
>(_sdata);
1051 unsigned int xx = blockIdx.x * blockDim.x * 12;
1052 unsigned int yy = blockIdx.y * blockDim.y;
1054 unsigned int x = xx + threadIdx.x;
1056 unsigned int y = yy + threadIdx.y;
1058 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1060 unsigned int shared_x= threadIdx.x+blockDim.x;
1063 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1065 sdata[sharedIdx]= input[y*in_pitch + x];
1067 unsigned int shared_y= threadIdx.y;
1070 while(shared_y<sharedRows)
1072 while(shared_x<sharedCols)
1074 sharedIdx = shared_y * sharedCols + shared_x;
1075 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1076 shared_x = shared_x + blockDim.x;
1078 shared_x = threadIdx.x;
1079 shared_y = shared_y + blockDim.y;
1085 sharedIdx = threadIdx.x;
1089 if(x<out_cols && y<out_rows)
1106 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
1107 for(
int j=0;j<filter_rows;j++)
1109 for(
int i=0;i<filter_cols;i++)
1111 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1112 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1113 shared_x += blockDim.x;
1114 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1115 shared_x += blockDim.x;
1116 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1117 shared_x += blockDim.x;
1118 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1119 shared_x += blockDim.x;
1120 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1121 shared_x += blockDim.x;
1122 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1123 shared_x += blockDim.x;
1124 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1125 shared_x += blockDim.x;
1126 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1127 shared_x += blockDim.x;
1128 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1129 shared_x += blockDim.x;
1130 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1131 shared_x += blockDim.x;
1132 sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1133 shared_x += blockDim.x;
1134 sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1140 for(
int j=0;j<filter_rows;j++)
1142 for(
int i=0;i<filter_cols;i++)
1144 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1145 sum += sdata[shared_x];
1146 shared_x += blockDim.x;
1147 sum2 += sdata[shared_x];
1148 shared_x += blockDim.x;
1149 sum3 += sdata[shared_x];
1150 shared_x += blockDim.x;
1151 sum4 += sdata[shared_x];
1152 shared_x += blockDim.x;
1153 sum5 += sdata[shared_x];
1154 shared_x += blockDim.x;
1155 sum6 += sdata[shared_x];
1156 shared_x += blockDim.x;
1157 sum7 += sdata[shared_x];
1158 shared_x += blockDim.x;
1159 sum8 += sdata[shared_x];
1160 shared_x += blockDim.x;
1161 sum9 += sdata[shared_x];
1162 shared_x += blockDim.x;
1163 sum10 += sdata[shared_x];
1164 shared_x += blockDim.x;
1165 sum11 += sdata[shared_x];
1166 shared_x += blockDim.x;
1167 sum12 += sdata[shared_x];
1171 shared_x = y*out_pitch+x;
1172 output[shared_x] = sum / (filter_rows * filter_cols);
1173 shared_x += blockDim.x;
1174 output[shared_x] = sum2 / (filter_rows * filter_cols);
1175 shared_x += blockDim.x;
1176 output[shared_x] = sum3 / (filter_rows * filter_cols);
1177 shared_x += blockDim.x;
1178 output[shared_x] = sum4 / (filter_rows * filter_cols);
1179 shared_x += blockDim.x;
1180 output[shared_x] = sum5 / (filter_rows * filter_cols);
1181 shared_x += blockDim.x;
1182 output[shared_x] = sum6 / (filter_rows * filter_cols);
1183 shared_x += blockDim.x;
1184 output[shared_x] = sum7 / (filter_rows * filter_cols);
1185 shared_x += blockDim.x;
1186 output[shared_x] = sum8 / (filter_rows * filter_cols);
1187 shared_x += blockDim.x;
1188 output[shared_x] = sum9 / (filter_rows * filter_cols);
1189 shared_x += blockDim.x;
1190 output[shared_x] = sum10 / (filter_rows * filter_cols);
1191 shared_x += blockDim.x;
1192 output[shared_x] = sum11 / (filter_rows * filter_cols);
1193 shared_x += blockDim.x;
1194 output[shared_x] = sum12 / (filter_rows * filter_cols);
1202 template<
bool useFilter,
typename T>
1203 __global__
void conv_cuda_shared_tiling_14_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
1205 extern __shared__
char _sdata[];
1206 T* sdata =
reinterpret_cast<T*
>(_sdata);
1208 unsigned int xx = blockIdx.x * blockDim.x * 14;
1209 unsigned int yy = blockIdx.y * blockDim.y;
1211 unsigned int x = xx + threadIdx.x;
1213 unsigned int y = yy + threadIdx.y;
1215 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1217 unsigned int shared_x= threadIdx.x+blockDim.x;
1220 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1222 sdata[sharedIdx]= input[y*in_pitch + x];
1224 unsigned int shared_y= threadIdx.y;
1227 while(shared_y<sharedRows)
1229 while(shared_x<sharedCols)
1231 sharedIdx = shared_y * sharedCols + shared_x;
1232 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1233 shared_x = shared_x + blockDim.x;
1235 shared_x = threadIdx.x;
1236 shared_y = shared_y + blockDim.y;
1242 sharedIdx = threadIdx.x;
1246 if(x<out_cols && y<out_rows)
1265 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
1266 for(
int j=0;j<filter_rows;j++)
1268 for(
int i=0;i<filter_cols;i++)
1270 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1271 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1272 shared_x += blockDim.x;
1273 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1274 shared_x += blockDim.x;
1275 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1276 shared_x += blockDim.x;
1277 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1278 shared_x += blockDim.x;
1279 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1280 shared_x += blockDim.x;
1281 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1282 shared_x += blockDim.x;
1283 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1284 shared_x += blockDim.x;
1285 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1286 shared_x += blockDim.x;
1287 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1288 shared_x += blockDim.x;
1289 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1290 shared_x += blockDim.x;
1291 sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1292 shared_x += blockDim.x;
1293 sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1294 shared_x += blockDim.x;
1295 sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1296 shared_x += blockDim.x;
1297 sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1303 for(
int j=0;j<filter_rows;j++)
1305 for(
int i=0;i<filter_cols;i++)
1307 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1308 sum += sdata[shared_x];
1309 shared_x += blockDim.x;
1310 sum2 += sdata[shared_x];
1311 shared_x += blockDim.x;
1312 sum3 += sdata[shared_x];
1313 shared_x += blockDim.x;
1314 sum4 += sdata[shared_x];
1315 shared_x += blockDim.x;
1316 sum5 += sdata[shared_x];
1317 shared_x += blockDim.x;
1318 sum6 += sdata[shared_x];
1319 shared_x += blockDim.x;
1320 sum7 += sdata[shared_x];
1321 shared_x += blockDim.x;
1322 sum8 += sdata[shared_x];
1323 shared_x += blockDim.x;
1324 sum9 += sdata[shared_x];
1325 shared_x += blockDim.x;
1326 sum10 += sdata[shared_x];
1327 shared_x += blockDim.x;
1328 sum11 += sdata[shared_x];
1329 shared_x += blockDim.x;
1330 sum12 += sdata[shared_x];
1331 shared_x += blockDim.x;
1332 sum13 += sdata[shared_x];
1333 shared_x += blockDim.x;
1334 sum14 += sdata[shared_x];
1338 shared_x = y*out_pitch+x;
1339 output[shared_x] = sum / (filter_rows * filter_cols);
1340 shared_x += blockDim.x;
1341 output[shared_x] = sum2 / (filter_rows * filter_cols);
1342 shared_x += blockDim.x;
1343 output[shared_x] = sum3 / (filter_rows * filter_cols);
1344 shared_x += blockDim.x;
1345 output[shared_x] = sum4 / (filter_rows * filter_cols);
1346 shared_x += blockDim.x;
1347 output[shared_x] = sum5 / (filter_rows * filter_cols);
1348 shared_x += blockDim.x;
1349 output[shared_x] = sum6 / (filter_rows * filter_cols);
1350 shared_x += blockDim.x;
1351 output[shared_x] = sum7 / (filter_rows * filter_cols);
1352 shared_x += blockDim.x;
1353 output[shared_x] = sum8 / (filter_rows * filter_cols);
1354 shared_x += blockDim.x;
1355 output[shared_x] = sum9 / (filter_rows * filter_cols);
1356 shared_x += blockDim.x;
1357 output[shared_x] = sum10 / (filter_rows * filter_cols);
1358 shared_x += blockDim.x;
1359 output[shared_x] = sum11 / (filter_rows * filter_cols);
1360 shared_x += blockDim.x;
1361 output[shared_x] = sum12 / (filter_rows * filter_cols);
1362 shared_x += blockDim.x;
1363 output[shared_x] = sum13 / (filter_rows * filter_cols);
1364 shared_x += blockDim.x;
1365 output[shared_x] = sum14 / (filter_rows * filter_cols);
1376 template<
bool useFilter,
typename T>
1377 __global__
void conv_cuda_shared_tiling_16_kernel(T* input, T* output,
const int in_cols,
const int out_rows,
const int out_cols,
const int filter_rows,
const int filter_cols,
size_t in_pitch,
size_t out_pitch,
const int sharedRows,
const int sharedCols)
1379 extern __shared__
char _sdata[];
1380 T* sdata =
reinterpret_cast<T*
>(_sdata);
1382 unsigned int xx = blockIdx.x * blockDim.x * 16;
1383 unsigned int yy = blockIdx.y * blockDim.y;
1385 unsigned int x = xx + threadIdx.x;
1387 unsigned int y = yy + threadIdx.y;
1389 unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1391 unsigned int shared_x= threadIdx.x+blockDim.x;
1394 if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1396 sdata[sharedIdx]= input[y*in_pitch + x];
1398 unsigned int shared_y= threadIdx.y;
1401 while(shared_y<sharedRows)
1403 while(shared_x<sharedCols)
1405 sharedIdx = shared_y * sharedCols + shared_x;
1406 sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1407 shared_x = shared_x + blockDim.x;
1409 shared_x = threadIdx.x;
1410 shared_y = shared_y + blockDim.y;
1416 sharedIdx = threadIdx.x;
1420 if(x<out_cols && y<out_rows)
1441 T *d_Filter =
reinterpret_cast<T*
>(deviceFilter);
1442 for(
int j=0;j<filter_rows;j++)
1444 for(
int i=0;i<filter_cols;i++)
1446 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1447 sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1448 shared_x += blockDim.x;
1449 sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1450 shared_x += blockDim.x;
1451 sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1452 shared_x += blockDim.x;
1453 sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1454 shared_x += blockDim.x;
1455 sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1456 shared_x += blockDim.x;
1457 sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1458 shared_x += blockDim.x;
1459 sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1460 shared_x += blockDim.x;
1461 sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1462 shared_x += blockDim.x;
1463 sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1464 shared_x += blockDim.x;
1465 sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1466 shared_x += blockDim.x;
1467 sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1468 shared_x += blockDim.x;
1469 sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1470 shared_x += blockDim.x;
1471 sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1472 shared_x += blockDim.x;
1473 sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1474 shared_x += blockDim.x;
1475 sum15 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1476 shared_x += blockDim.x;
1477 sum16 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1483 for(
int j=0;j<filter_rows;j++)
1485 for(
int i=0;i<filter_cols;i++)
1487 shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1488 sum += sdata[shared_x];
1489 shared_x += blockDim.x;
1490 sum2 += sdata[shared_x];
1491 shared_x += blockDim.x;
1492 sum3 += sdata[shared_x];
1493 shared_x += blockDim.x;
1494 sum4 += sdata[shared_x];
1495 shared_x += blockDim.x;
1496 sum5 += sdata[shared_x];
1497 shared_x += blockDim.x;
1498 sum6 += sdata[shared_x];
1499 shared_x += blockDim.x;
1500 sum7 += sdata[shared_x];
1501 shared_x += blockDim.x;
1502 sum8 += sdata[shared_x];
1503 shared_x += blockDim.x;
1504 sum9 += sdata[shared_x];
1505 shared_x += blockDim.x;
1506 sum10 += sdata[shared_x];
1507 shared_x += blockDim.x;
1508 sum11 += sdata[shared_x];
1509 shared_x += blockDim.x;
1510 sum12 += sdata[shared_x];
1511 shared_x += blockDim.x;
1512 sum13 += sdata[shared_x];
1513 shared_x += blockDim.x;
1514 sum14 += sdata[shared_x];
1515 shared_x += blockDim.x;
1516 sum15 += sdata[shared_x];
1517 shared_x += blockDim.x;
1518 sum16 += sdata[shared_x];
1522 shared_x = y*out_pitch+x;
1523 output[shared_x] = sum / (filter_rows * filter_cols);
1524 shared_x += blockDim.x;
1525 output[shared_x] = sum2 / (filter_rows * filter_cols);
1526 shared_x += blockDim.x;
1527 output[shared_x] = sum3 / (filter_rows * filter_cols);
1528 shared_x += blockDim.x;
1529 output[shared_x] = sum4 / (filter_rows * filter_cols);
1530 shared_x += blockDim.x;
1531 output[shared_x] = sum5 / (filter_rows * filter_cols);
1532 shared_x += blockDim.x;
1533 output[shared_x] = sum6 / (filter_rows * filter_cols);
1534 shared_x += blockDim.x;
1535 output[shared_x] = sum7 / (filter_rows * filter_cols);
1536 shared_x += blockDim.x;
1537 output[shared_x] = sum8 / (filter_rows * filter_cols);
1538 shared_x += blockDim.x;
1539 output[shared_x] = sum9 / (filter_rows * filter_cols);
1540 shared_x += blockDim.x;
1541 output[shared_x] = sum10 / (filter_rows * filter_cols);
1542 shared_x += blockDim.x;
1543 output[shared_x] = sum11 / (filter_rows * filter_cols);
1544 shared_x += blockDim.x;
1545 output[shared_x] = sum12 / (filter_rows * filter_cols);
1546 shared_x += blockDim.x;
1547 output[shared_x] = sum13 / (filter_rows * filter_cols);
1548 shared_x += blockDim.x;
1549 output[shared_x] = sum14 / (filter_rows * filter_cols);
1550 shared_x += blockDim.x;
1551 output[shared_x] = sum15 / (filter_rows * filter_cols);
1552 shared_x += blockDim.x;
1553 output[shared_x] = sum16 / (filter_rows * filter_cols);