SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
mapoverlap_convol_kernels.h
Go to the documentation of this file.
1 
5 #ifndef MAPOVERLAP_CONVOL_KERNELS_H
6 #define MAPOVERLAP_CONVOL_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 
11 #include <string>
12 
13 namespace skepu
14 {
15 
/*!
 * OpenCL kernel source for 2D convolution with an explicit filter-weight
 * matrix. "KERNELNAME" and "TYPE" are placeholders — presumably substituted
 * by the SkePU code generator before the program is built; confirm in the
 * OpenCL backend. Each work-group cooperatively stages its tile plus the
 * (filter-1) halo of the (padded) input into local memory (sdata), issues a
 * barrier, then each work-item computes one weighted sum.
 * Note: the weighted sum is also divided by the filter element count
 * (i.e. the result is averaged) — intentional per SkePU semantics.
 */
static std::string MatrixConvolSharedFilter_CL(
"__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n"
"{\n"
" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
" size_t x = get_global_id(0);\n"
" size_t y = get_global_id(1);\n"
" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
" {\n"
" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
" sdata[sharedIdx]= input[y*in_pitch + x];\n"
" size_t shared_x= get_local_id(0)+get_local_size(0);\n"
" size_t shared_y= get_local_id(1);\n"
" while(shared_y<sharedRows)\n"
" {\n"
" while(shared_x<sharedCols)\n"
" {\n"
" sharedIdx = shared_y * sharedCols + shared_x; \n"
" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
" shared_x = shared_x + get_local_size(0);\n"
" }\n"
" shared_x = get_local_id(0);\n"
" shared_y = shared_y + get_local_size(1);\n"
" } \n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(x<out_cols && y<out_rows)\n"
" {\n"
" TYPE sum=0;\n"
" for(size_t j=0;j<filter_rows;j++) \n"
" {\n"
" for(size_t i=0;i<filter_cols;i++) \n"
" {\n"
" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"
" }\n"
" }\n"
" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
" }\n"
"}"
);
71 
72 
73 
/*!
 * OpenCL kernel source for 2D MapOverlap with a user function.
 * "KERNELNAME", "TYPE" and "FUNCTIONNAME" are placeholders — presumably
 * substituted by the SkePU code generator; confirm in the OpenCL backend.
 * After cooperatively staging the tile plus halo into local memory and
 * synchronizing, each work-item calls the user function with a pointer to
 * the centre element of its neighbourhood window (offset filter/2 in each
 * dimension) and the row stride of the shared buffer.
 */
static std::string MatrixConvol2D_CL(
"__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n"
"{\n"
" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
" size_t x = get_global_id(0);\n"
" size_t y = get_global_id(1);\n"
" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
" {\n"
" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
" sdata[sharedIdx]= input[y*in_pitch + x];\n"
" size_t shared_x= get_local_id(0)+get_local_size(0);\n"
" size_t shared_y= get_local_id(1);\n"
" while(shared_y<sharedRows)\n"
" {\n"
" while(shared_x<sharedCols)\n"
" {\n"
" sharedIdx = shared_y * sharedCols + shared_x; \n"
" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
" shared_x = shared_x + get_local_size(0);\n"
" }\n"
" shared_x = get_local_id(0);\n"
" shared_y = shared_y + get_local_size(1);\n"
" } \n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(x<out_cols && y<out_rows)\n"
" {\n"
" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"
" }\n"
"}"
);
109 
110 
/*!
 * OpenCL kernel source for 2D convolution WITHOUT filter weights: each
 * output element is the plain average of its filter_rows x filter_cols
 * neighbourhood. "KERNELNAME" and "TYPE" are placeholders — presumably
 * substituted by the SkePU code generator; confirm in the OpenCL backend.
 * Same cooperative tile+halo staging into local memory as the
 * filter-weight variant above.
 */
static std::string MatrixConvolShared_CL(
"__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n"
"{\n"
" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
" size_t x = get_global_id(0);\n"
" size_t y = get_global_id(1);\n"
" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
" {\n"
" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
" sdata[sharedIdx]= input[y*in_pitch + x];\n"
" size_t shared_x= get_local_id(0)+get_local_size(0);\n"
" size_t shared_y= get_local_id(1);\n"
" while(shared_y<sharedRows)\n"
" {\n"
" while(shared_x<sharedCols)\n"
" {\n"
" sharedIdx = shared_y * sharedCols + shared_x; \n"
" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
" shared_x = shared_x + get_local_size(0);\n"
" }\n"
" shared_x = get_local_id(0);\n"
" shared_y = shared_y + get_local_size(1);\n"
" } \n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(x<out_cols && y<out_rows)\n"
" {\n"
" TYPE sum=0;\n"
" for(size_t j=0;j<filter_rows;j++) \n"
" {\n"
" for(size_t i=0;i<filter_cols;i++) \n"
" {\n"
" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"
" }\n"
" }\n"
" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
" }\n"
"}"
);
154 
155 
160 }
161 
162 
163 #endif
164 
165 //#################
166 //-----------------
167 //#################
168 
169 
170 #ifdef SKEPU_CUDA
171 
172 
173 namespace skepu
174 {
175 
176 
// Tuning constants for the CUDA convolution kernels and the tiling
// heuristic (calculateTiling) below.
// NOTE(review): the per-SM limits (48 warps, 32768 registers, ~48 KB shared
// memory) describe one specific GPU generation — confirm against the target
// hardware before retuning.
#define BLOCK_SIZE_X 16          // thread-block width used by the launchers
#define BLOCK_SIZE_Y 32          // thread-block height
#define WARP_SIZE 32             // threads per warp
#define NUM_REGISTERS_PER_SP 32768   // 32-bit registers per multiprocessor
#define SHARED_MEM_SIZE_BYTES 48000  // usable shared memory per multiprocessor
#define THREADS_PER_WARP 32
#define WARPS_PER_SP 48              // max resident warps per multiprocessor
#define THREAD_BLOCK_PER_SP 8        // max resident blocks per multiprocessor
197 
198 
/*!
 * Returns the larger of \p a and \p b.
 * Local helper (std::max is not used so the same code can serve device-side
 * callers in this translation unit).
 */
template <typename T>
T max(T a, T b)
{
    if (a > b)
        return a;
    return b;
}
207 
/*!
 * Returns the smaller of \p a and \p b.
 * Local helper (std::min is not used so the same code can serve device-side
 * callers in this translation unit).
 */
template <typename T>
T min(T a, T b)
{
    if (a < b)
        return a;
    return b;
}
216 
217 
218 
219 
223 template <typename T>
224 size_t calculateTiling(size_t regCountPerThread, size_t filterSizeX, size_t filterSizeY, size_t inputSizeX, bool maximizeTiling=false)
225 {
226  size_t numThreadsPerTB = (BLOCK_SIZE_X * BLOCK_SIZE_Y);
227 
228  size_t numWarpsPerTB = (numThreadsPerTB+WARP_SIZE-1) / WARP_SIZE;
229 
230  size_t maxTBPerSP = min( (WARPS_PER_SP / numWarpsPerTB), (size_t)THREAD_BLOCK_PER_SP);
231 
232  if(maximizeTiling)
233  maxTBPerSP = 1;
234  else
235  maxTBPerSP = 2; // limit to 2, not full occupancy
236 
237 
238  long long remRegPerThreads = NUM_REGISTERS_PER_SP - (regCountPerThread * numWarpsPerTB * WARP_SIZE * maxTBPerSP); // * maxTBPerSP
239 
240  if(remRegPerThreads <0)
241  {
242  std::cerr << "Error! Limited by Register usage, tiling cannot be more than 1\n";
243  return 1;
244  }
245 
246  remRegPerThreads = remRegPerThreads / (numWarpsPerTB * WARP_SIZE * maxTBPerSP); //maxTBPerSP); // tiling cannot be more than this
247 
248  long long sharedMem = SHARED_MEM_SIZE_BYTES - ((BLOCK_SIZE_X + filterSizeX - 1) * (BLOCK_SIZE_Y + filterSizeY - 1) * sizeof(T) * maxTBPerSP); // * sizeof(T) * maxTBPerSP);
249 
250  if(sharedMem < 0)
251  {
252  std::cerr << "Error! Limited by shared memory usage, tiling cannot be more than 1\n";
253  return 1;
254  }
255 
256  size_t tilingSM = min( (size_t)(inputSizeX/BLOCK_SIZE_X), (size_t)(sharedMem / (BLOCK_SIZE_X * (BLOCK_SIZE_Y + filterSizeY - 1) * sizeof (T) * maxTBPerSP)) ); // * maxTBPerSP);
257 
258  tilingSM = min(tilingSM, (size_t)remRegPerThreads); // assuming a tile increase register count by one.
259 
260  inputSizeX = inputSizeX / BLOCK_SIZE_X;
261  if(tilingSM>1)
262  {
263  while( (inputSizeX%tilingSM) != 0)
264  {
265  tilingSM--;
266  }
267  }
268  else
269  tilingSM = 1;
270 
271  return tilingSM;
272 }
273 
274 
275 
// Constant-memory staging buffer for the user filter weights; the tiled
// kernels reinterpret it as T* when useFilter is set.
// NOTE(review): 16386 bytes looks like a typo for 16384 (16 KB) — confirm
// against the host-side cudaMemcpyToSymbol call before changing.
__device__ __constant__ char deviceFilter[16386];
278 
279 
280 
281 
/*!
 * CUDA 2D MapOverlap kernel (user-function variant). Each block stages its
 * tile of the input plus the (filter-1) halo into dynamic shared memory,
 * synchronizes, then each thread applies \p mapOverlapFunc.CU to a pointer
 * at the centre of its neighbourhood window.
 * NOTE(review): the staging loop reads input up to
 * (out_rows+filter_rows-1) x (out_cols+filter_cols-1); assumes the input is
 * padded to at least that size — confirm in the host-side launcher.
 */
template<typename T, typename OverlapFunc>
__global__ void conv_cuda_2D_kernel(OverlapFunc mapOverlapFunc, T* input, T* output, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x; // block origin in the (padded) input
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    if( x<(out_cols+(filter_cols-1)) && y<(out_rows+(filter_rows-1)) )
    {
        size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;

        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_x= threadIdx.x+blockDim.x;
        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        // (threads sweep right/down in blockDim-sized steps to cover the halo)
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }
    // Whole tile (incl. halo) must be staged before any thread reads it.
    __syncthreads();

    if(x<out_cols && y<out_rows)
    {
        // Pointer to the centre element (offset filter/2 in each dimension)
        // of this thread's neighbourhood window in shared memory.
        output[y*out_pitch+x] = mapOverlapFunc.CU(&(sdata[(threadIdx.y+(filter_rows/2)) * sharedCols + (threadIdx.x+(filter_cols/2))]));
    }
}
327 
328 
329 
/*!
 * CUDA 2D convolution kernel, one output element per thread.
 * When \p useFilter is true the weights are read from the constant-memory
 * deviceFilter buffer; otherwise an unweighted sum is taken. In both cases
 * the result is divided by the filter element count (averaged).
 * Tile + halo staging is identical to conv_cuda_2D_kernel above.
 */
template<bool useFilter, typename T>
__global__ void conv_cuda_shared_kernel(T* input, T* output, const size_t in_rows, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x;
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
    {
        size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;

        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_x= threadIdx.x+blockDim.x;
        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }
    // All staging must finish before any thread reads its window.
    __syncthreads();

    if(x<out_cols && y<out_rows)
    {
        T sum=0;

        if(useFilter) // compile-time branch: dead half is eliminated
        {
            T *d_Filter = reinterpret_cast<T*>(deviceFilter);
            for(size_t j=0; j<filter_rows; j++)
            {
                for(size_t i=0; i<filter_cols; i++)
                {
                    sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ] * d_Filter[j*filter_cols+i];
                }
            }
        }
        else
        {
            for(size_t j=0; j<filter_rows; j++)
            {
                for(size_t i=0; i<filter_cols; i++)
                {
                    sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ];
                }
            }
        }
        output[y*out_pitch+x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
    }
}
398 
399 
400 
406 template<bool useFilter, typename T>
407 __global__ void conv_cuda_shared_tiling_kernel(T* input, T* output, const size_t numTiles, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
408 {
409  extern __shared__ char _sdata[];
410  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)
411 
412  size_t xx = blockIdx.x * blockDim.x * numTiles;
413  size_t yy = blockIdx.y * blockDim.y;
414 
415  size_t x = xx + threadIdx.x;
416  size_t y = yy + threadIdx.y;
417 
418  size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
419  size_t shared_x= threadIdx.x+blockDim.x;
420  size_t shared_y= threadIdx.y;
421 
422  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
423  {
424  sdata[sharedIdx]= input[y*in_pitch + x];
425 
426  // To load data in shared memory including neighbouring elements...
427  while(shared_y<sharedRows)
428  {
429  while(shared_x<sharedCols)
430  {
431  sharedIdx = shared_y * sharedCols + shared_x;
432  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
433  shared_x = shared_x + blockDim.x;
434  }
435  shared_x = threadIdx.x;
436  shared_y = shared_y + blockDim.y;
437  }
438  }
439 
440  __syncthreads();
441 
442  sharedIdx = threadIdx.x;
443 
444  for(size_t t=0; t<numTiles; t++)
445  {
446  if(x<out_cols && y<out_rows)
447  {
448 // T sum=0;
449  shared_x = 0;
450 
451  if(useFilter)
452  {
453  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
454  for(size_t j=0; j<filter_rows; j++) // 7
455  {
456  for(size_t i=0; i<filter_cols; i++) // 7
457  {
458  shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * d_Filter[j*filter_cols+i];
459  }
460  }
461  }
462  else
463  {
464  for(size_t j=0; j<filter_rows; j++) // 7
465  {
466  for(size_t i=0; i<filter_cols; i++) // 7
467  {
468  shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
469  }
470  }
471  }
472  output[y*out_pitch+x] = shared_x / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
473  x += blockDim.x;
474  sharedIdx += blockDim.x;
475  }
476  }
477 }
478 
479 
480 
481 
482 
/*!
 * Tiling-factor-2 variant of the tiled convolution kernel: each thread
 * computes two output elements (at x and x+blockDim.x), unrolled into two
 * accumulators instead of the generic per-tile loop.
 * NOTE(review): the second store (x+blockDim.x) is not separately bounds-
 * checked against out_cols; presumably the launcher/calculateTiling
 * guarantees the width divides evenly by the tiled block span — confirm.
 */
template<bool useFilter, typename T>
__global__ void conv_cuda_shared_tiling_2_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x * 2; // block covers 2 tiles in x
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;


    if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
    {
        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_x= threadIdx.x+blockDim.x;
        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }

    __syncthreads();

    // Reuse sharedIdx as this thread's x-offset into the shared tile.
    sharedIdx = threadIdx.x;

// for(size_t t=0;t<numTiles; t++)
    {
        if(x<out_cols && y<out_rows)
        {
            T sum=0;
            T sum2=0;

            if(useFilter) // compile-time branch
            {
                T *d_Filter = reinterpret_cast<T*>(deviceFilter);
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * d_Filter[j*filter_cols+i];
                        sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ] * d_Filter[j*filter_cols+i];
                    }
                }
            }
            else
            {
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
                        sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ];
                    }
                }
            }
            output[y*out_pitch+x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            output[y*out_pitch+x+blockDim.x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
// x += blockDim.x;
// sharedIdx += blockDim.x;
        }
    }
}
565 
566 
/*!
 * Tiling-factor-4 variant: each thread computes four output elements
 * (x, x+blockDim.x, ...), fully unrolled into four accumulators. The
 * index variable shared_x is recycled as a running shared-memory index in
 * the compute phase and as the running output index in the store phase.
 * NOTE(review): stores beyond the first are not re-checked against
 * out_cols — presumably the launcher guarantees even division; confirm.
 */
template<bool useFilter, typename T>
__global__ void conv_cuda_shared_tiling_4_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x * 4; // block covers 4 tiles in x
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
// size_t x_in = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;

    size_t shared_x= threadIdx.x+blockDim.x;


    if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
    {
        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }

    __syncthreads();

    // Reuse sharedIdx as this thread's x-offset into the shared tile.
    sharedIdx = threadIdx.x;

// for(size_t t=0;t<numTiles; t++)
    {
        if(x<out_cols && y<out_rows)
        {
            T sum=0;
            T sum2=0;
            T sum3=0;
            T sum4=0;

            if(useFilter) // compile-time branch
            {
                T *d_Filter = reinterpret_cast<T*>(deviceFilter);
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        // shared_x here is only an INDEX into sdata (one
                        // window per tile, blockDim.x apart).
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                    }
                }
            }
            else
            {
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x];
                    }
                }
            }
            // shared_x now reused as running output index, blockDim.x apart.
            shared_x = y*out_pitch+x;
            output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum3 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum4 / (filter_rows * filter_cols);
        }
    }
}
669 
/*!
 * Tiling-factor-6 variant: same structure as the factor-4 kernel, unrolled
 * into six accumulators (one output element per tile, blockDim.x apart).
 * NOTE(review): stores beyond the first are not re-checked against
 * out_cols — presumably the launcher guarantees even division; confirm.
 */
template<bool useFilter, typename T>
__global__ void conv_cuda_shared_tiling_6_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x * 6; // block covers 6 tiles in x
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
// size_t x_in = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;

    size_t shared_x= threadIdx.x+blockDim.x;


    if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
    {
        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }

    __syncthreads();

    // Reuse sharedIdx as this thread's x-offset into the shared tile.
    sharedIdx = threadIdx.x;

// for(size_t t=0;t<numTiles; t++)
    {
        if(x<out_cols && y<out_rows)
        {
            T sum=0;
            T sum2=0;
            T sum3=0;
            T sum4=0;
            T sum5=0;
            T sum6=0;

            if(useFilter) // compile-time branch
            {
                T *d_Filter = reinterpret_cast<T*>(deviceFilter);
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        // shared_x is only an INDEX here (windows blockDim.x apart).
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                    }
                }
            }
            else
            {
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum5 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum6 += sdata[shared_x];
                    }
                }
            }
            // shared_x reused as running output index, blockDim.x apart.
            shared_x = y*out_pitch+x;
            output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum3 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum4 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum5 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum6 / (filter_rows * filter_cols);
        }
    }
}
786 
787 
788 
/*!
 * Tiling-factor-8 variant: same structure as the factor-4/6 kernels,
 * unrolled into eight accumulators (one output element per tile).
 * NOTE(review): stores beyond the first are not re-checked against
 * out_cols — presumably the launcher guarantees even division; confirm.
 */
template<bool useFilter, typename T>
__global__ void conv_cuda_shared_tiling_8_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x * 8; // block covers 8 tiles in x
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
// size_t x_in = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;

    size_t shared_x= threadIdx.x+blockDim.x;


    if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
    {
        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }

    __syncthreads();

    // Reuse sharedIdx as this thread's x-offset into the shared tile.
    sharedIdx = threadIdx.x;

// for(size_t t=0;t<numTiles; t++)
    {
        if(x<out_cols && y<out_rows)
        {
            T sum=0;
            T sum2=0;
            T sum3=0;
            T sum4=0;
            T sum5=0;
            T sum6=0;
            T sum7=0;
            T sum8=0;

            if(useFilter) // compile-time branch
            {
                T *d_Filter = reinterpret_cast<T*>(deviceFilter);
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        // shared_x is only an INDEX here (windows blockDim.x apart).
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                    }
                }
            }
            else
            {
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum5 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum6 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum7 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum8 += sdata[shared_x];
                    }
                }
            }
            // shared_x reused as running output index, blockDim.x apart.
            shared_x = y*out_pitch+x;
            output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum3 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum4 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum5 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum6 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum7 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum8 / (filter_rows * filter_cols);
        }
    }
}
919 
920 
921 
922 
/*!
 * Tiling-factor-10 variant: same structure as the other unrolled tiling
 * kernels, with ten accumulators (one output element per tile).
 * NOTE(review): stores beyond the first are not re-checked against
 * out_cols — presumably the launcher guarantees even division; confirm.
 */
template<bool useFilter, typename T>
__global__ void conv_cuda_shared_tiling_10_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
{
    extern __shared__ char _sdata[];
    T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)

    size_t xx = blockIdx.x * blockDim.x * 10; // block covers 10 tiles in x
    size_t yy = blockIdx.y * blockDim.y;

    size_t x = xx + threadIdx.x;
// size_t x_in = xx + threadIdx.x;
    size_t y = yy + threadIdx.y;

    size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;

    size_t shared_x= threadIdx.x+blockDim.x;


    if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
    {
        sdata[sharedIdx]= input[y*in_pitch + x];

        size_t shared_y= threadIdx.y;

        // To load data in shared memory including neighbouring elements...
        while(shared_y<sharedRows)
        {
            while(shared_x<sharedCols)
            {
                sharedIdx = shared_y * sharedCols + shared_x;
                sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
                shared_x = shared_x + blockDim.x;
            }
            shared_x = threadIdx.x;
            shared_y = shared_y + blockDim.y;
        }
    }

    __syncthreads();

    // Reuse sharedIdx as this thread's x-offset into the shared tile.
    sharedIdx = threadIdx.x;

// for(size_t t=0;t<numTiles; t++)
    {
        if(x<out_cols && y<out_rows)
        {
            T sum=0;
            T sum2=0;
            T sum3=0;
            T sum4=0;
            T sum5=0;
            T sum6=0;
            T sum7=0;
            T sum8=0;
            T sum9=0;
            T sum10=0;

            if(useFilter) // compile-time branch
            {
                T *d_Filter = reinterpret_cast<T*>(deviceFilter);
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        // shared_x is only an INDEX here (windows blockDim.x apart).
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                        shared_x += blockDim.x;
                        sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
                    }
                }
            }
            else
            {
                for(size_t j=0; j<filter_rows; j++)
                {
                    for(size_t i=0; i<filter_cols; i++)
                    {
                        shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
                        sum += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum2 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum3 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum4 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum5 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum6 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum7 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum8 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum9 += sdata[shared_x];
                        shared_x += blockDim.x;
                        sum10 += sdata[shared_x];
                    }
                }
            }
            // shared_x reused as running output index, blockDim.x apart.
            shared_x = y*out_pitch+x;
            output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
            shared_x += blockDim.x;
            output[shared_x] = sum3 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum4 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum5 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum6 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum7 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum8 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum9 / (filter_rows * filter_cols);
            shared_x += blockDim.x;
            output[shared_x] = sum10 / (filter_rows * filter_cols);
        }
    }
}
1067 
1068 
1069 
1075 template<bool useFilter, typename T>
1076 __global__ void conv_cuda_shared_tiling_12_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
1077 {
1078  extern __shared__ char _sdata[];
1079  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)
1080 
1081  size_t xx = blockIdx.x * blockDim.x * 12;
1082  size_t yy = blockIdx.y * blockDim.y;
1083 
1084  size_t x = xx + threadIdx.x;
1085 // size_t x_in = xx + threadIdx.x;
1086  size_t y = yy + threadIdx.y;
1087 
1088  size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1089 
1090  size_t shared_x= threadIdx.x+blockDim.x;
1091 
1092 
1093  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1094  {
1095  sdata[sharedIdx]= input[y*in_pitch + x];
1096 
1097  size_t shared_y= threadIdx.y;
1098 
1099  // To load data in shared memory including neighbouring elements...
1100  while(shared_y<sharedRows)
1101  {
1102  while(shared_x<sharedCols)
1103  {
1104  sharedIdx = shared_y * sharedCols + shared_x;
1105  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1106  shared_x = shared_x + blockDim.x;
1107  }
1108  shared_x = threadIdx.x;
1109  shared_y = shared_y + blockDim.y;
1110  }
1111  }
1112 
1113  __syncthreads();
1114 
1115  sharedIdx = threadIdx.x;
1116 
1117 // for(size_t t=0;t<numTiles; t++)
1118  {
1119  if(x<out_cols && y<out_rows)
1120  {
1121  T sum=0;
1122  T sum2=0;
1123  T sum3=0;
1124  T sum4=0;
1125  T sum5=0;
1126  T sum6=0;
1127  T sum7=0;
1128  T sum8=0;
1129  T sum9=0;
1130  T sum10=0;
1131  T sum11=0;
1132  T sum12=0;
1133 
1134  if(useFilter)
1135  {
1136  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
1137  for(size_t j=0; j<filter_rows; j++) // 7
1138  {
1139  for(size_t i=0; i<filter_cols; i++) // 7
1140  {
1141  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1142  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1143  shared_x += blockDim.x;
1144  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1145  shared_x += blockDim.x;
1146  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1147  shared_x += blockDim.x;
1148  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1149  shared_x += blockDim.x;
1150  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1151  shared_x += blockDim.x;
1152  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1153  shared_x += blockDim.x;
1154  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1155  shared_x += blockDim.x;
1156  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1157  shared_x += blockDim.x;
1158  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1159  shared_x += blockDim.x;
1160  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1161  shared_x += blockDim.x;
1162  sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1163  shared_x += blockDim.x;
1164  sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1165  }
1166  }
1167  }
1168  else
1169  {
1170  for(size_t j=0; j<filter_rows; j++) // 7
1171  {
1172  for(size_t i=0; i<filter_cols; i++) // 7
1173  {
1174  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1175  sum += sdata[shared_x];
1176  shared_x += blockDim.x;
1177  sum2 += sdata[shared_x];
1178  shared_x += blockDim.x;
1179  sum3 += sdata[shared_x];
1180  shared_x += blockDim.x;
1181  sum4 += sdata[shared_x];
1182  shared_x += blockDim.x;
1183  sum5 += sdata[shared_x];
1184  shared_x += blockDim.x;
1185  sum6 += sdata[shared_x];
1186  shared_x += blockDim.x;
1187  sum7 += sdata[shared_x];
1188  shared_x += blockDim.x;
1189  sum8 += sdata[shared_x];
1190  shared_x += blockDim.x;
1191  sum9 += sdata[shared_x];
1192  shared_x += blockDim.x;
1193  sum10 += sdata[shared_x];
1194  shared_x += blockDim.x;
1195  sum11 += sdata[shared_x];
1196  shared_x += blockDim.x;
1197  sum12 += sdata[shared_x];
1198  }
1199  }
1200  }
1201  shared_x = y*out_pitch+x;
1202  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1203  shared_x += blockDim.x;
1204  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1205  shared_x += blockDim.x;
1206  output[shared_x] = sum3 / (filter_rows * filter_cols);
1207  shared_x += blockDim.x;
1208  output[shared_x] = sum4 / (filter_rows * filter_cols);
1209  shared_x += blockDim.x;
1210  output[shared_x] = sum5 / (filter_rows * filter_cols);
1211  shared_x += blockDim.x;
1212  output[shared_x] = sum6 / (filter_rows * filter_cols);
1213  shared_x += blockDim.x;
1214  output[shared_x] = sum7 / (filter_rows * filter_cols);
1215  shared_x += blockDim.x;
1216  output[shared_x] = sum8 / (filter_rows * filter_cols);
1217  shared_x += blockDim.x;
1218  output[shared_x] = sum9 / (filter_rows * filter_cols);
1219  shared_x += blockDim.x;
1220  output[shared_x] = sum10 / (filter_rows * filter_cols);
1221  shared_x += blockDim.x;
1222  output[shared_x] = sum11 / (filter_rows * filter_cols);
1223  shared_x += blockDim.x;
1224  output[shared_x] = sum12 / (filter_rows * filter_cols);
1225  }
1226  }
1227 }
1228 
1229 
1230 
1236 template<bool useFilter, typename T>
1237 __global__ void conv_cuda_shared_tiling_14_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
1238 {
1239  extern __shared__ char _sdata[];
1240  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)
1241 
1242  size_t xx = blockIdx.x * blockDim.x * 14;
1243  size_t yy = blockIdx.y * blockDim.y;
1244 
1245  size_t x = xx + threadIdx.x;
1246 // size_t x_in = xx + threadIdx.x;
1247  size_t y = yy + threadIdx.y;
1248 
1249  size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1250 
1251  size_t shared_x= threadIdx.x+blockDim.x;
1252 
1253 
1254  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1255  {
1256  sdata[sharedIdx]= input[y*in_pitch + x];
1257 
1258  size_t shared_y= threadIdx.y;
1259 
1260  // To load data in shared memory including neighbouring elements...
1261  while(shared_y<sharedRows)
1262  {
1263  while(shared_x<sharedCols)
1264  {
1265  sharedIdx = shared_y * sharedCols + shared_x;
1266  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1267  shared_x = shared_x + blockDim.x;
1268  }
1269  shared_x = threadIdx.x;
1270  shared_y = shared_y + blockDim.y;
1271  }
1272  }
1273 
1274  __syncthreads();
1275 
1276  sharedIdx = threadIdx.x;
1277 
1278 // for(size_t t=0;t<numTiles; t++)
1279  {
1280  if(x<out_cols && y<out_rows)
1281  {
1282  T sum=0;
1283  T sum2=0;
1284  T sum3=0;
1285  T sum4=0;
1286  T sum5=0;
1287  T sum6=0;
1288  T sum7=0;
1289  T sum8=0;
1290  T sum9=0;
1291  T sum10=0;
1292  T sum11=0;
1293  T sum12=0;
1294  T sum13=0;
1295  T sum14=0;
1296 
1297  if(useFilter)
1298  {
1299  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
1300  for(size_t j=0; j<filter_rows; j++) // 7
1301  {
1302  for(size_t i=0; i<filter_cols; i++) // 7
1303  {
1304  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1305  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1306  shared_x += blockDim.x;
1307  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1308  shared_x += blockDim.x;
1309  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1310  shared_x += blockDim.x;
1311  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1312  shared_x += blockDim.x;
1313  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1314  shared_x += blockDim.x;
1315  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1316  shared_x += blockDim.x;
1317  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1318  shared_x += blockDim.x;
1319  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1320  shared_x += blockDim.x;
1321  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1322  shared_x += blockDim.x;
1323  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1324  shared_x += blockDim.x;
1325  sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1326  shared_x += blockDim.x;
1327  sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1328  shared_x += blockDim.x;
1329  sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1330  shared_x += blockDim.x;
1331  sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1332  }
1333  }
1334  }
1335  else
1336  {
1337  for(size_t j=0; j<filter_rows; j++) // 7
1338  {
1339  for(size_t i=0; i<filter_cols; i++) // 7
1340  {
1341  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1342  sum += sdata[shared_x];
1343  shared_x += blockDim.x;
1344  sum2 += sdata[shared_x];
1345  shared_x += blockDim.x;
1346  sum3 += sdata[shared_x];
1347  shared_x += blockDim.x;
1348  sum4 += sdata[shared_x];
1349  shared_x += blockDim.x;
1350  sum5 += sdata[shared_x];
1351  shared_x += blockDim.x;
1352  sum6 += sdata[shared_x];
1353  shared_x += blockDim.x;
1354  sum7 += sdata[shared_x];
1355  shared_x += blockDim.x;
1356  sum8 += sdata[shared_x];
1357  shared_x += blockDim.x;
1358  sum9 += sdata[shared_x];
1359  shared_x += blockDim.x;
1360  sum10 += sdata[shared_x];
1361  shared_x += blockDim.x;
1362  sum11 += sdata[shared_x];
1363  shared_x += blockDim.x;
1364  sum12 += sdata[shared_x];
1365  shared_x += blockDim.x;
1366  sum13 += sdata[shared_x];
1367  shared_x += blockDim.x;
1368  sum14 += sdata[shared_x];
1369  }
1370  }
1371  }
1372  shared_x = y*out_pitch+x;
1373  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1374  shared_x += blockDim.x;
1375  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1376  shared_x += blockDim.x;
1377  output[shared_x] = sum3 / (filter_rows * filter_cols);
1378  shared_x += blockDim.x;
1379  output[shared_x] = sum4 / (filter_rows * filter_cols);
1380  shared_x += blockDim.x;
1381  output[shared_x] = sum5 / (filter_rows * filter_cols);
1382  shared_x += blockDim.x;
1383  output[shared_x] = sum6 / (filter_rows * filter_cols);
1384  shared_x += blockDim.x;
1385  output[shared_x] = sum7 / (filter_rows * filter_cols);
1386  shared_x += blockDim.x;
1387  output[shared_x] = sum8 / (filter_rows * filter_cols);
1388  shared_x += blockDim.x;
1389  output[shared_x] = sum9 / (filter_rows * filter_cols);
1390  shared_x += blockDim.x;
1391  output[shared_x] = sum10 / (filter_rows * filter_cols);
1392  shared_x += blockDim.x;
1393  output[shared_x] = sum11 / (filter_rows * filter_cols);
1394  shared_x += blockDim.x;
1395  output[shared_x] = sum12 / (filter_rows * filter_cols);
1396  shared_x += blockDim.x;
1397  output[shared_x] = sum13 / (filter_rows * filter_cols);
1398  shared_x += blockDim.x;
1399  output[shared_x] = sum14 / (filter_rows * filter_cols);
1400  }
1401  }
1402 }
1403 
1404 
1405 
1406 
1412 template<bool useFilter, typename T>
1413 __global__ void conv_cuda_shared_tiling_16_kernel(T* input, T* output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
1414 {
1415  extern __shared__ char _sdata[];
1416  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain extra (overlap data)
1417 
1418  size_t xx = blockIdx.x * blockDim.x * 16;
1419  size_t yy = blockIdx.y * blockDim.y;
1420 
1421  size_t x = xx + threadIdx.x;
1422 // size_t x_in = xx + threadIdx.x;
1423  size_t y = yy + threadIdx.y;
1424 
1425  size_t sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1426 
1427  size_t shared_x= threadIdx.x+blockDim.x;
1428 
1429 
1430  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1431  {
1432  sdata[sharedIdx]= input[y*in_pitch + x];
1433 
1434  size_t shared_y= threadIdx.y;
1435 
1436  // To load data in shared memory including neighbouring elements...
1437  while(shared_y<sharedRows)
1438  {
1439  while(shared_x<sharedCols)
1440  {
1441  sharedIdx = shared_y * sharedCols + shared_x;
1442  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1443  shared_x = shared_x + blockDim.x;
1444  }
1445  shared_x = threadIdx.x;
1446  shared_y = shared_y + blockDim.y;
1447  }
1448  }
1449 
1450  __syncthreads();
1451 
1452  sharedIdx = threadIdx.x;
1453 
1454 // for(size_t t=0;t<numTiles; t++)
1455  {
1456  if(x<out_cols && y<out_rows)
1457  {
1458  T sum=0;
1459  T sum2=0;
1460  T sum3=0;
1461  T sum4=0;
1462  T sum5=0;
1463  T sum6=0;
1464  T sum7=0;
1465  T sum8=0;
1466  T sum9=0;
1467  T sum10=0;
1468  T sum11=0;
1469  T sum12=0;
1470  T sum13=0;
1471  T sum14=0;
1472  T sum15=0;
1473  T sum16=0;
1474 
1475  if(useFilter)
1476  {
1477  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
1478  for(size_t j=0; j<filter_rows; j++) // 7
1479  {
1480  for(size_t i=0; i<filter_cols; i++) // 7
1481  {
1482  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1483  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1484  shared_x += blockDim.x;
1485  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1486  shared_x += blockDim.x;
1487  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1488  shared_x += blockDim.x;
1489  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1490  shared_x += blockDim.x;
1491  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1492  shared_x += blockDim.x;
1493  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1494  shared_x += blockDim.x;
1495  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1496  shared_x += blockDim.x;
1497  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1498  shared_x += blockDim.x;
1499  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1500  shared_x += blockDim.x;
1501  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1502  shared_x += blockDim.x;
1503  sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1504  shared_x += blockDim.x;
1505  sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1506  shared_x += blockDim.x;
1507  sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1508  shared_x += blockDim.x;
1509  sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1510  shared_x += blockDim.x;
1511  sum15 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1512  shared_x += blockDim.x;
1513  sum16 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1514  }
1515  }
1516  }
1517  else
1518  {
1519  for(size_t j=0; j<filter_rows; j++) // 7
1520  {
1521  for(size_t i=0; i<filter_cols; i++) // 7
1522  {
1523  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1524  sum += sdata[shared_x];
1525  shared_x += blockDim.x;
1526  sum2 += sdata[shared_x];
1527  shared_x += blockDim.x;
1528  sum3 += sdata[shared_x];
1529  shared_x += blockDim.x;
1530  sum4 += sdata[shared_x];
1531  shared_x += blockDim.x;
1532  sum5 += sdata[shared_x];
1533  shared_x += blockDim.x;
1534  sum6 += sdata[shared_x];
1535  shared_x += blockDim.x;
1536  sum7 += sdata[shared_x];
1537  shared_x += blockDim.x;
1538  sum8 += sdata[shared_x];
1539  shared_x += blockDim.x;
1540  sum9 += sdata[shared_x];
1541  shared_x += blockDim.x;
1542  sum10 += sdata[shared_x];
1543  shared_x += blockDim.x;
1544  sum11 += sdata[shared_x];
1545  shared_x += blockDim.x;
1546  sum12 += sdata[shared_x];
1547  shared_x += blockDim.x;
1548  sum13 += sdata[shared_x];
1549  shared_x += blockDim.x;
1550  sum14 += sdata[shared_x];
1551  shared_x += blockDim.x;
1552  sum15 += sdata[shared_x];
1553  shared_x += blockDim.x;
1554  sum16 += sdata[shared_x];
1555  }
1556  }
1557  }
1558  shared_x = y*out_pitch+x;
1559  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1560  shared_x += blockDim.x;
1561  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1562  shared_x += blockDim.x;
1563  output[shared_x] = sum3 / (filter_rows * filter_cols);
1564  shared_x += blockDim.x;
1565  output[shared_x] = sum4 / (filter_rows * filter_cols);
1566  shared_x += blockDim.x;
1567  output[shared_x] = sum5 / (filter_rows * filter_cols);
1568  shared_x += blockDim.x;
1569  output[shared_x] = sum6 / (filter_rows * filter_cols);
1570  shared_x += blockDim.x;
1571  output[shared_x] = sum7 / (filter_rows * filter_cols);
1572  shared_x += blockDim.x;
1573  output[shared_x] = sum8 / (filter_rows * filter_cols);
1574  shared_x += blockDim.x;
1575  output[shared_x] = sum9 / (filter_rows * filter_cols);
1576  shared_x += blockDim.x;
1577  output[shared_x] = sum10 / (filter_rows * filter_cols);
1578  shared_x += blockDim.x;
1579  output[shared_x] = sum11 / (filter_rows * filter_cols);
1580  shared_x += blockDim.x;
1581  output[shared_x] = sum12 / (filter_rows * filter_cols);
1582  shared_x += blockDim.x;
1583  output[shared_x] = sum13 / (filter_rows * filter_cols);
1584  shared_x += blockDim.x;
1585  output[shared_x] = sum14 / (filter_rows * filter_cols);
1586  shared_x += blockDim.x;
1587  output[shared_x] = sum15 / (filter_rows * filter_cols);
1588  shared_x += blockDim.x;
1589  output[shared_x] = sum16 / (filter_rows * filter_cols);
1590  }
1591  }
1592 }
1593 
1594 
1599 } // end namespace skepu
1600 
1601 #endif
1602 
1603 
1604 #endif
1605 
1606 
__global__ void conv_cuda_shared_tiling_12_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:1076
__global__ void conv_cuda_shared_tiling_16_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:1413
__global__ void conv_cuda_shared_kernel(T *input, T *output, const size_t in_rows, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:335
__global__ void conv_cuda_shared_tiling_4_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:573
T min(T a, T b)
Definition: mapoverlap_convol_kernels.h:212
size_t calculateTiling(size_t regCountPerThread, size_t filterSizeX, size_t filterSizeY, size_t inputSizeX, bool maximizeTiling=false)
Definition: mapoverlap_convol_kernels.h:224
__global__ void conv_cuda_shared_tiling_14_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:1237
T max(T a, T b)
Definition: mapoverlap_convol_kernels.h:203
__global__ void conv_cuda_shared_tiling_8_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:795
__global__ void conv_cuda_shared_tiling_10_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:929
__global__ void conv_cuda_shared_tiling_6_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:676
static std::string MatrixConvol2D_CL("__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}")
__global__ void conv_cuda_2D_kernel(OverlapFunc mapOverlapFunc, T *input, T *output, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:287
static std::string MatrixConvolSharedFilter_CL("__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")
__global__ void conv_cuda_shared_tiling_kernel(T *input, T *output, const size_t numTiles, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:407
__global__ void conv_cuda_shared_tiling_2_kernel(T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
Definition: mapoverlap_convol_kernels.h:489
static std::string MatrixConvolShared_CL("__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")