SkePU (integrated with StarPU) 0.8.1
mapoverlap_convol_kernels.h
1 
5 #ifndef MAPOVERLAP_CONVOL_KERNELS_H
6 #define MAPOVERLAP_CONVOL_KERNELS_H
7 
8 #ifdef SKEPU_OPENCL
9 
10 
11 #include <string>
12 
13 namespace skepu
14 {
15 
16 
17 static std::string MatrixConvolSharedFilter_CL(
18 "__kernel void conv_cuda_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n"
19 "{\n"
20 " unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
21 " unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
22 " unsigned int x = get_global_id(0);\n"
23 " unsigned int y = get_global_id(1);\n"
24 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
25 " {\n"
26 " unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
27 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
28 " unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"
29 " unsigned int shared_y= get_local_id(1);\n"
30 " while(shared_y<sharedRows)\n"
31 " {\n"
32 " while(shared_x<sharedCols)\n"
33 " {\n"
34 " sharedIdx = shared_y * sharedCols + shared_x; \n"
35 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
36 " shared_x = shared_x + get_local_size(0);\n"
37 " }\n"
38 " shared_x = get_local_id(0);\n"
39 " shared_y = shared_y + get_local_size(1);\n"
40 " } \n"
41 " }\n"
42 " barrier(CLK_LOCAL_MEM_FENCE);\n"
43 " if(x<out_cols && y<out_rows)\n"
44 " {\n"
45 " TYPE sum=0;\n"
46 " for(int j=0;j<filter_rows;j++) \n"
47 " {\n"
48 " for(int i=0;i<filter_cols;i++) \n"
49 " {\n"
50 " sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"
51 " }\n"
52 " }\n"
53 " output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
54 " }\n"
55 "}"
56 );
57 
58 
59 
60 
61 
62 static std::string MatrixConvolShared_CL(
63 "__kernel void conv_cuda_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n"
64 "{\n"
65 " unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"
66 " unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"
67 " unsigned int x = get_global_id(0);\n"
68 " unsigned int y = get_global_id(1);\n"
69 " if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"
70 " {\n"
71 " unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"
72 " sdata[sharedIdx]= input[y*in_pitch + x];\n"
73 " unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"
74 " unsigned int shared_y= get_local_id(1);\n"
75 " while(shared_y<sharedRows)\n"
76 " {\n"
77 " while(shared_x<sharedCols)\n"
78 " {\n"
79 " sharedIdx = shared_y * sharedCols + shared_x; \n"
80 " sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"
81 " shared_x = shared_x + get_local_size(0);\n"
82 " }\n"
83 " shared_x = get_local_id(0);\n"
84 " shared_y = shared_y + get_local_size(1);\n"
85 " } \n"
86 " }\n"
87 " barrier(CLK_LOCAL_MEM_FENCE);\n"
88 " if(x<out_cols && y<out_rows)\n"
89 " {\n"
90 " TYPE sum=0;\n"
91 " for(int j=0;j<filter_rows;j++) \n"
92 " {\n"
93 " for(int i=0;i<filter_cols;i++) \n"
94 " {\n"
95 " sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"
96 " }\n"
97 " }\n"
98 " output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"
99 " }\n"
100 "}"
101 );
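// Illustrative only: before these source strings can be compiled, the TYPE and
// KERNELNAME placeholders are presumably replaced with the concrete element type
// and a generated kernel name. SkePU's own substitution and program-building logic
// lives elsewhere; the two helpers below (replaceAllOccurrences,
// specializeConvolutionSource) are hypothetical names used for this sketch.
inline void replaceAllOccurrences(std::string& text, const std::string& from, const std::string& to)
{
   std::string::size_type pos = 0;
   while ((pos = text.find(from, pos)) != std::string::npos)
   {
      text.replace(pos, from.length(), to);
      pos += to.length();
   }
}

inline std::string specializeConvolutionSource(std::string source, const std::string& typeName, const std::string& kernelName)
{
   replaceAllOccurrences(source, "TYPE", typeName);
   replaceAllOccurrences(source, "KERNELNAME", kernelName);
   return source;
}
// Example: specializeConvolutionSource(MatrixConvolSharedFilter_CL, "float", "conv0")
// yields OpenCL source defining __kernel void conv_cuda_shared_filter_conv0(...).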
102 
103 }
104 
105 
106 #endif
107 
108 //#################
109 //-----------------
110 //#################
111 
112 
113 #ifdef SKEPU_CUDA
114 
115 
116 namespace skepu
117 {
118 
119 
120 #define BLOCK_SIZE_X 16
121 #define BLOCK_SIZE_Y 32
122 #define WARP_SIZE 32
123 #define NUM_REGISTERS_PER_SP 32768
124 #define SHARED_MEM_SIZE_BYTES 48000
125 #define THREADS_PER_WARP 32
126 #define WARPS_PER_SP 48
127 #define THREAD_BLOCK_PER_SP 8
128 
129 template <typename T>
130 T max(T a, T b)
131 {
132  return (a>b)? a:b;
133 }
134 
135 template <typename T>
136 T min(T a, T b)
137 {
138  return (a<b)? a:b;
139 }
140 
141 template <typename T>
142 int calculateTiling(int regCountPerThread, int filterSizeX, int filterSizeY)
143 {
144  int numThreadsPerTB = (BLOCK_SIZE_X * BLOCK_SIZE_Y);
145 
146  int numWarpsPerTB = (numThreadsPerTB+WARP_SIZE-1) / WARP_SIZE;
147 
148  int maxTBPerSP = min( (WARPS_PER_SP / numWarpsPerTB), THREAD_BLOCK_PER_SP);
149 
150  int remRegPerThreads = NUM_REGISTERS_PER_SP - (regCountPerThread * numWarpsPerTB * WARP_SIZE * maxTBPerSP);
151 
152  if(remRegPerThreads <0)
153  {
 154  std::cerr << "Error! Limited by register usage; tiling cannot be more than 1\n";
155  return 1;
156  }
157  remRegPerThreads = remRegPerThreads / (numWarpsPerTB * WARP_SIZE * maxTBPerSP); // tiling cannot be more than this
158 
159  int sharedMem = SHARED_MEM_SIZE_BYTES - ((BLOCK_SIZE_X + filterSizeX - 1) * (BLOCK_SIZE_Y + filterSizeY - 1) * sizeof(T));
160 
161  if(sharedMem < 0)
162  {
 163  std::cerr << "Error! Limited by shared memory usage; tiling cannot be more than 1\n";
164  return 1;
165  }
166 
167  int tilingSM = sharedMem / (BLOCK_SIZE_X * BLOCK_SIZE_Y * sizeof (T));
168 
169  std::cerr<<"tilingSM: "<<tilingSM<<" , remRegPerThreads: "<<remRegPerThreads<<"\n";
 170  return min(tilingSM, remRegPerThreads); // assuming each additional tile increases the register count by one.
171 }
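// Example use of calculateTiling (illustrative): the per-thread register count
// would typically come from compiler feedback (e.g. nvcc --ptxas-options=-v),
// and the returned value bounds the tiling factor chosen for the tiled kernels
// defined further below.
//
//    int tiling = calculateTiling<float>(24, filter_cols, filter_rows);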
172 
173 
174 // constant buffer used to store filter...
175 __device__ __constant__ char deviceFilter[16386];
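// Illustrative only (hypothetical helper, not part of SkePU's API): copies a
// host-side filter into the constant buffer above so that kernels reading
// deviceFilter via reinterpret_cast<T*>(deviceFilter) see the filter weights.
// The filter must fit within the 16386-byte buffer.
template <typename T>
inline void copyFilterToConstantMemory(const T* hostFilter, int filter_rows, int filter_cols)
{
   cudaMemcpyToSymbol(deviceFilter, hostFilter, filter_rows * filter_cols * sizeof(T));
}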
176 
177 
188 template<typename T>
189 __global__ void conv_cuda_shared_kernel(T* input, T* output, const int in_rows, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
190 {
191  extern __shared__ char _sdata[];
 192  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
193 
194  unsigned int xx = blockIdx.x * blockDim.x;
195  unsigned int yy = blockIdx.y * blockDim.y;
196 
197  unsigned int x = xx + threadIdx.x;
198  unsigned int y = yy + threadIdx.y;
199 
200  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
201  {
202  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
203 
204  sdata[sharedIdx]= input[y*in_pitch + x];
205 
206  unsigned int shared_x= threadIdx.x+blockDim.x;
207  unsigned int shared_y= threadIdx.y;
208 
209  // To load data in shared memory including neighbouring elements...
210  while(shared_y<sharedRows)
211  {
212  while(shared_x<sharedCols)
213  {
214  sharedIdx = shared_y * sharedCols + shared_x;
215  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
216  shared_x = shared_x + blockDim.x;
217  }
218  shared_x = threadIdx.x;
219  shared_y = shared_y + blockDim.y;
220  }
221  }
222 
223  __syncthreads();
224 
225  if(x<out_cols && y<out_rows)
226  {
227  T sum=0;
228 
229  for(int j=0;j<filter_rows;j++)
230  {
231  for(int i=0;i<filter_cols;i++)
232  {
233  sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ];
234  }
235  }
236  output[y*out_pitch+x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
237  }
238 }
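// Illustrative launch sketch (hypothetical wrapper, not SkePU's actual invocation
// code): one thread block of BLOCK_SIZE_X x BLOCK_SIZE_Y threads computes one
// output tile, the dynamic shared memory covers the tile plus the filter halo,
// and the pitches are element counts exactly as indexed inside the kernel.
template <typename T>
inline void launchConvSharedKernel(T* d_input, T* d_output,
                                   int in_rows, int in_cols, int out_rows, int out_cols,
                                   int filter_rows, int filter_cols,
                                   size_t in_pitch, size_t out_pitch)
{
   dim3 block(BLOCK_SIZE_X, BLOCK_SIZE_Y);
   dim3 grid((out_cols + block.x - 1) / block.x, (out_rows + block.y - 1) / block.y);
   int sharedCols = block.x + filter_cols - 1;
   int sharedRows = block.y + filter_rows - 1;
   size_t sharedBytes = sharedRows * sharedCols * sizeof(T);
   conv_cuda_shared_kernel<T><<<grid, block, sharedBytes>>>(d_input, d_output,
      in_rows, in_cols, out_rows, out_cols, filter_rows, filter_cols,
      in_pitch, out_pitch, sharedRows, sharedCols);
}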
239 
240 
241 
242 
243 
244 
245 
256 template<typename T>
257 __global__ void conv_cuda_shared_kernel_filter(T* input, T* output, T* filter, const int in_rows, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
258 {
259  extern __shared__ char _sdata[];
 260  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
261 
262  unsigned int xx = blockIdx.x * blockDim.x;
263  unsigned int yy = blockIdx.y * blockDim.y;
264 
265  unsigned int x = xx + threadIdx.x;
266  unsigned int y = yy + threadIdx.y;
267 
268  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
269  {
270  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
271 
272  sdata[sharedIdx]= input[y*in_pitch + x];
273 
274  unsigned int shared_x= threadIdx.x+blockDim.x;
275  unsigned int shared_y= threadIdx.y;
276 
277  // To load data in shared memory including neighbouring elements...
278  while(shared_y<sharedRows)
279  {
280  while(shared_x<sharedCols)
281  {
282  sharedIdx = shared_y * sharedCols + shared_x;
283  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
284  shared_x = shared_x + blockDim.x;
285  }
286  shared_x = threadIdx.x;
287  shared_y = shared_y + blockDim.y;
288  }
289  }
290 
291  __syncthreads();
292 
293  if(x<out_cols && y<out_rows)
294  {
295  T sum=0;
296 
297 // T *d_Filter = reinterpret_cast<T*>(deviceFilter);
298  for(int j=0;j<filter_rows;j++)
299  {
300  for(int i=0;i<filter_cols;i++)
301  {
302  sum += sdata[(threadIdx.y+j) * sharedCols + (threadIdx.x+i) ] * filter[j*filter_cols+i];
303  }
304  }
305  output[y*out_pitch+x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
306  }
307 }
308 
309 
310 
311 
312 
313 
314 
315 
326 template<typename T>
327 __global__ void conv_cuda_shared_tiling_kernel(T* input, T* output, const int numTiles, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
328 {
329  extern __shared__ char _sdata[];
 330  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
331 
332  unsigned int xx = blockIdx.x * blockDim.x * numTiles;
333  unsigned int yy = blockIdx.y * blockDim.y;
334 
335  unsigned int x = xx + threadIdx.x;
336  unsigned int y = yy + threadIdx.y;
337 
338  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
339  unsigned int shared_x= threadIdx.x+blockDim.x;
340  unsigned int shared_y= threadIdx.y;
341 
342  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
343  {
344  sdata[sharedIdx]= input[y*in_pitch + x];
345 
346  // To load data in shared memory including neighbouring elements...
347  while(shared_y<sharedRows)
348  {
349  while(shared_x<sharedCols)
350  {
351  sharedIdx = shared_y * sharedCols + shared_x;
352  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
353  shared_x = shared_x + blockDim.x;
354  }
355  shared_x = threadIdx.x;
356  shared_y = shared_y + blockDim.y;
357  }
358  }
359 
360  __syncthreads();
361 
362  sharedIdx = threadIdx.x;
363 
364  for(int t=0;t<numTiles; t++)
365  {
366  if(x<out_cols && y<out_rows)
367  {
368 // T sum=0;
369  shared_x = 0;
370 
371  for(int j=0;j<filter_rows;j++) // 7
372  {
373  for(int i=0;i<filter_cols;i++) // 7
374  {
375  shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
376  }
377  }
378  output[y*out_pitch+x] = shared_x / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
379  x += blockDim.x;
380  sharedIdx += blockDim.x;
381  }
382  }
383 }
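// For the generic tiled kernel above, the launch presumably shrinks the grid's x
// extent by numTiles and widens the shared tile to cover all tiles plus the filter
// halo (sketch, using the same illustrative names as the wrapper after
// conv_cuda_shared_kernel):
//
//    dim3 grid((out_cols + block.x * numTiles - 1) / (block.x * numTiles),
//              (out_rows + block.y - 1) / block.y);
//    int sharedCols = block.x * numTiles + filter_cols - 1;
//    int sharedRows = block.y + filter_rows - 1;
//    size_t sharedBytes = sharedRows * sharedCols * sizeof(T);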
384 
385 
386 
387 
388 
389 
400 template<typename T>
401 __global__ void conv_cuda_shared_tiling_kernel_filter(T* input, T* output, T* filter, const int numTiles, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
402 {
403  extern __shared__ char _sdata[];
 404  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
405 
406  unsigned int xx = blockIdx.x * blockDim.x * numTiles;
407  unsigned int yy = blockIdx.y * blockDim.y;
408 
409  unsigned int x = xx + threadIdx.x;
410  unsigned int y = yy + threadIdx.y;
411 
412  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
413  unsigned int shared_x= threadIdx.x+blockDim.x;
414  unsigned int shared_y= threadIdx.y;
415 
416  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
417  {
418  sdata[sharedIdx]= input[y*in_pitch + x];
419 
420  // To load data in shared memory including neighbouring elements...
421  while(shared_y<sharedRows)
422  {
423  while(shared_x<sharedCols)
424  {
425  sharedIdx = shared_y * sharedCols + shared_x;
426  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
427  shared_x = shared_x + blockDim.x;
428  }
429  shared_x = threadIdx.x;
430  shared_y = shared_y + blockDim.y;
431  }
432  }
433 
434  __syncthreads();
435 
436  sharedIdx = threadIdx.x;
437 
438  for(int t=0;t<numTiles; t++)
439  {
440  if(x<out_cols && y<out_rows)
441  {
442 // T sum=0;
443  shared_x = 0;
444 
445  for(int j=0;j<filter_rows;j++) // 7
446  {
447  for(int i=0;i<filter_cols;i++) // 7
448  {
449  shared_x += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * filter[j*filter_cols+i];
450  }
451  }
452  output[y*out_pitch+x] = shared_x / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
453  x += blockDim.x;
454  sharedIdx += blockDim.x;
455  }
456  }
457 }
458 
459 
460 
461 
462 template<bool useFilter, typename T>
463 __global__ void conv_cuda_shared_tiling_2_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
464 {
465  extern __shared__ char _sdata[];
 466  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
467 
468  unsigned int xx = blockIdx.x * blockDim.x * 2;
469  unsigned int yy = blockIdx.y * blockDim.y;
470 
471  unsigned int x = xx + threadIdx.x;
472 // unsigned int x_in = xx + threadIdx.x;
473  unsigned int y = yy + threadIdx.y;
474 
475  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
476 
477 
478  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
479  {
480  sdata[sharedIdx]= input[y*in_pitch + x];
481 
482  unsigned int shared_x= threadIdx.x+blockDim.x;
483  unsigned int shared_y= threadIdx.y;
484 
485  // To load data in shared memory including neighbouring elements...
486  while(shared_y<sharedRows)
487  {
488  while(shared_x<sharedCols)
489  {
490  sharedIdx = shared_y * sharedCols + shared_x;
491  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
492  shared_x = shared_x + blockDim.x;
493  }
494  shared_x = threadIdx.x;
495  shared_y = shared_y + blockDim.y;
496  }
497  }
498 
499  __syncthreads();
500 
501  sharedIdx = threadIdx.x;
502 
503 // for(int t=0;t<numTiles; t++)
504  {
505  if(x<out_cols && y<out_rows)
506  {
507  T sum=0;
508  T sum2=0;
509 
510  if(useFilter)
511  {
512  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
513  for(int j=0;j<filter_rows;j++) // 7
514  {
515  for(int i=0;i<filter_cols;i++) // 7
516  {
517  sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ] * d_Filter[j*filter_cols+i];
518  sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ] * d_Filter[j*filter_cols+i];
519  }
520  }
521  }
522  else
523  {
524  for(int j=0;j<filter_rows;j++) // 7
525  {
526  for(int i=0;i<filter_cols;i++) // 7
527  {
528  sum += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+i) ];
529  sum2 += sdata[(threadIdx.y+j) * sharedCols + (sharedIdx+blockDim.x+i) ];
530  }
531  }
532  }
533  output[y*out_pitch+x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
534  output[y*out_pitch+x+blockDim.x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
535 // x += blockDim.x;
536 // sharedIdx += blockDim.x;
537  }
538  }
539 }
540 
541 
542 
543 template<bool useFilter, typename T>
544 __global__ void conv_cuda_shared_tiling_4_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
545 {
546  extern __shared__ char _sdata[];
 547  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
548 
549  unsigned int xx = blockIdx.x * blockDim.x * 4;
550  unsigned int yy = blockIdx.y * blockDim.y;
551 
552  unsigned int x = xx + threadIdx.x;
553 // unsigned int x_in = xx + threadIdx.x;
554  unsigned int y = yy + threadIdx.y;
555 
556  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
557 
558  unsigned int shared_x= threadIdx.x+blockDim.x;
559 
560 
561  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
562  {
563  sdata[sharedIdx]= input[y*in_pitch + x];
564 
565  unsigned int shared_y= threadIdx.y;
566 
567  // To load data in shared memory including neighbouring elements...
568  while(shared_y<sharedRows)
569  {
570  while(shared_x<sharedCols)
571  {
572  sharedIdx = shared_y * sharedCols + shared_x;
573  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
574  shared_x = shared_x + blockDim.x;
575  }
576  shared_x = threadIdx.x;
577  shared_y = shared_y + blockDim.y;
578  }
579  }
580 
581  __syncthreads();
582 
583  sharedIdx = threadIdx.x;
584 
585 // for(int t=0;t<numTiles; t++)
586  {
587  if(x<out_cols && y<out_rows)
588  {
589  T sum=0;
590  T sum2=0;
591  T sum3=0;
592  T sum4=0;
593 
594  if(useFilter)
595  {
596  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
597  for(int j=0;j<filter_rows;j++) // 7
598  {
599  for(int i=0;i<filter_cols;i++) // 7
600  {
601  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
602  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
603  shared_x += blockDim.x;
604  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
605  shared_x += blockDim.x;
606  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
607  shared_x += blockDim.x;
608  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
609  }
610  }
611  }
612  else
613  {
614  for(int j=0;j<filter_rows;j++) // 7
615  {
616  for(int i=0;i<filter_cols;i++) // 7
617  {
618  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
619  sum += sdata[shared_x];
620  shared_x += blockDim.x;
621  sum2 += sdata[shared_x];
622  shared_x += blockDim.x;
623  sum3 += sdata[shared_x];
624  shared_x += blockDim.x;
625  sum4 += sdata[shared_x];
626  }
627  }
628  }
629  shared_x = y*out_pitch+x;
630  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
631  shared_x += blockDim.x;
632  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
633  shared_x += blockDim.x;
634  output[shared_x] = sum3 / (filter_rows * filter_cols);
635  shared_x += blockDim.x;
636  output[shared_x] = sum4 / (filter_rows * filter_cols);
637  }
638  }
639 }
640 
641 
642 
643 
644 
645 
646 
647 
648 
649 
650 
651 template<bool useFilter, typename T>
652 __global__ void conv_cuda_shared_tiling_6_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
653 {
654  extern __shared__ char _sdata[];
 655  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
656 
657  unsigned int xx = blockIdx.x * blockDim.x * 6;
658  unsigned int yy = blockIdx.y * blockDim.y;
659 
660  unsigned int x = xx + threadIdx.x;
661 // unsigned int x_in = xx + threadIdx.x;
662  unsigned int y = yy + threadIdx.y;
663 
664  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
665 
666  unsigned int shared_x= threadIdx.x+blockDim.x;
667 
668 
669  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
670  {
671  sdata[sharedIdx]= input[y*in_pitch + x];
672 
673  unsigned int shared_y= threadIdx.y;
674 
675  // To load data in shared memory including neighbouring elements...
676  while(shared_y<sharedRows)
677  {
678  while(shared_x<sharedCols)
679  {
680  sharedIdx = shared_y * sharedCols + shared_x;
681  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
682  shared_x = shared_x + blockDim.x;
683  }
684  shared_x = threadIdx.x;
685  shared_y = shared_y + blockDim.y;
686  }
687  }
688 
689  __syncthreads();
690 
691  sharedIdx = threadIdx.x;
692 
693 // for(int t=0;t<numTiles; t++)
694  {
695  if(x<out_cols && y<out_rows)
696  {
697  T sum=0;
698  T sum2=0;
699  T sum3=0;
700  T sum4=0;
701  T sum5=0;
702  T sum6=0;
703 
704  if(useFilter)
705  {
706  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
707  for(int j=0;j<filter_rows;j++) // 7
708  {
709  for(int i=0;i<filter_cols;i++) // 7
710  {
711  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
712  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
713  shared_x += blockDim.x;
714  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
715  shared_x += blockDim.x;
716  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
717  shared_x += blockDim.x;
718  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
719  shared_x += blockDim.x;
720  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
721  shared_x += blockDim.x;
722  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
723  }
724  }
725  }
726  else
727  {
728  for(int j=0;j<filter_rows;j++) // 7
729  {
730  for(int i=0;i<filter_cols;i++) // 7
731  {
732  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
733  sum += sdata[shared_x];
734  shared_x += blockDim.x;
735  sum2 += sdata[shared_x];
736  shared_x += blockDim.x;
737  sum3 += sdata[shared_x];
738  shared_x += blockDim.x;
739  sum4 += sdata[shared_x];
740  shared_x += blockDim.x;
741  sum5 += sdata[shared_x];
742  shared_x += blockDim.x;
743  sum6 += sdata[shared_x];
744  }
745  }
746  }
747  shared_x = y*out_pitch+x;
748  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
749  shared_x += blockDim.x;
750  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
751  shared_x += blockDim.x;
752  output[shared_x] = sum3 / (filter_rows * filter_cols);
753  shared_x += blockDim.x;
754  output[shared_x] = sum4 / (filter_rows * filter_cols);
755  shared_x += blockDim.x;
756  output[shared_x] = sum5 / (filter_rows * filter_cols);
757  shared_x += blockDim.x;
758  output[shared_x] = sum6 / (filter_rows * filter_cols);
759  }
760  }
761 }
762 
763 
764 
765 
766 
767 
768 template<bool useFilter, typename T>
769 __global__ void conv_cuda_shared_tiling_8_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
770 {
771  extern __shared__ char _sdata[];
 772  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
773 
774  unsigned int xx = blockIdx.x * blockDim.x * 8;
775  unsigned int yy = blockIdx.y * blockDim.y;
776 
777  unsigned int x = xx + threadIdx.x;
778 // unsigned int x_in = xx + threadIdx.x;
779  unsigned int y = yy + threadIdx.y;
780 
781  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
782 
783  unsigned int shared_x= threadIdx.x+blockDim.x;
784 
785 
786  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
787  {
788  sdata[sharedIdx]= input[y*in_pitch + x];
789 
790  unsigned int shared_y= threadIdx.y;
791 
792  // To load data in shared memory including neighbouring elements...
793  while(shared_y<sharedRows)
794  {
795  while(shared_x<sharedCols)
796  {
797  sharedIdx = shared_y * sharedCols + shared_x;
798  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
799  shared_x = shared_x + blockDim.x;
800  }
801  shared_x = threadIdx.x;
802  shared_y = shared_y + blockDim.y;
803  }
804  }
805 
806  __syncthreads();
807 
808  sharedIdx = threadIdx.x;
809 
810 // for(int t=0;t<numTiles; t++)
811  {
812  if(x<out_cols && y<out_rows)
813  {
814  T sum=0;
815  T sum2=0;
816  T sum3=0;
817  T sum4=0;
818  T sum5=0;
819  T sum6=0;
820  T sum7=0;
821  T sum8=0;
822 
823  if(useFilter)
824  {
825  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
826  for(int j=0;j<filter_rows;j++) // 7
827  {
828  for(int i=0;i<filter_cols;i++) // 7
829  {
830  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
831  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
832  shared_x += blockDim.x;
833  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
834  shared_x += blockDim.x;
835  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
836  shared_x += blockDim.x;
837  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
838  shared_x += blockDim.x;
839  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
840  shared_x += blockDim.x;
841  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
842  shared_x += blockDim.x;
843  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
844  shared_x += blockDim.x;
845  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
846  }
847  }
848  }
849  else
850  {
851  for(int j=0;j<filter_rows;j++) // 7
852  {
853  for(int i=0;i<filter_cols;i++) // 7
854  {
855  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
856  sum += sdata[shared_x];
857  shared_x += blockDim.x;
858  sum2 += sdata[shared_x];
859  shared_x += blockDim.x;
860  sum3 += sdata[shared_x];
861  shared_x += blockDim.x;
862  sum4 += sdata[shared_x];
863  shared_x += blockDim.x;
864  sum5 += sdata[shared_x];
865  shared_x += blockDim.x;
866  sum6 += sdata[shared_x];
867  shared_x += blockDim.x;
868  sum7 += sdata[shared_x];
869  shared_x += blockDim.x;
870  sum8 += sdata[shared_x];
871  }
872  }
873  }
874  shared_x = y*out_pitch+x;
875  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
876  shared_x += blockDim.x;
877  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
878  shared_x += blockDim.x;
879  output[shared_x] = sum3 / (filter_rows * filter_cols);
880  shared_x += blockDim.x;
881  output[shared_x] = sum4 / (filter_rows * filter_cols);
882  shared_x += blockDim.x;
883  output[shared_x] = sum5 / (filter_rows * filter_cols);
884  shared_x += blockDim.x;
885  output[shared_x] = sum6 / (filter_rows * filter_cols);
886  shared_x += blockDim.x;
887  output[shared_x] = sum7 / (filter_rows * filter_cols);
888  shared_x += blockDim.x;
889  output[shared_x] = sum8 / (filter_rows * filter_cols);
890  }
891  }
892 }
893 
894 
895 
896 
897 
898 
899 
900 
901 
902 
903 template<bool useFilter, typename T>
904 __global__ void conv_cuda_shared_tiling_10_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
905 {
906  extern __shared__ char _sdata[];
 907  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
908 
909  unsigned int xx = blockIdx.x * blockDim.x * 10;
910  unsigned int yy = blockIdx.y * blockDim.y;
911 
912  unsigned int x = xx + threadIdx.x;
913 // unsigned int x_in = xx + threadIdx.x;
914  unsigned int y = yy + threadIdx.y;
915 
916  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
917 
918  unsigned int shared_x= threadIdx.x+blockDim.x;
919 
920 
921  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
922  {
923  sdata[sharedIdx]= input[y*in_pitch + x];
924 
925  unsigned int shared_y= threadIdx.y;
926 
927  // To load data in shared memory including neighbouring elements...
928  while(shared_y<sharedRows)
929  {
930  while(shared_x<sharedCols)
931  {
932  sharedIdx = shared_y * sharedCols + shared_x;
933  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
934  shared_x = shared_x + blockDim.x;
935  }
936  shared_x = threadIdx.x;
937  shared_y = shared_y + blockDim.y;
938  }
939  }
940 
941  __syncthreads();
942 
943  sharedIdx = threadIdx.x;
944 
945 // for(int t=0;t<numTiles; t++)
946  {
947  if(x<out_cols && y<out_rows)
948  {
949  T sum=0;
950  T sum2=0;
951  T sum3=0;
952  T sum4=0;
953  T sum5=0;
954  T sum6=0;
955  T sum7=0;
956  T sum8=0;
957  T sum9=0;
958  T sum10=0;
959 
960  if(useFilter)
961  {
962  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
963  for(int j=0;j<filter_rows;j++) // 7
964  {
965  for(int i=0;i<filter_cols;i++) // 7
966  {
967  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
968  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
969  shared_x += blockDim.x;
970  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
971  shared_x += blockDim.x;
972  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
973  shared_x += blockDim.x;
974  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
975  shared_x += blockDim.x;
976  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
977  shared_x += blockDim.x;
978  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
979  shared_x += blockDim.x;
980  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
981  shared_x += blockDim.x;
982  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
983  shared_x += blockDim.x;
984  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
985  shared_x += blockDim.x;
986  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
987  }
988  }
989  }
990  else
991  {
992  for(int j=0;j<filter_rows;j++) // 7
993  {
994  for(int i=0;i<filter_cols;i++) // 7
995  {
996  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
997  sum += sdata[shared_x];
998  shared_x += blockDim.x;
999  sum2 += sdata[shared_x];
1000  shared_x += blockDim.x;
1001  sum3 += sdata[shared_x];
1002  shared_x += blockDim.x;
1003  sum4 += sdata[shared_x];
1004  shared_x += blockDim.x;
1005  sum5 += sdata[shared_x];
1006  shared_x += blockDim.x;
1007  sum6 += sdata[shared_x];
1008  shared_x += blockDim.x;
1009  sum7 += sdata[shared_x];
1010  shared_x += blockDim.x;
1011  sum8 += sdata[shared_x];
1012  shared_x += blockDim.x;
1013  sum9 += sdata[shared_x];
1014  shared_x += blockDim.x;
1015  sum10 += sdata[shared_x];
1016  }
1017  }
1018  }
1019  shared_x = y*out_pitch+x;
1020  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1021  shared_x += blockDim.x;
1022  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1023  shared_x += blockDim.x;
1024  output[shared_x] = sum3 / (filter_rows * filter_cols);
1025  shared_x += blockDim.x;
1026  output[shared_x] = sum4 / (filter_rows * filter_cols);
1027  shared_x += blockDim.x;
1028  output[shared_x] = sum5 / (filter_rows * filter_cols);
1029  shared_x += blockDim.x;
1030  output[shared_x] = sum6 / (filter_rows * filter_cols);
1031  shared_x += blockDim.x;
1032  output[shared_x] = sum7 / (filter_rows * filter_cols);
1033  shared_x += blockDim.x;
1034  output[shared_x] = sum8 / (filter_rows * filter_cols);
1035  shared_x += blockDim.x;
1036  output[shared_x] = sum9 / (filter_rows * filter_cols);
1037  shared_x += blockDim.x;
1038  output[shared_x] = sum10 / (filter_rows * filter_cols);
1039  }
1040  }
1041 }
1042 
1043 
1044 
1045 template<bool useFilter, typename T>
1046 __global__ void conv_cuda_shared_tiling_12_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
1047 {
1048  extern __shared__ char _sdata[];
 1049  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
1050 
1051  unsigned int xx = blockIdx.x * blockDim.x * 12;
1052  unsigned int yy = blockIdx.y * blockDim.y;
1053 
1054  unsigned int x = xx + threadIdx.x;
1055 // unsigned int x_in = xx + threadIdx.x;
1056  unsigned int y = yy + threadIdx.y;
1057 
1058  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1059 
1060  unsigned int shared_x= threadIdx.x+blockDim.x;
1061 
1062 
1063  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1064  {
1065  sdata[sharedIdx]= input[y*in_pitch + x];
1066 
1067  unsigned int shared_y= threadIdx.y;
1068 
1069  // To load data in shared memory including neighbouring elements...
1070  while(shared_y<sharedRows)
1071  {
1072  while(shared_x<sharedCols)
1073  {
1074  sharedIdx = shared_y * sharedCols + shared_x;
1075  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1076  shared_x = shared_x + blockDim.x;
1077  }
1078  shared_x = threadIdx.x;
1079  shared_y = shared_y + blockDim.y;
1080  }
1081  }
1082 
1083  __syncthreads();
1084 
1085  sharedIdx = threadIdx.x;
1086 
1087 // for(int t=0;t<numTiles; t++)
1088  {
1089  if(x<out_cols && y<out_rows)
1090  {
1091  T sum=0;
1092  T sum2=0;
1093  T sum3=0;
1094  T sum4=0;
1095  T sum5=0;
1096  T sum6=0;
1097  T sum7=0;
1098  T sum8=0;
1099  T sum9=0;
1100  T sum10=0;
1101  T sum11=0;
1102  T sum12=0;
1103 
1104  if(useFilter)
1105  {
1106  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
1107  for(int j=0;j<filter_rows;j++) // 7
1108  {
1109  for(int i=0;i<filter_cols;i++) // 7
1110  {
1111  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1112  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1113  shared_x += blockDim.x;
1114  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1115  shared_x += blockDim.x;
1116  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1117  shared_x += blockDim.x;
1118  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1119  shared_x += blockDim.x;
1120  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1121  shared_x += blockDim.x;
1122  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1123  shared_x += blockDim.x;
1124  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1125  shared_x += blockDim.x;
1126  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1127  shared_x += blockDim.x;
1128  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1129  shared_x += blockDim.x;
1130  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1131  shared_x += blockDim.x;
1132  sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1133  shared_x += blockDim.x;
1134  sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1135  }
1136  }
1137  }
1138  else
1139  {
1140  for(int j=0;j<filter_rows;j++) // 7
1141  {
1142  for(int i=0;i<filter_cols;i++) // 7
1143  {
1144  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1145  sum += sdata[shared_x];
1146  shared_x += blockDim.x;
1147  sum2 += sdata[shared_x];
1148  shared_x += blockDim.x;
1149  sum3 += sdata[shared_x];
1150  shared_x += blockDim.x;
1151  sum4 += sdata[shared_x];
1152  shared_x += blockDim.x;
1153  sum5 += sdata[shared_x];
1154  shared_x += blockDim.x;
1155  sum6 += sdata[shared_x];
1156  shared_x += blockDim.x;
1157  sum7 += sdata[shared_x];
1158  shared_x += blockDim.x;
1159  sum8 += sdata[shared_x];
1160  shared_x += blockDim.x;
1161  sum9 += sdata[shared_x];
1162  shared_x += blockDim.x;
1163  sum10 += sdata[shared_x];
1164  shared_x += blockDim.x;
1165  sum11 += sdata[shared_x];
1166  shared_x += blockDim.x;
1167  sum12 += sdata[shared_x];
1168  }
1169  }
1170  }
1171  shared_x = y*out_pitch+x;
1172  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1173  shared_x += blockDim.x;
1174  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1175  shared_x += blockDim.x;
1176  output[shared_x] = sum3 / (filter_rows * filter_cols);
1177  shared_x += blockDim.x;
1178  output[shared_x] = sum4 / (filter_rows * filter_cols);
1179  shared_x += blockDim.x;
1180  output[shared_x] = sum5 / (filter_rows * filter_cols);
1181  shared_x += blockDim.x;
1182  output[shared_x] = sum6 / (filter_rows * filter_cols);
1183  shared_x += blockDim.x;
1184  output[shared_x] = sum7 / (filter_rows * filter_cols);
1185  shared_x += blockDim.x;
1186  output[shared_x] = sum8 / (filter_rows * filter_cols);
1187  shared_x += blockDim.x;
1188  output[shared_x] = sum9 / (filter_rows * filter_cols);
1189  shared_x += blockDim.x;
1190  output[shared_x] = sum10 / (filter_rows * filter_cols);
1191  shared_x += blockDim.x;
1192  output[shared_x] = sum11 / (filter_rows * filter_cols);
1193  shared_x += blockDim.x;
1194  output[shared_x] = sum12 / (filter_rows * filter_cols);
1195  }
1196  }
1197 }
1198 
1199 
1200 
1201 
1202 template<bool useFilter, typename T>
1203 __global__ void conv_cuda_shared_tiling_14_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
1204 {
1205  extern __shared__ char _sdata[];
 1206  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
1207 
1208  unsigned int xx = blockIdx.x * blockDim.x * 14;
1209  unsigned int yy = blockIdx.y * blockDim.y;
1210 
1211  unsigned int x = xx + threadIdx.x;
1212 // unsigned int x_in = xx + threadIdx.x;
1213  unsigned int y = yy + threadIdx.y;
1214 
1215  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1216 
1217  unsigned int shared_x= threadIdx.x+blockDim.x;
1218 
1219 
1220  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1221  {
1222  sdata[sharedIdx]= input[y*in_pitch + x];
1223 
1224  unsigned int shared_y= threadIdx.y;
1225 
1226  // To load data in shared memory including neighbouring elements...
1227  while(shared_y<sharedRows)
1228  {
1229  while(shared_x<sharedCols)
1230  {
1231  sharedIdx = shared_y * sharedCols + shared_x;
1232  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1233  shared_x = shared_x + blockDim.x;
1234  }
1235  shared_x = threadIdx.x;
1236  shared_y = shared_y + blockDim.y;
1237  }
1238  }
1239 
1240  __syncthreads();
1241 
1242  sharedIdx = threadIdx.x;
1243 
1244 // for(int t=0;t<numTiles; t++)
1245  {
1246  if(x<out_cols && y<out_rows)
1247  {
1248  T sum=0;
1249  T sum2=0;
1250  T sum3=0;
1251  T sum4=0;
1252  T sum5=0;
1253  T sum6=0;
1254  T sum7=0;
1255  T sum8=0;
1256  T sum9=0;
1257  T sum10=0;
1258  T sum11=0;
1259  T sum12=0;
1260  T sum13=0;
1261  T sum14=0;
1262 
1263  if(useFilter)
1264  {
1265  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
1266  for(int j=0;j<filter_rows;j++) // 7
1267  {
1268  for(int i=0;i<filter_cols;i++) // 7
1269  {
1270  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1271  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1272  shared_x += blockDim.x;
1273  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1274  shared_x += blockDim.x;
1275  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1276  shared_x += blockDim.x;
1277  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1278  shared_x += blockDim.x;
1279  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1280  shared_x += blockDim.x;
1281  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1282  shared_x += blockDim.x;
1283  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1284  shared_x += blockDim.x;
1285  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1286  shared_x += blockDim.x;
1287  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1288  shared_x += blockDim.x;
1289  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1290  shared_x += blockDim.x;
1291  sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1292  shared_x += blockDim.x;
1293  sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1294  shared_x += blockDim.x;
1295  sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1296  shared_x += blockDim.x;
1297  sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1298  }
1299  }
1300  }
1301  else
1302  {
1303  for(int j=0;j<filter_rows;j++) // 7
1304  {
1305  for(int i=0;i<filter_cols;i++) // 7
1306  {
1307  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1308  sum += sdata[shared_x];
1309  shared_x += blockDim.x;
1310  sum2 += sdata[shared_x];
1311  shared_x += blockDim.x;
1312  sum3 += sdata[shared_x];
1313  shared_x += blockDim.x;
1314  sum4 += sdata[shared_x];
1315  shared_x += blockDim.x;
1316  sum5 += sdata[shared_x];
1317  shared_x += blockDim.x;
1318  sum6 += sdata[shared_x];
1319  shared_x += blockDim.x;
1320  sum7 += sdata[shared_x];
1321  shared_x += blockDim.x;
1322  sum8 += sdata[shared_x];
1323  shared_x += blockDim.x;
1324  sum9 += sdata[shared_x];
1325  shared_x += blockDim.x;
1326  sum10 += sdata[shared_x];
1327  shared_x += blockDim.x;
1328  sum11 += sdata[shared_x];
1329  shared_x += blockDim.x;
1330  sum12 += sdata[shared_x];
1331  shared_x += blockDim.x;
1332  sum13 += sdata[shared_x];
1333  shared_x += blockDim.x;
1334  sum14 += sdata[shared_x];
1335  }
1336  }
1337  }
1338  shared_x = y*out_pitch+x;
1339  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1340  shared_x += blockDim.x;
1341  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1342  shared_x += blockDim.x;
1343  output[shared_x] = sum3 / (filter_rows * filter_cols);
1344  shared_x += blockDim.x;
1345  output[shared_x] = sum4 / (filter_rows * filter_cols);
1346  shared_x += blockDim.x;
1347  output[shared_x] = sum5 / (filter_rows * filter_cols);
1348  shared_x += blockDim.x;
1349  output[shared_x] = sum6 / (filter_rows * filter_cols);
1350  shared_x += blockDim.x;
1351  output[shared_x] = sum7 / (filter_rows * filter_cols);
1352  shared_x += blockDim.x;
1353  output[shared_x] = sum8 / (filter_rows * filter_cols);
1354  shared_x += blockDim.x;
1355  output[shared_x] = sum9 / (filter_rows * filter_cols);
1356  shared_x += blockDim.x;
1357  output[shared_x] = sum10 / (filter_rows * filter_cols);
1358  shared_x += blockDim.x;
1359  output[shared_x] = sum11 / (filter_rows * filter_cols);
1360  shared_x += blockDim.x;
1361  output[shared_x] = sum12 / (filter_rows * filter_cols);
1362  shared_x += blockDim.x;
1363  output[shared_x] = sum13 / (filter_rows * filter_cols);
1364  shared_x += blockDim.x;
1365  output[shared_x] = sum14 / (filter_rows * filter_cols);
1366  }
1367  }
1368 }
1369 
1370 
1371 
1372 
1373 
1374 
1375 
1376 template<bool useFilter, typename T>
1377 __global__ void conv_cuda_shared_tiling_16_kernel(T* input, T* output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
1378 {
1379  extern __shared__ char _sdata[];
 1380  T* sdata = reinterpret_cast<T*>(_sdata); // will also contain the extra (overlap) data
1381 
1382  unsigned int xx = blockIdx.x * blockDim.x * 16;
1383  unsigned int yy = blockIdx.y * blockDim.y;
1384 
1385  unsigned int x = xx + threadIdx.x;
1386 // unsigned int x_in = xx + threadIdx.x;
1387  unsigned int y = yy + threadIdx.y;
1388 
1389  unsigned int sharedIdx = threadIdx.y * sharedCols + threadIdx.x;
1390 
1391  unsigned int shared_x= threadIdx.x+blockDim.x;
1392 
1393 
1394  if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))
1395  {
1396  sdata[sharedIdx]= input[y*in_pitch + x];
1397 
1398  unsigned int shared_y= threadIdx.y;
1399 
1400  // To load data in shared memory including neighbouring elements...
1401  while(shared_y<sharedRows)
1402  {
1403  while(shared_x<sharedCols)
1404  {
1405  sharedIdx = shared_y * sharedCols + shared_x;
1406  sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];
1407  shared_x = shared_x + blockDim.x;
1408  }
1409  shared_x = threadIdx.x;
1410  shared_y = shared_y + blockDim.y;
1411  }
1412  }
1413 
1414  __syncthreads();
1415 
1416  sharedIdx = threadIdx.x;
1417 
1418 // for(int t=0;t<numTiles; t++)
1419  {
1420  if(x<out_cols && y<out_rows)
1421  {
1422  T sum=0;
1423  T sum2=0;
1424  T sum3=0;
1425  T sum4=0;
1426  T sum5=0;
1427  T sum6=0;
1428  T sum7=0;
1429  T sum8=0;
1430  T sum9=0;
1431  T sum10=0;
1432  T sum11=0;
1433  T sum12=0;
1434  T sum13=0;
1435  T sum14=0;
1436  T sum15=0;
1437  T sum16=0;
1438 
1439  if(useFilter)
1440  {
1441  T *d_Filter = reinterpret_cast<T*>(deviceFilter);
1442  for(int j=0;j<filter_rows;j++) // 7
1443  {
1444  for(int i=0;i<filter_cols;i++) // 7
1445  {
1446  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1447  sum += sdata[shared_x] * d_Filter[j*filter_cols+i];
1448  shared_x += blockDim.x;
1449  sum2 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1450  shared_x += blockDim.x;
1451  sum3 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1452  shared_x += blockDim.x;
1453  sum4 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1454  shared_x += blockDim.x;
1455  sum5 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1456  shared_x += blockDim.x;
1457  sum6 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1458  shared_x += blockDim.x;
1459  sum7 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1460  shared_x += blockDim.x;
1461  sum8 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1462  shared_x += blockDim.x;
1463  sum9 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1464  shared_x += blockDim.x;
1465  sum10 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1466  shared_x += blockDim.x;
1467  sum11 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1468  shared_x += blockDim.x;
1469  sum12 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1470  shared_x += blockDim.x;
1471  sum13 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1472  shared_x += blockDim.x;
1473  sum14 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1474  shared_x += blockDim.x;
1475  sum15 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1476  shared_x += blockDim.x;
1477  sum16 += sdata[shared_x] * d_Filter[j*filter_cols+i];
1478  }
1479  }
1480  }
1481  else
1482  {
1483  for(int j=0;j<filter_rows;j++) // 7
1484  {
1485  for(int i=0;i<filter_cols;i++) // 7
1486  {
1487  shared_x = (threadIdx.y+j) * sharedCols + (sharedIdx+i);
1488  sum += sdata[shared_x];
1489  shared_x += blockDim.x;
1490  sum2 += sdata[shared_x];
1491  shared_x += blockDim.x;
1492  sum3 += sdata[shared_x];
1493  shared_x += blockDim.x;
1494  sum4 += sdata[shared_x];
1495  shared_x += blockDim.x;
1496  sum5 += sdata[shared_x];
1497  shared_x += blockDim.x;
1498  sum6 += sdata[shared_x];
1499  shared_x += blockDim.x;
1500  sum7 += sdata[shared_x];
1501  shared_x += blockDim.x;
1502  sum8 += sdata[shared_x];
1503  shared_x += blockDim.x;
1504  sum9 += sdata[shared_x];
1505  shared_x += blockDim.x;
1506  sum10 += sdata[shared_x];
1507  shared_x += blockDim.x;
1508  sum11 += sdata[shared_x];
1509  shared_x += blockDim.x;
1510  sum12 += sdata[shared_x];
1511  shared_x += blockDim.x;
1512  sum13 += sdata[shared_x];
1513  shared_x += blockDim.x;
1514  sum14 += sdata[shared_x];
1515  shared_x += blockDim.x;
1516  sum15 += sdata[shared_x];
1517  shared_x += blockDim.x;
1518  sum16 += sdata[shared_x];
1519  }
1520  }
1521  }
1522  shared_x = y*out_pitch+x;
1523  output[shared_x] = sum / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1524  shared_x += blockDim.x;
1525  output[shared_x] = sum2 / (filter_rows * filter_cols); //sdata[(threadIdx.y+2) * sharedCols + (threadIdx.x+2) ];
1526  shared_x += blockDim.x;
1527  output[shared_x] = sum3 / (filter_rows * filter_cols);
1528  shared_x += blockDim.x;
1529  output[shared_x] = sum4 / (filter_rows * filter_cols);
1530  shared_x += blockDim.x;
1531  output[shared_x] = sum5 / (filter_rows * filter_cols);
1532  shared_x += blockDim.x;
1533  output[shared_x] = sum6 / (filter_rows * filter_cols);
1534  shared_x += blockDim.x;
1535  output[shared_x] = sum7 / (filter_rows * filter_cols);
1536  shared_x += blockDim.x;
1537  output[shared_x] = sum8 / (filter_rows * filter_cols);
1538  shared_x += blockDim.x;
1539  output[shared_x] = sum9 / (filter_rows * filter_cols);
1540  shared_x += blockDim.x;
1541  output[shared_x] = sum10 / (filter_rows * filter_cols);
1542  shared_x += blockDim.x;
1543  output[shared_x] = sum11 / (filter_rows * filter_cols);
1544  shared_x += blockDim.x;
1545  output[shared_x] = sum12 / (filter_rows * filter_cols);
1546  shared_x += blockDim.x;
1547  output[shared_x] = sum13 / (filter_rows * filter_cols);
1548  shared_x += blockDim.x;
1549  output[shared_x] = sum14 / (filter_rows * filter_cols);
1550  shared_x += blockDim.x;
1551  output[shared_x] = sum15 / (filter_rows * filter_cols);
1552  shared_x += blockDim.x;
1553  output[shared_x] = sum16 / (filter_rows * filter_cols);
1554  }
1555  }
1556 }
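// Sketch of how the factor returned by calculateTiling might select among the
// fixed-tiling kernels above (illustrative dispatch; SkePU's actual selection code
// may differ). Each conv_cuda_shared_tiling_N_kernel assumes grid.x has been divided
// by N and sharedCols = blockDim.x * N + filter_cols - 1; the useFilter template
// argument switches between a plain (unweighted) sum and weighting with the filter
// stored in deviceFilter, both divided by the filter size.
//
//    int t = calculateTiling<float>(regsPerThread, filter_cols, filter_rows);
//    if      (t >= 16) conv_cuda_shared_tiling_16_kernel<true, float><<<grid16, block, smem16>>>(/* ... */);
//    else if (t >= 8)  conv_cuda_shared_tiling_8_kernel<true, float><<<grid8, block, smem8>>>(/* ... */);
//    else if (t >= 2)  conv_cuda_shared_tiling_2_kernel<true, float><<<grid2, block, smem2>>>(/* ... */);
//    else              conv_cuda_shared_kernel_filter<float><<<grid, block, smem>>>(/* ... */);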
1557 
1558 
1559 
1560 
1561 
1562 } // end namespace skepu
1563 
1564 #endif
1565 
1566 
1567 #endif
1568