|
SkePU 0.7
|
|
Functions | |
| static std::string | skepu::MatrixConvolSharedFilter_CL ("__kernel void conv_cuda_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n""{\n"" unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" unsigned int x = get_global_id(0);\n"" unsigned int y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"" unsigned int shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(int j=0;j<filter_rows;j++) \n"" {\n"" for(int i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}") |
| static std::string | skepu::MatrixConvol2D_CL ("__kernel void conv_cuda_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int stride, int sharedRows, int sharedCols, __local TYPE* sdata)\n""{\n"" unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" unsigned int x = get_global_id(0);\n"" unsigned int y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"" unsigned int shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}") |
| static std::string | skepu::MatrixConvolShared_CL ("__kernel void conv_cuda_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n""{\n"" unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" unsigned int x = get_global_id(0);\n"" unsigned int y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"" unsigned int shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(int j=0;j<filter_rows;j++) \n"" {\n"" for(int i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}") |
| template<typename T > | |
| T | skepu::max (T a, T b) |
| template<typename T > | |
| T | skepu::min (T a, T b) |
| template<typename T > | |
| int | skepu::calculateTiling (int regCountPerThread, int filterSizeX, int filterSizeY, int inputSizeX, bool maximizeTiling=false) |
| template<typename T , typename OverlapFunc > | |
| __global__ void | skepu::conv_cuda_2D_kernel (OverlapFunc mapOverlapFunc, T *input, T *output, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_kernel (T *input, T *output, const int in_rows, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_kernel (T *input, T *output, const int numTiles, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_2_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_4_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_6_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_8_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_10_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_12_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_14_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| template<bool useFilter, typename T > | |
| __global__ void | skepu::conv_cuda_shared_tiling_16_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols) |
| static std::string | skepu::MatrixTranspose_CL ("__kernel void matrix_transpose_KERNELNAME(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)\n""{\n"" unsigned int xIndex = get_global_id(0);\n"" unsigned int yIndex = get_global_id(1);\n"" if((xIndex + offset < width) && (yIndex < height))\n"" {\n"" unsigned int index_in = yIndex * width + xIndex + offset;\n"" block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);\n"" yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);\n"" if((xIndex < height) && (yIndex + offset < width))\n"" {\n"" unsigned int index_out = yIndex * height + xIndex;\n"" odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];\n"" }\n""}\n") |
| static std::string | skepu::MapOverlapKernel_CL ("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n") |
| static std::string | skepu::MapOverlapKernel_CL_Matrix_Row ("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerRow, int rowWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" int tmp= (get_group_id(0) % blocksPerRow);\n"" int tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n") |
| static std::string | skepu::MapOverlapKernel_CL_Matrix_Col ("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" int tmp= (get_group_id(0) % blocksPerCol);\n"" int tmp2= (get_group_id(0) / blocksPerCol);\n"" int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n") |
| static std::string | skepu::MapOverlapKernel_CL_Matrix_ColMulti ("__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int in_offset, int out_numelements, int poly, int deviceType, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" int tmp= (get_group_id(0) % blocksPerCol);\n"" int tmp2= (get_group_id(0) / blocksPerCol);\n"" int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" \n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0) \n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( arrInd < out_numelements )\n"" {\n"" output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n") |
| template<typename T > | |
| __global__ void | skepu::transpose (T *odata, T *idata, int width, int height) |
| template<int poly, typename T , typename OverlapFunc > | |
| __global__ void | skepu::MapOverlapKernel_CU (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad) |
| template<int poly, typename T , typename OverlapFunc > | |
| __global__ void | skepu::MapOverlapKernel_CU_Matrix_Row (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerRow, unsigned int rowWidth) |
| template<int poly, typename T , typename OverlapFunc > | |
| __global__ void | skepu::MapOverlapKernel_CU_Matrix_Col (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth) |
| template<int poly, int deviceType, typename T , typename OverlapFunc > | |
| __global__ void | skepu::MapOverlapKernel_CU_Matrix_ColMulti (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int in_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth) |
Definitions of CUDA and OpenCL kernels for the MapOverlap skeleton.
| int skepu::calculateTiling | ( | int | regCountPerThread, |
| int | filterSizeX, | ||
| int | filterSizeY, | ||
| int | inputSizeX, | ||
| bool | maximizeTiling = false |
||
| ) |
Helper: to calculate tiling factor.
References skepu::min().

| __global__ void skepu::conv_cuda_2D_kernel | ( | OverlapFunc | mapOverlapFunc, |
| T * | input, | ||
| T * | output, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The 2D mapoverlap CUDA kernel to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_rows, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_10_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 10); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_12_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 22); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_14_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 14); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_16_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 16); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_2_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 2); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_4_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 4); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_6_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 6); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_8_kernel | ( | T * | input, |
| T * | output, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support (tiling factor: 8); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| __global__ void skepu::conv_cuda_shared_tiling_kernel | ( | T * | input, |
| T * | output, | ||
| const int | numTiles, | ||
| const int | in_cols, | ||
| const int | out_rows, | ||
| const int | out_cols, | ||
| const int | filter_rows, | ||
| const int | filter_cols, | ||
| size_t | in_pitch, | ||
| size_t | out_pitch, | ||
| const int | sharedRows, | ||
| const int | sharedCols | ||
| ) |
The mapoverlap CUDA kernel with tiling support; with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.
| static std::string skepu::MapOverlapKernel_CL | ( | ) | [static] |
OpenCL MapOverlap kernel for vector. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| static std::string skepu::MapOverlapKernel_CL_Matrix_Col | ( | ) | [static] |
OpenCL MapOverlap kernel for applying column-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| static std::string skepu::MapOverlapKernel_CL_Matrix_ColMulti | ( | ) | [static] |
OpenCL MapOverlap kernel for applying column-wise overlap on matrix operands when using multiple GPUs. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| static std::string skepu::MapOverlapKernel_CL_Matrix_Row | ( | ) | [static] |
OpenCL MapOverlap kernel for applying row-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| __global__ void skepu::MapOverlapKernel_CU | ( | OverlapFunc | mapOverlapFunc, |
| T * | input, | ||
| T * | output, | ||
| T * | wrap, | ||
| unsigned int | n, | ||
| unsigned int | out_offset, | ||
| unsigned int | out_numelements, | ||
| T | pad | ||
| ) |
CUDA MapOverlap kernel for vector operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| __global__ void skepu::MapOverlapKernel_CU_Matrix_Col | ( | OverlapFunc | mapOverlapFunc, |
| T * | input, | ||
| T * | output, | ||
| T * | wrap, | ||
| unsigned int | n, | ||
| unsigned int | out_offset, | ||
| unsigned int | out_numelements, | ||
| T | pad, | ||
| unsigned int | blocksPerCol, | ||
| unsigned int | rowWidth, | ||
| unsigned int | colWidth | ||
| ) |
CUDA MapOverlap kernel for applying column-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| __global__ void skepu::MapOverlapKernel_CU_Matrix_ColMulti | ( | OverlapFunc | mapOverlapFunc, |
| T * | input, | ||
| T * | output, | ||
| T * | wrap, | ||
| unsigned int | n, | ||
| unsigned int | in_offset, | ||
| unsigned int | out_numelements, | ||
| T | pad, | ||
| unsigned int | blocksPerCol, | ||
| unsigned int | rowWidth, | ||
| unsigned int | colWidth | ||
| ) |
CUDA MapOverlap kernel for applying column-wise overlap on matrix operands with multiple devices. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256. Device type : -1 (first), 0 (middleDevice), 1(last),
| __global__ void skepu::MapOverlapKernel_CU_Matrix_Row | ( | OverlapFunc | mapOverlapFunc, |
| T * | input, | ||
| T * | output, | ||
| T * | wrap, | ||
| unsigned int | n, | ||
| unsigned int | out_offset, | ||
| unsigned int | out_numelements, | ||
| T | pad, | ||
| unsigned int | blocksPerRow, | ||
| unsigned int | rowWidth | ||
| ) |
CUDA MapOverlap kernel for applying row-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.
| static std::string skepu::MatrixConvol2D_CL | ( | ) | [static] |
The mapoverlap OpenCL kernel to apply on neighbourhood of each element in the matrix.
| static std::string skepu::MatrixConvolShared_CL | ( | ) | [static] |
The mapoverlap OpenCL kernel to apply on neighbourhood of each element in the matrix.
| static std::string skepu::MatrixConvolSharedFilter_CL | ( | ) | [static] |
The mapoverlap OpenCL kernel with a filter matrix to apply on neighbourhood of each element in the matrix.
| static std::string skepu::MatrixTranspose_CL | ( | ) | [static] |
OpenCL matrix transpose kernel.
| T skepu::max | ( | T | a, |
| T | b | ||
| ) |
Helper: to calculate maximum.
| T skepu::min | ( | T | a, |
| T | b | ||
| ) |
Helper: to calculate minimum.
| __global__ void skepu::transpose | ( | T * | odata, |
| T * | idata, | ||
| int | width, | ||
| int | height | ||
| ) |
Matrix transpose CUDA kernel
1.7.4