SkePU 0.7
Functions
MapOverlap Kernels
Kernels
Collaboration diagram for MapOverlap Kernels:

Functions

static std::string skepu::MatrixConvolSharedFilter_CL ("__kernel void conv_cuda_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n""{\n"" unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" unsigned int x = get_global_id(0);\n"" unsigned int y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"" unsigned int shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(int j=0;j<filter_rows;j++) \n"" {\n"" for(int i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")
static std::string skepu::MatrixConvol2D_CL ("__kernel void conv_cuda_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int stride, int sharedRows, int sharedCols, __local TYPE* sdata)\n""{\n"" unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" unsigned int x = get_global_id(0);\n"" unsigned int y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"" unsigned int shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}")
static std::string skepu::MatrixConvolShared_CL ("__kernel void conv_cuda_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, int in_rows, int in_cols, int out_rows, int out_cols, int filter_rows, int filter_cols, int in_pitch, int out_pitch, int sharedRows, int sharedCols, __local TYPE* sdata)\n""{\n"" unsigned int xx = ( (int)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" unsigned int yy = ( (int)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" unsigned int x = get_global_id(0);\n"" unsigned int y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" unsigned int sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" unsigned int shared_x= get_local_id(0)+get_local_size(0);\n"" unsigned int shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(int j=0;j<filter_rows;j++) \n"" {\n"" for(int i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")
template<typename T >
skepu::max (T a, T b)
template<typename T >
skepu::min (T a, T b)
template<typename T >
int skepu::calculateTiling (int regCountPerThread, int filterSizeX, int filterSizeY, int inputSizeX, bool maximizeTiling=false)
template<typename T , typename OverlapFunc >
__global__ void skepu::conv_cuda_2D_kernel (OverlapFunc mapOverlapFunc, T *input, T *output, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_kernel (T *input, T *output, const int in_rows, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_kernel (T *input, T *output, const int numTiles, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_2_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_4_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_6_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_8_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_10_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_12_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_14_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_16_kernel (T *input, T *output, const int in_cols, const int out_rows, const int out_cols, const int filter_rows, const int filter_cols, size_t in_pitch, size_t out_pitch, const int sharedRows, const int sharedCols)
static std::string skepu::MatrixTranspose_CL ("__kernel void matrix_transpose_KERNELNAME(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)\n""{\n"" unsigned int xIndex = get_global_id(0);\n"" unsigned int yIndex = get_global_id(1);\n"" if((xIndex + offset < width) && (yIndex < height))\n"" {\n"" unsigned int index_in = yIndex * width + xIndex + offset;\n"" block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);\n"" yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);\n"" if((xIndex < height) && (yIndex + offset < width))\n"" {\n"" unsigned int index_out = yIndex * height + xIndex;\n"" odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];\n"" }\n""}\n")
static std::string skepu::MapOverlapKernel_CL ("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string skepu::MapOverlapKernel_CL_Matrix_Row ("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerRow, int rowWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" int tmp= (get_group_id(0) % blocksPerRow);\n"" int tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string skepu::MapOverlapKernel_CL_Matrix_Col ("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" int tmp= (get_group_id(0) % blocksPerCol);\n"" int tmp2= (get_group_id(0) / blocksPerCol);\n"" int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
static std::string skepu::MapOverlapKernel_CL_Matrix_ColMulti ("__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int in_offset, int out_numelements, int poly, int deviceType, TYPE pad, int blocksPerCol, int rowWidth, int colWidth, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" int wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" int tmp= (get_group_id(0) % blocksPerCol);\n"" int tmp2= (get_group_id(0) / blocksPerCol);\n"" int arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" \n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0) \n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( arrInd < out_numelements )\n"" {\n"" output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
template<typename T >
__global__ void skepu::transpose (T *odata, T *idata, int width, int height)
template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad)
template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Row (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerRow, unsigned int rowWidth)
template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Col (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth)
template<int poly, int deviceType, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_ColMulti (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int in_offset, unsigned int out_numelements, T pad, unsigned int blocksPerCol, unsigned int rowWidth, unsigned int colWidth)

Detailed Description

Definitions of CUDA and OpenCL kernels for the MapOverlap skeleton.


Function Documentation

template<typename T >
int skepu::calculateTiling ( int  regCountPerThread,
int  filterSizeX,
int  filterSizeY,
int  inputSizeX,
bool  maximizeTiling = false 
)

Helper: to calculate tiling factor.

References skepu::min().

Here is the call graph for this function:

template<typename T , typename OverlapFunc >
__global__ void skepu::conv_cuda_2D_kernel ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The 2D mapoverlap CUDA kernel to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_kernel ( T *  input,
T *  output,
const int  in_rows,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_10_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 10); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_12_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 22); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_14_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 14); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_16_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 16); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_2_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 2); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_4_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 4); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_6_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 6); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_8_kernel ( T *  input,
T *  output,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 8); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_kernel ( T *  input,
T *  output,
const int  numTiles,
const int  in_cols,
const int  out_rows,
const int  out_cols,
const int  filter_rows,
const int  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const int  sharedRows,
const int  sharedCols 
)

The mapoverlap CUDA kernel with tiling support; with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

static std::string skepu::MapOverlapKernel_CL ( ) [static]
Version:
0.7

OpenCL MapOverlap kernel for vector. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MapOverlapKernel_CL_Matrix_Col ( ) [static]
Version:
0.7

OpenCL MapOverlap kernel for applying column-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MapOverlapKernel_CL_Matrix_ColMulti ( ) [static]
Version:
0.7

OpenCL MapOverlap kernel for applying column-wise overlap on matrix operands when using multiple GPUs. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MapOverlapKernel_CL_Matrix_Row ( ) [static]
Version:
0.7

OpenCL MapOverlap kernel for applying row-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
unsigned int  n,
unsigned int  out_offset,
unsigned int  out_numelements,
pad 
)

CUDA MapOverlap kernel for vector operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Col ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
unsigned int  n,
unsigned int  out_offset,
unsigned int  out_numelements,
pad,
unsigned int  blocksPerCol,
unsigned int  rowWidth,
unsigned int  colWidth 
)
Version:
0.7

CUDA MapOverlap kernel for applying column-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

template<int poly, int deviceType, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_ColMulti ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
unsigned int  n,
unsigned int  in_offset,
unsigned int  out_numelements,
pad,
unsigned int  blocksPerCol,
unsigned int  rowWidth,
unsigned int  colWidth 
)
Version:
0.7

CUDA MapOverlap kernel for applying column-wise overlap on matrix operands with multiple devices. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256. Device type : -1 (first), 0 (middleDevice), 1(last),

template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Row ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
unsigned int  n,
unsigned int  out_offset,
unsigned int  out_numelements,
pad,
unsigned int  blocksPerRow,
unsigned int  rowWidth 
)

CUDA MapOverlap kernel for applying row-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MatrixConvol2D_CL ( ) [static]

The mapoverlap OpenCL kernel to apply on neighbourhood of each element in the matrix.

static std::string skepu::MatrixConvolShared_CL ( ) [static]

The mapoverlap OpenCL kernel to apply on neighbourhood of each element in the matrix.

static std::string skepu::MatrixConvolSharedFilter_CL ( ) [static]

The mapoverlap OpenCL kernel with a filter matrix to apply on neighbourhood of each element in the matrix.

static std::string skepu::MatrixTranspose_CL ( ) [static]
Version:
0.7

OpenCL matrix transpose kernel.

template<typename T >
T skepu::max ( a,
b 
)

Helper: to calculate maximum.

template<typename T >
T skepu::min ( a,
b 
)

Helper: to calculate minimum.

template<typename T >
__global__ void skepu::transpose ( T *  odata,
T *  idata,
int  width,
int  height 
)

Matrix transpose CUDA kernel

 All Classes Namespaces Files Functions Enumerations Friends Defines