SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
Functions
MapOverlap Kernels
Collaboration diagram for MapOverlap Kernels:

Functions

static std::string skepu::MatrixConvolSharedFilter_CL ("__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")
 
static std::string skepu::MatrixConvol2D_CL ("__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}")
 
static std::string skepu::MatrixConvolShared_CL ("__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")
 
template<typename T >
skepu::max (T a, T b)
 
template<typename T >
skepu::min (T a, T b)
 
template<typename T >
size_t skepu::calculateTiling (size_t regCountPerThread, size_t filterSizeX, size_t filterSizeY, size_t inputSizeX, bool maximizeTiling=false)
 
template<typename T , typename OverlapFunc >
__global__ void skepu::conv_cuda_2D_kernel (OverlapFunc mapOverlapFunc, T *input, T *output, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_kernel (T *input, T *output, const size_t in_rows, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_kernel (T *input, T *output, const size_t numTiles, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_2_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_4_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_6_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_8_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_10_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_12_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_14_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_16_kernel (T *input, T *output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)
 
static std::string skepu::MapOverlapKernel_CL ("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
 
static std::string skepu::MapOverlapKernel_CL_Matrix_Row ("__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerRow, size_t rowWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" size_t tmp= (get_group_id(0) % blocksPerRow);\n"" size_t tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? 
input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
 
static std::string skepu::MapOverlapKernel_CL_Matrix_Col ("__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? 
input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
 
static std::string skepu::MapOverlapKernel_CL_Matrix_ColMulti ("__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t in_offset, size_t out_numelements, int poly, int deviceType, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" \n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0) \n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? 
wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? 
input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( arrInd < out_numelements )\n"" {\n"" output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
 
template<typename T >
__global__ void skepu::transpose (T *odata, T *idata, size_t width, size_t height)
 
template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad)
 
template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Row (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerRow, size_t rowWidth)
 
template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Col (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t out_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
 
template<int poly, int deviceType, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_ColMulti (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, size_t n, size_t in_offset, size_t out_numelements, T pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth)
 

Detailed Description

Definitions of CUDA and OpenCL kernels for the MapOverlap skeleton.

Function Documentation

template<typename T >
size_t skepu::calculateTiling ( size_t  regCountPerThread,
size_t  filterSizeX,
size_t  filterSizeY,
size_t  inputSizeX,
bool  maximizeTiling = false 
)

Helper function to calculate the tiling factor.

References skepu::min().

Here is the call graph for this function:

template<typename T , typename OverlapFunc >
__global__ void skepu::conv_cuda_2D_kernel ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The 2D MapOverlap CUDA kernel that applies the given user function to the neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_kernel ( T *  input,
T *  output,
const size_t  in_rows,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_10_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 10); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_12_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The MapOverlap CUDA kernel with tiling support (tiling factor: 12); with (or without) a filter matrix to apply on the neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_14_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 14); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_16_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 16); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_2_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 2); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_4_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 4); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_6_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 6); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_8_kernel ( T *  input,
T *  output,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support (tiling factor: 8); with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

template<bool useFilter, typename T >
__global__ void skepu::conv_cuda_shared_tiling_kernel ( T *  input,
T *  output,
const size_t  numTiles,
const size_t  in_cols,
const size_t  out_rows,
const size_t  out_cols,
const size_t  filter_rows,
const size_t  filter_cols,
size_t  in_pitch,
size_t  out_pitch,
const size_t  sharedRows,
const size_t  sharedCols 
)

The mapoverlap CUDA kernel with tiling support; with (or without) a filter matrix to apply on neighbourhood of each element in the matrix.

static std::string skepu::MapOverlapKernel_CL ( "__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n"  )
static

OpenCL MapOverlap kernel for vectors. It uses one device thread per element, so the maximum number of device threads limits the number of elements this kernel can handle. In addition, the amount of shared memory and the maximum block size limit the maximum overlap that can be used, typically restricting the overlap to < 256.

static std::string skepu::MapOverlapKernel_CL_Matrix_Col ( "__kernel void MapOverlapKernel_MatColWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[arrInd];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? 
input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+(overlap*rowWidth))] : input[tmp2+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (arrInd >= out_offset) && (arrInd < out_offset+out_numelements) )\n"" {\n"" output[arrInd-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n"  )
static

OpenCL MapOverlap kernel for applying column-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MapOverlapKernel_CL_Matrix_ColMulti ( "__kernel void MapOverlapKernel_MatColWiseMulti_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t in_offset, size_t out_numelements, int poly, int deviceType, TYPE pad, size_t blocksPerCol, size_t rowWidth, size_t colWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerCol);\n"" size_t tmp= (get_group_id(0) % blocksPerCol);\n"" size_t tmp2= (get_group_id(0) / blocksPerCol);\n"" size_t arrInd = (tid + tmp*get_local_size(0))*rowWidth + tmp2;\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : pad;\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" \n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0) \n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : pad;\n"" }\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : ((i-n < overlap) ? wrap[(i-n)+ (overlap * tmp2)] : pad);\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? 
wrap[tid+(overlap * tmp2)] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? input[(arrInd+in_offset+(overlap*rowWidth))] : wrap[(overlap * tmp2)+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[arrInd+in_offset] : input[n+in_offset-1];\n"" if(deviceType == -1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2] : input[(arrInd-(overlap*rowWidth))];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 0)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = input[(arrInd+in_offset+(overlap*rowWidth))];\n"" }\n"" }\n"" else if(deviceType == 1)\n"" {\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = input[arrInd];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (arrInd+(overlap*rowWidth)) < n && (tmp!=(blocksPerCol-1))) ? 
input[(arrInd+in_offset+(overlap*rowWidth))] : input[tmp2+in_offset+(colWidth-1)*rowWidth];\n"" }\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( arrInd < out_numelements )\n"" {\n"" output[arrInd] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n"  )
static

OpenCL MapOverlap kernel for applying column-wise overlap on matrix operands when using multiple GPUs. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MapOverlapKernel_CL_Matrix_Row ( "__kernel void MapOverlapKernel_MatRowWise_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, size_t n, size_t overlap, size_t out_offset, size_t out_numelements, int poly, TYPE pad, size_t blocksPerRow, size_t rowWidth, __local TYPE* sdata)\n""{\n"" size_t tid = get_local_id(0);\n"" size_t i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" size_t wrapIndex= 2 * overlap * (int)(get_group_id(0)/blocksPerRow);\n"" size_t tmp= (get_group_id(0) % blocksPerRow);\n"" size_t tmp2= (get_group_id(0) / blocksPerRow);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && tmp!=(blocksPerRow-1)) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[(overlap+(i-n))+ wrapIndex];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? wrap[tid+wrapIndex] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && i+overlap < n && tmp!=(blocksPerRow-1)) ? input[i+overlap] : wrap[overlap+wrapIndex+(tid+overlap-get_local_size(0))];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (tmp==0) ? input[tmp2*rowWidth] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != (get_num_groups(0)-1) && (i+overlap < n) && (tmp!=(blocksPerRow-1))) ? 
input[i+overlap] : input[(tmp2+1)*rowWidth-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n"  )
static

OpenCL MapOverlap kernel for applying row-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
size_t  n,
size_t  out_offset,
size_t  out_numelements,
T  pad 
)

CUDA MapOverlap kernel for vector operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Col ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
size_t  n,
size_t  out_offset,
size_t  out_numelements,
T  pad,
size_t  blocksPerCol,
size_t  rowWidth,
size_t  colWidth 
)

CUDA MapOverlap kernel for applying column-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

template<int poly, int deviceType, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_ColMulti ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
size_t  n,
size_t  in_offset,
size_t  out_numelements,
T  pad,
size_t  blocksPerCol,
size_t  rowWidth,
size_t  colWidth 
)

CUDA MapOverlap kernel for applying column-wise overlap on matrix operands with multiple devices. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256. Device type: -1 (first device), 0 (middle device), 1 (last device).

template<int poly, typename T , typename OverlapFunc >
__global__ void skepu::MapOverlapKernel_CU_Matrix_Row ( OverlapFunc  mapOverlapFunc,
T *  input,
T *  output,
T *  wrap,
size_t  n,
size_t  out_offset,
size_t  out_numelements,
T  pad,
size_t  blocksPerRow,
size_t  rowWidth 
)

CUDA MapOverlap kernel for applying row-wise overlap on matrix operands. It uses one device thread per element so maximum number of device threads limits the maximum number of elements this kernel can handle. Also the amount of shared memory and the maximum blocksize limits the maximum overlap that is possible to use, typically limits the overlap to < 256.

static std::string skepu::MatrixConvol2D_CL ( "__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[y*out_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) * sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}"  )
static

The MapOverlap OpenCL kernel that applies a user function to the neighbourhood of each element in the matrix.

static std::string skepu::MatrixConvolShared_CL ( "__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}"  )
static

The MapOverlap OpenCL kernel that computes the average of the neighbourhood of each element in the matrix.

static std::string skepu::MatrixConvolSharedFilter_CL ( "__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[y*in_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y * sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[j*filter_cols+i];\n"" }\n"" }\n"" output[y*out_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}"  )
static

The MapOverlap OpenCL kernel that applies a filter matrix to the neighbourhood of each element in the matrix.

template<typename T >
T skepu::max ( T  a,
T  b 
)
template<typename T >
T skepu::min ( T  a,
T  b 
)
template<typename T >
__global__ void skepu::transpose ( T *  odata,
T *  idata,
size_t  width,
size_t  height 
)

Matrix transpose CUDA kernel