Contains the OpenCL and CUDA kernels for the MapOverlap convolution which supports overlap of neighbouring elements. More...

#include <string>

Include dependency graph for mapoverlap_convol_kernels.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Namespaces
	skepu
	The main namespace for SkePU library.

Functions
static std::string	skepu::MatrixConvolSharedFilter_CL ("__kernel void conv_opencl_shared_filter_KERNELNAME(__global TYPE* input, __global TYPE* output, __constant TYPE* filter, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[yin_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ] * filter[jfilter_cols+i];\n"" }\n"" }\n"" output[yout_pitch+x] = sum / (filter_rows * filter_cols);\n"" }\n""}")

static std::string	skepu::MatrixConvol2D_CL ("__kernel void conv_opencl_2D_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t stride, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[yin_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" output[yout_pitch+x] = FUNCTIONNAME(&(sdata[(get_local_id(1)+(filter_rows/2)) sharedCols + (get_local_id(0)+(filter_cols/2))]), stride);\n"" }\n""}")

static std::string	skepu::MatrixConvolShared_CL ("__kernel void conv_opencl_shared_KERNELNAME(__global TYPE* input, __global TYPE* output, size_t in_rows, size_t in_cols, size_t out_rows, size_t out_cols, size_t filter_rows, size_t filter_cols, size_t in_pitch, size_t out_pitch, size_t sharedRows, size_t sharedCols, __local TYPE* sdata)\n""{\n"" size_t xx = ( (size_t)(get_global_id(0)/get_local_size(0))) * get_local_size(0);\n"" size_t yy = ( (size_t)(get_global_id(1)/get_local_size(1))) * get_local_size(1);\n"" size_t x = get_global_id(0);\n"" size_t y = get_global_id(1);\n"" if(x<(out_cols+filter_cols-1) && y<(out_rows+filter_rows-1))\n"" {\n"" size_t sharedIdx = get_local_id(1) * sharedCols + get_local_id(0);\n"" sdata[sharedIdx]= input[yin_pitch + x];\n"" size_t shared_x= get_local_id(0)+get_local_size(0);\n"" size_t shared_y= get_local_id(1);\n"" while(shared_y<sharedRows)\n"" {\n"" while(shared_x<sharedCols)\n"" {\n"" sharedIdx = shared_y sharedCols + shared_x; \n"" sdata[sharedIdx]= input[(yy+shared_y) * in_pitch + xx + shared_x];\n"" shared_x = shared_x + get_local_size(0);\n"" }\n"" shared_x = get_local_id(0);\n"" shared_y = shared_y + get_local_size(1);\n"" } \n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(x<out_cols && y<out_rows)\n"" {\n"" TYPE sum=0;\n"" for(size_t j=0;j<filter_rows;j++) \n"" {\n"" for(size_t i=0;i<filter_cols;i++) \n"" {\n"" sum += sdata[(get_local_id(1)+j) * sharedCols + (get_local_id(0)+i) ];\n"" }\n"" }\n"" output[yout_pitch+x] = sum / (filter_rows filter_cols);\n"" }\n""}")

template<typename T >
T	skepu::max (T a, T b)

template<typename T >
T	skepu::min (T a, T b)

template<typename T >
size_t	skepu::calculateTiling (size_t regCountPerThread, size_t filterSizeX, size_t filterSizeY, size_t inputSizeX, bool maximizeTiling=false)

template<typename T , typename OverlapFunc >
__global__ void	skepu::conv_cuda_2D_kernel (OverlapFunc mapOverlapFunc, T input, T output, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_kernel (T input, T output, const size_t in_rows, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_kernel (T input, T output, const size_t numTiles, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_2_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_4_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_6_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_8_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_10_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_12_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_14_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

template<bool useFilter, typename T >
__global__ void	skepu::conv_cuda_shared_tiling_16_kernel (T input, T output, const size_t in_cols, const size_t out_rows, const size_t out_cols, const size_t filter_rows, const size_t filter_cols, size_t in_pitch, size_t out_pitch, const size_t sharedRows, const size_t sharedCols)

Detailed Description

Contains the OpenCL and CUDA kernels for the MapOverlap convolution which supports overlap of neighbouring elements.

Namespaces

Functions

Detailed Description