SkePU 0.7
include_starpu/skepu/src/reduce_kernels.h
Go to the documentation of this file.
00001 
00005 #ifndef REDUCE_KERNELS_H
00006 #define REDUCE_KERNELS_H
00007 
00008 #ifdef SKEPU_OPENCL
00009 
00010 #include <string>
00011 
00012 namespace skepu
00013 {
00014 
00032 static std::string ReduceKernel_CL(
00033 "__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata)\n"
00034 "{\n"
00035 "    unsigned int blockSize = get_local_size(0);\n"
00036 "    unsigned int tid = get_local_id(0);\n"
00037 "    unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
00038 "    unsigned int gridSize = blockSize*get_num_groups(0);\n"
00039 "    TYPE result = 0;\n"
00040 "    if(i < n)\n"
00041 "    {\n"
00042 "        result = input[i];\n"
00043 "        i += gridSize;\n"
00044 "    }\n"
00045 "    while(i < n)\n"
00046 "    {\n"
00047 "        result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"
00048 "        i += gridSize;\n"
00049 "    }\n"
00050 "    sdata[tid] = result;\n"
00051 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00052 "    if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00053 "    if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00054 "    if(blockSize >= 128) { if (tid <  64 && tid +  64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +  64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00055 "    if(blockSize >=  64) { if (tid <  32 && tid +  32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +  32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00056 "    if(blockSize >=  32) { if (tid <  16 && tid +  16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +  16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00057 "    if(blockSize >=  16) { if (tid <   8 && tid +   8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +   8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00058 "    if(blockSize >=   8) { if (tid <   4 && tid +   4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +   4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00059 "    if(blockSize >=   4) { if (tid <   2 && tid +   2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +   2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00060 "    if(blockSize >=   2) { if (tid <   1 && tid +   1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid +   1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00061 "    if(tid == 0)\n"
00062 "    {\n"
00063 "        output[get_group_id(0)] = sdata[tid];\n"
00064 "    }\n"
00065 "}\n"
00066 );
00067 
00072 }
00073 
00074 #endif
00075 
00076 #ifdef SKEPU_CUDA
00077 
00078 namespace skepu
00079 {
00080 
00098 template<typename T, typename BinaryFunc>
00099 __global__ void ReduceKernel_CU(BinaryFunc reduceFunc, T* input, T* output, unsigned int n)
00100 {
00101     //A bit ugly
00102     extern __shared__ char _sdata[];
00103     T* sdata = reinterpret_cast<T*>(_sdata);
00104 
00105     unsigned int blockSize = blockDim.x;
00106     unsigned int tid = threadIdx.x;
00107     unsigned int i = blockIdx.x * blockSize + tid;
00108     unsigned int gridSize = blockSize*gridDim.x;
00109     T result = 0;
00110 
00111     if(i < n)
00112     {
00113         result = input[i];
00114         i += gridSize;
00115     }
00116 
00117     while(i < n)
00118     {
00119         result = reduceFunc.CU(result, input[i]);
00120         i += gridSize;
00121     }
00122 
00123     sdata[tid] = result;
00124 
00125     __syncthreads();
00126 
00127     if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
00128     if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
00129     if(blockSize >= 128) { if (tid <  64 && tid +  64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +  64]); } __syncthreads(); }
00130     if(blockSize >=  64) { if (tid <  32 && tid +  32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +  32]); } __syncthreads(); }
00131     if(blockSize >=  32) { if (tid <  16 && tid +  16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +  16]); } __syncthreads(); }
00132     if(blockSize >=  16) { if (tid <   8 && tid +   8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +   8]); } __syncthreads(); }
00133     if(blockSize >=   8) { if (tid <   4 && tid +   4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +   4]); } __syncthreads(); }
00134     if(blockSize >=   4) { if (tid <   2 && tid +   2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +   2]); } __syncthreads(); }
00135     if(blockSize >=   2) { if (tid <   1 && tid +   1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid +   1]); } __syncthreads(); }
00136 
00137     if(tid == 0)
00138     {
00139         output[blockIdx.x] = sdata[tid];
00140     }
00141 }
00142 
00147 }
00148 
00149 #endif
00150 
00151 #endif
00152 
 All Classes Namespaces Files Functions Enumerations Friends Defines