|
SkePU 0.7
|
00001 00005 #ifndef REDUCE_KERNELS_H 00006 #define REDUCE_KERNELS_H 00007 00008 #ifdef SKEPU_OPENCL 00009 00010 #include <string> 00011 00012 namespace skepu 00013 { 00014 00032 static std::string ReduceKernel_CL( 00033 "__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata)\n" 00034 "{\n" 00035 " unsigned int blockSize = get_local_size(0);\n" 00036 " unsigned int tid = get_local_id(0);\n" 00037 " unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n" 00038 " unsigned int gridSize = blockSize*get_num_groups(0);\n" 00039 " TYPE result = 0;\n" 00040 " if(i < n)\n" 00041 " {\n" 00042 " result = input[i];\n" 00043 " i += gridSize;\n" 00044 " }\n" 00045 " while(i < n)\n" 00046 " {\n" 00047 " result = FUNCTIONNAME(result, input[i], (TYPE)0);\n" 00048 " i += gridSize;\n" 00049 " }\n" 00050 " sdata[tid] = result;\n" 00051 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00052 " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00053 " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00054 " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00055 " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00056 " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00057 " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00058 " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00059 " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00060 " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n" 00061 " if(tid == 0)\n" 00062 " {\n" 00063 " output[get_group_id(0)] = sdata[tid];\n" 00064 " }\n" 00065 "}\n" 00066 ); 00067 00072 } 00073 00074 #endif 00075 00076 #ifdef SKEPU_CUDA 00077 00078 namespace skepu 00079 { 00080 00098 template<typename T, typename BinaryFunc> 00099 __global__ void ReduceKernel_CU(BinaryFunc reduceFunc, T* input, T* output, unsigned int n) 00100 { 00101 //A bit ugly 00102 extern __shared__ char _sdata[]; 00103 T* sdata = reinterpret_cast<T*>(_sdata); 00104 00105 unsigned int blockSize = blockDim.x; 00106 unsigned int tid = threadIdx.x; 00107 unsigned int i = blockIdx.x * blockSize + tid; 00108 unsigned int gridSize = blockSize*gridDim.x; 00109 T result = 0; 00110 00111 if(i < n) 00112 { 00113 result = input[i]; 00114 i += gridSize; 00115 } 00116 00117 while(i < n) 00118 { 00119 result = reduceFunc.CU(result, input[i]); 00120 i += gridSize; 00121 } 00122 00123 sdata[tid] = result; 00124 00125 __syncthreads(); 00126 00127 if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); } 00128 if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); } 00129 if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); } 00130 if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); } 00131 if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); } 00132 if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); } 00133 if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); } 00134 if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); } 00135 if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); } 00136 00137 if(tid == 0) 00138 { 00139 output[blockIdx.x] = sdata[tid]; 00140 } 00141 } 00142 00147 } 00148 00149 #endif 00150 00151 #endif 00152
1.7.4