00001
00005 #ifndef MAPREDUCE_KERNELS_H
00006 #define MAPREDUCE_KERNELS_H
00007
00008 #ifdef SKEPU_OPENCL
00009
00010 #include <string>
00011
00012 namespace skepu
00013 {
00014
00032 static std::string UnaryMapReduceKernel_CL
00033 (
00034 "__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
00035 "{\n"
00036 " unsigned int blockSize = get_local_size(0);\n"
00037 " unsigned int tid = get_local_id(0);\n"
00038 " unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
00039 " unsigned int gridSize = blockSize*get_num_groups(0);\n"
00040 " TYPE result = 0;\n"
00041 " if(i < n)\n"
00042 " {\n"
00043 " result = FUNCTIONNAME_MAP(input[i], const1);\n"
00044 " i += gridSize;\n"
00045 " }\n"
00046 " while(i < n)\n"
00047 " {\n"
00048 " TYPE tempMap;\n"
00049 " tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"
00050 " result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
00051 " i += gridSize;\n"
00052 " }\n"
00053 " sdata[tid] = result;\n"
00054 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00055 " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00056 " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00057 " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00058 " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00059 " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00060 " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00061 " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00062 " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00063 " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00064 " if(tid == 0)\n"
00065 " {\n"
00066 " output[get_group_id(0)] = sdata[tid];\n"
00067 " }\n"
00068 "}\n"
00069 );
00070
00077 static std::string BinaryMapReduceKernel_CL
00078 (
00079 "__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
00080 "{\n"
00081 " unsigned int blockSize = get_local_size(0);\n"
00082 " unsigned int tid = get_local_id(0);\n"
00083 " unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
00084 " unsigned int gridSize = blockSize*get_num_groups(0);\n"
00085 " TYPE result = 0;\n"
00086 " if(i < n)\n"
00087 " {\n"
00088 " result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
00089 " i += gridSize;\n"
00090 " }\n"
00091 " while(i < n)\n"
00092 " {\n"
00093 " TYPE tempMap;\n"
00094 " tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"
00095 " result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
00096 " i += gridSize;\n"
00097 " }\n"
00098 " sdata[tid] = result;\n"
00099 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00100 " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00101 " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00102 " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00103 " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00104 " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00105 " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00106 " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00107 " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00108 " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00109 " if(tid == 0)\n"
00110 " {\n"
00111 " output[get_group_id(0)] = sdata[tid];\n"
00112 " }\n"
00113 "}\n"
00114 );
00115
00122 static std::string TrinaryMapReduceKernel_CL
00123 (
00124 "__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n"
00125 "{\n"
00126 " unsigned int blockSize = get_local_size(0);\n"
00127 " unsigned int tid = get_local_id(0);\n"
00128 " unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
00129 " unsigned int gridSize = blockSize*get_num_groups(0);\n"
00130 " TYPE result = 0;\n"
00131 " if(i < n)\n"
00132 " {\n"
00133 " result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
00134 " i += gridSize;\n"
00135 " }\n"
00136 " while(i < n)\n"
00137 " {\n"
00138 " TYPE tempMap;\n"
00139 " tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"
00140 " result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"
00141 " i += gridSize;\n"
00142 " }\n"
00143 " sdata[tid] = result;\n"
00144 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00145 " if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00146 " if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00147 " if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00148 " if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00149 " if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00150 " if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00151 " if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00152 " if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00153 " if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"
00154 " if(tid == 0)\n"
00155 " {\n"
00156 " output[get_group_id(0)] = sdata[tid];\n"
00157 " }\n"
00158 "}\n"
00159 );
00160
00165 }
00166
00167 #endif
00168
00169 #ifdef SKEPU_CUDA
00170
00171 namespace skepu
00172 {
00173
00191 template <typename T, typename UnaryFunc, typename BinaryFunc>
00192 __global__ void MapReduceKernel1_CU(UnaryFunc mapFunc, BinaryFunc reduceFunc, T* input, T* output, unsigned int n)
00193 {
00194 extern __shared__ char _sdata[];
00195 T* sdata = reinterpret_cast<T*>(_sdata);
00196
00197 unsigned int blockSize = blockDim.x;
00198 unsigned int tid = threadIdx.x;
00199 unsigned int i = blockIdx.x * blockSize + tid;
00200 unsigned int gridSize = blockSize*gridDim.x;
00201 T result = 0;
00202
00203 if(i < n)
00204 {
00205 result = mapFunc.CU(input[i]);
00206 i += gridSize;
00207 }
00208
00209 while(i < n)
00210 {
00211 T tempMap;
00212 tempMap = mapFunc.CU(input[i]);
00213 result = reduceFunc.CU(result, tempMap);
00214 i += gridSize;
00215 }
00216
00217 sdata[tid] = result;
00218
00219 __syncthreads();
00220
00221 if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
00222 if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
00223 if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
00224 if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
00225 if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
00226 if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
00227 if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
00228 if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
00229 if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
00230
00231 if(tid == 0)
00232 {
00233 output[blockIdx.x] = sdata[tid];
00234 }
00235 }
00236
00243 template <typename T, typename BinaryFunc1, typename BinaryFunc2>
00244 __global__ void MapReduceKernel2_CU(BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T* input1, T* input2, T* output, unsigned int n)
00245 {
00246 extern __shared__ char _sdata[];
00247 T* sdata = reinterpret_cast<T*>(_sdata);
00248
00249 unsigned int blockSize = blockDim.x;
00250 unsigned int tid = threadIdx.x;
00251 unsigned int i = blockIdx.x * blockSize + tid;
00252 unsigned int gridSize = blockSize*gridDim.x;
00253 T result = 0;
00254
00255 if(i < n)
00256 {
00257 result = mapFunc.CU(input1[i], input2[i]);
00258 i += gridSize;
00259 }
00260
00261 while(i < n)
00262 {
00263 T tempMap;
00264 tempMap = mapFunc.CU(input1[i], input2[i]);
00265 result = reduceFunc.CU(result, tempMap);
00266 i += gridSize;
00267 }
00268
00269 sdata[tid] = result;
00270
00271 __syncthreads();
00272
00273 if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
00274 if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
00275 if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
00276 if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
00277 if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
00278 if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
00279 if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
00280 if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
00281 if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
00282
00283 if(tid == 0)
00284 {
00285 output[blockIdx.x] = sdata[tid];
00286 }
00287 }
00288
00295 template <typename T, typename TrinaryFunc, typename BinaryFunc>
00296 __global__ void MapReduceKernel3_CU(TrinaryFunc mapFunc, BinaryFunc reduceFunc, T* input1, T* input2, T* input3, T* output, unsigned int n)
00297 {
00298 extern __shared__ char _sdata[];
00299 T* sdata = reinterpret_cast<T*>(_sdata);
00300
00301 unsigned int blockSize = blockDim.x;
00302 unsigned int tid = threadIdx.x;
00303 unsigned int i = blockIdx.x * blockSize + tid;
00304 unsigned int gridSize = blockSize*gridDim.x;
00305 T result = 0;
00306
00307 if(i < n)
00308 {
00309 result = mapFunc.CU(input1[i], input2[i], input3[i]);
00310 i += gridSize;
00311 }
00312
00313 while(i < n)
00314 {
00315 T tempMap;
00316 tempMap = mapFunc.CU(input1[i], input2[i], input3[i]);
00317 result = reduceFunc.CU(result, tempMap);
00318 i += gridSize;
00319 }
00320
00321 sdata[tid] = result;
00322
00323 __syncthreads();
00324
00325 if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
00326 if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
00327 if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
00328 if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 32]); } __syncthreads(); }
00329 if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 16]); } __syncthreads(); }
00330 if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 8]); } __syncthreads(); }
00331 if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 4]); } __syncthreads(); }
00332 if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 2]); } __syncthreads(); }
00333 if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = reduceFunc.CU(sdata[tid], sdata[tid + 1]); } __syncthreads(); }
00334
00335 if(tid == 0)
00336 {
00337 output[blockIdx.x] = sdata[tid];
00338 }
00339 }
00340
00345 }
00346
00347 #endif
00348
00349 #endif