00001
00005 #ifndef SCAN_KERNELS_H
00006 #define SCAN_KERNELS_H
00007
00008 #ifdef SKEPU_OPENCL
00009
00010 #include <string>
00011
00012 namespace skepu
00013 {
00014
00033 static std::string ScanKernel_CL(
00034 "__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n"
00035 "{\n"
00036 " unsigned int threadIdx = get_local_id(0);\n"
00037 " unsigned int blockDim = get_local_size(0);\n"
00038 " unsigned int blockIdx = get_group_id(0);\n"
00039 " unsigned int gridDim = get_num_groups(0);\n"
00040 " int thid = threadIdx;\n"
00041 " int pout = 0;\n"
00042 " int pin = 1;\n"
00043 " int mem = get_global_id(0);\n"
00044 " int blockNr = blockIdx;\n"
00045 " unsigned int gridSize = blockDim*gridDim;\n"
00046 " unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"
00047 " int offset;\n"
00048 " while(blockNr < numBlocks)\n"
00049 " {\n"
00050 " sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"
00051 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00052 " for(offset = 1; offset < n; offset *=2)\n"
00053 " {\n"
00054 " pout = 1-pout;\n"
00055 " pin = 1-pout;\n"
00056 " if(thid >= offset)\n"
00057 " sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"
00058 " else\n"
00059 " sdata[pout*n+thid] = sdata[pin*n+thid];\n"
00060 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00061 " }\n"
00062 " if(thid == blockDim - 1)\n"
00063 " blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"
00064 " if(mem < numElements)\n"
00065 " output[mem] = sdata[pout*n+thid];\n"
00066 " mem += gridSize;\n"
00067 " blockNr += gridDim;\n"
00068 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00069 " }\n"
00070 "}\n"
00071 );
00072
00079 static std::string ScanUpdate_CL(
00080 "__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n"
00081 "{\n"
00082 " __local TYPE offset;\n"
00083 " __local TYPE inc_offset;\n"
00084 " unsigned int threadIdx = get_local_id(0);\n"
00085 " unsigned int blockDim = get_local_size(0);\n"
00086 " unsigned int blockIdx = get_group_id(0);\n"
00087 " unsigned int gridDim = get_num_groups(0);\n"
00088 " int thid = threadIdx;\n"
00089 " int blockNr = blockIdx;\n"
00090 " unsigned int gridSize = blockDim*gridDim;\n"
00091 " int mem = get_global_id(0);\n"
00092 " unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"
00093 " while(blockNr < numBlocks)\n"
00094 " {\n"
00095 " if(thid == 0)\n"
00096 " {\n"
00097 " if(isInclusive == 0)\n"
00098 " {\n"
00099 " offset = init;\n"
00100 " if(blockNr > 0)\n"
00101 " {\n"
00102 " offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"
00103 " inc_offset = sums[blockNr-1];\n"
00104 " }\n"
00105 " }\n"
00106 " else\n"
00107 " {\n"
00108 " if(blockNr > 0)\n"
00109 " offset = sums[blockNr-1];\n"
00110 " }\n"
00111 " }\n"
00112 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00113 " if(isInclusive == 1)\n"
00114 " {\n"
00115 " if(blockNr > 0)\n"
00116 " sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"
00117 " else\n"
00118 " sdata[thid] = (mem < n) ? data[mem] : 0;\n"
00119 " if(mem == n-1)\n"
00120 " *ret = sdata[thid];\n"
00121 " }\n"
00122 " else\n"
00123 " {\n"
00124 " if(mem == n-1)\n"
00125 " *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"
00126 " if(thid == 0)\n"
00127 " sdata[thid] = offset;\n"
00128 " else\n"
00129 " sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"
00130 " }\n"
00131 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00132 " if(mem < n)\n"
00133 " data[mem] = sdata[thid];\n"
00134 " mem += gridSize;\n"
00135 " blockNr += gridDim;\n"
00136 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00137 " }\n"
00138 "}\n"
00139 );
00140
00146 static std::string ScanAdd_CL(
00147 "__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n"
00148 "{\n"
00149 " int i = get_global_id(0);\n"
00150 " unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"
00151 " while(i < n)\n"
00152 " {\n"
00153 " data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"
00154 " i += gridSize;\n"
00155 " }\n"
00156 "}\n"
00157 );
00158
00163 }
00164
00165 #endif
00166
00167 #ifdef SKEPU_CUDA
00168
00169 namespace skepu
00170 {
00171
00190 template<typename T, typename BinaryFunc>
00191 __global__ void ScanKernel_CU(BinaryFunc scanFunc, T* input, T* output, T* blockSums, unsigned int n, unsigned int numElements)
00192 {
00193 extern __shared__ char _sdata[];
00194 T* sdata = reinterpret_cast<T*>(_sdata);
00195
00196 int thid = threadIdx.x;
00197 int pout = 0;
00198 int pin = 1;
00199 int mem = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
00200 int blockNr = blockIdx.x;
00201 unsigned int gridSize = blockDim.x*gridDim.x;
00202 unsigned int numBlocks = numElements/(blockDim.x) + (numElements%(blockDim.x) == 0 ? 0:1);
00203 int offset;
00204
00205 while(blockNr < numBlocks)
00206 {
00207 sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;
00208
00209 __syncthreads();
00210
00211 for(offset = 1; offset < n; offset *=2)
00212 {
00213 pout = 1-pout;
00214 pin = 1-pout;
00215 if(thid >= offset)
00216 sdata[pout*n+thid] = scanFunc.CU(sdata[pin*n+thid], sdata[pin*n+thid-offset]);
00217 else
00218 sdata[pout*n+thid] = sdata[pin*n+thid];
00219
00220 __syncthreads();
00221 }
00222 if(thid == blockDim.x - 1)
00223 blockSums[blockNr] = sdata[pout*n+blockDim.x-1];
00224
00225 if(mem < numElements)
00226 output[mem] = sdata[pout*n+thid];
00227
00228 mem += gridSize;
00229 blockNr += gridDim.x;
00230
00231 __syncthreads();
00232 }
00233 }
00234
00241 template <typename T, typename BinaryFunc>
00242 __global__ void ScanUpdate_CU(BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, int n, T *ret)
00243 {
00244 extern __shared__ char _sdata[];
00245 T* sdata = reinterpret_cast<T*>(_sdata);
00246
00247 __shared__ T offset;
00248 __shared__ T inc_offset;
00249
00250 int thid = threadIdx.x;
00251 int blockNr = blockIdx.x;
00252 unsigned int gridSize = blockDim.x*gridDim.x;
00253 int mem = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
00254 unsigned int numBlocks = n/(blockDim.x) + (n%(blockDim.x) == 0 ? 0:1);
00255
00256 while(blockNr < numBlocks)
00257 {
00258 if(thid == 0)
00259 {
00260 if(isInclusive == 0)
00261 {
00262 offset = init;
00263 if(blockNr > 0)
00264 {
00265 offset = scanFunc.CU(offset, sums[blockNr-1]);
00266 inc_offset = sums[blockNr-1];
00267 }
00268 }
00269 else
00270 {
00271 if(blockNr > 0)
00272 offset = sums[blockNr-1];
00273 }
00274 }
00275
00276 __syncthreads();
00277
00278 if(isInclusive == 1)
00279 {
00280 if(blockNr > 0)
00281 sdata[thid] = (mem < n) ? scanFunc.CU(offset, data[mem]) : 0;
00282 else
00283 sdata[thid] = (mem < n) ? data[mem] : 0;
00284 if(mem == n-1)
00285 *ret = sdata[thid];
00286 }
00287 else
00288 {
00289 if(mem == n-1)
00290 *ret = scanFunc.CU(inc_offset, data[mem]);
00291 if(thid == 0)
00292 sdata[thid] = offset;
00293 else
00294 sdata[thid] = (mem-1 < n) ? scanFunc.CU(offset, data[mem-1]) : 0;
00295 }
00296
00297 __syncthreads();
00298
00299 if(mem < n)
00300 data[mem] = sdata[thid];
00301
00302 mem += gridSize;
00303 blockNr += gridDim.x;
00304
00305 __syncthreads();
00306 }
00307 }
00308
00314 template <typename T, typename BinaryFunc>
00315 __global__ void ScanAdd_CU(BinaryFunc scanFunc, T *data, T sum, int n)
00316 {
00317 int i = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
00318 unsigned int gridSize = blockDim.x*gridDim.x;
00319
00320 while(i < n)
00321 {
00322 data[i] = scanFunc.CU(data[i], sum);
00323 i += gridSize;
00324 }
00325 }
00326
00331 }
00332
00333 #endif
00334
00335 #endif