The main nemaspace for SkePU library. More...
Classes | |
| class | Generate |
| A class representing the Generate skeleton. More... | |
| class | Map |
| A class representing the Map skeleton. More... | |
| class | MapArray |
| A class representing the MapArray skeleton. More... | |
| class | MapOverlap |
| A class representing the MapOverlap skeleton. More... | |
| class | MapReduce |
| A class representing the MapReduce skeleton. More... | |
| class | Reduce |
| A class representing the Reduce skeleton. More... | |
| class | Scan |
| A class representing the Scan skeleton. More... | |
| class | DataCollector2D |
| A class that can be used to collect 2D data. More... | |
| class | Device_CL |
| A class representing an OpenCL device. More... | |
| class | Device_CU |
| A class representing a CUDA device. More... | |
| class | DeviceMemPointer_CL |
| A class representing an OpenCL device memory allocation. More... | |
| class | DeviceMemPointer_CU |
| A class representing a CUDA device memory allocation. More... | |
| struct | openclGenProp |
| struct | openclDeviceProp |
| class | Environment |
| A class representing a execution environment. More... | |
| class | ExecPlan |
| A class that describes an execution plan. More... | |
| struct | generateThreadFuncArgs_CU |
| struct | mapThreadFunc1Args_CU |
| struct | mapThreadFunc2Args_CU |
| struct | mapThreadFunc3Args_CU |
| struct | mapArrayThreadFuncArgs_CU |
| struct | mapOverlapThreadFuncArgs_CU |
| struct | mapReduceThreadFuncArgs1_CU |
| struct | mapReduceThreadFuncArgs2_CU |
| struct | mapReduceThreadFuncArgs3_CU |
| struct | reduceThreadFuncArgs_CU |
| struct | scanThreadFuncArgs_CU |
| class | Threads |
| class | TimerLinux_GTOD |
| A class that can be used measure time on Linux systems. More... | |
| class | Vector |
| A vector container class, implemented as a wrapper for std::vector. More... | |
Enumerations | |
| enum | EdgePolicy |
| enum | ScanType |
Functions | |
| template<typename GenerateFunc , typename OutputIterator > | |
| static void * | generateThreadFunc_CU (void *_args) |
| static std::string | GenerateKernel_CL ("__kernel void GenerateKernel_KERNELNAME(__global TYPE* output, unsigned int numElements, TYPE const1, unsigned int indexOffset)\n""{\n"" unsigned int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < numElements)\n"" {\n"" output[i] = FUNCTIONNAME(i+indexOffset, const1);\n"" i += gridSize;\n"" }\n""}\n") |
| template<typename T , typename GenerateFunc > | |
| __global__ void | GenerateKernel_CU (GenerateFunc generateFunc, T *output, unsigned int numElements, unsigned int indexOffset) |
| template<typename UnaryFunc , typename InputIterator , typename OutputIterator > | |
| static void * | mapThreadFunc1_CU (void *_args) |
| template<typename BinaryFunc , typename Input1Iterator , typename Input2Iterator , typename OutputIterator > | |
| static void * | mapThreadFunc2_CU (void *_args) |
| template<typename TrinaryFunc , typename Input1Iterator , typename Input2Iterator , typename Input3Iterator , typename OutputIterator > | |
| static void * | mapThreadFunc3_CU (void *_args) |
| static std::string | UnaryMapKernel_CL ("__kernel void UnaryMapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int numElements, TYPE const1)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < numElements)\n"" {\n"" output[i] = FUNCTIONNAME(input[i], const1);\n"" i += gridSize;\n"" }\n""}\n") |
| static std::string | BinaryMapKernel_CL ("__kernel void BinaryMapKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, TYPE const1)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" output[i] = FUNCTIONNAME(input1[i], input2[i], const1);\n"" i += gridSize;\n"" }\n""}\n") |
| static std::string | TrinaryMapKernel_CL ("__kernel void TrinaryMapKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, TYPE const1)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" output[i] = FUNCTIONNAME(input1[i], input2[i], input3[i], const1);\n"" i += gridSize;\n"" }\n""}\n") |
| template<typename T , typename UnaryFunc > | |
| __global__ void | MapKernelUnary_CU (UnaryFunc mapFunc, T *input, T *output, unsigned int n) |
| template<typename T , typename BinaryFunc > | |
| __global__ void | MapKernelBinary_CU (BinaryFunc mapFunc, T *input1, T *input2, T *output, unsigned int n) |
| template<typename T , typename TrinaryFunc > | |
| __global__ void | MapKernelTrinary_CU (TrinaryFunc mapFunc, T *input1, T *input2, T *input3, T *output, unsigned int n) |
| template<typename ArrayFunc , typename Input1Iterator , typename Input2Iterator , typename OutputIterator > | |
| static void * | mapArrayThreadFunc_CU (void *_args) |
| static std::string | MapArrayKernel_CL ("__kernel void MapArrayKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" output[i] = FUNCTIONNAME(&input1[0], input2[i]);\n"" i += gridSize;\n"" }\n""}\n") |
| template<typename T , typename ArrayFunc > | |
| __global__ void | MapArrayKernel_CU (ArrayFunc mapArrayFunc, T *input1, T *input2, T *output, unsigned int n) |
| template<typename OverlapFunc , typename InputIterator , typename OutputIterator > | |
| static void * | mapOverlapThreadFunc_CU (void *_args) |
| static std::string | MapOverlapKernel_CL ("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n") |
| template<int poly, typename T , typename OverlapFunc > | |
| __global__ void | MapOverlapKernel_CU (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad) |
| template<typename UnaryFunc , typename BinaryFunc , typename InputIterator > | |
| static void * | mapReduceThreadFunc1_CU (void *_args) |
| template<typename BinaryFunc1 , typename BinaryFunc2 , typename Input1Iterator , typename Input2Iterator > | |
| static void * | mapReduceThreadFunc2_CU (void *_args) |
| template<typename TrinaryFunc , typename BinaryFunc , typename Input1Iterator , typename Input2Iterator , typename Input3Iterator > | |
| static void * | mapReduceThreadFunc3_CU (void *_args) |
| static std::string | UnaryMapReduceKernel_CL ("__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n") |
| static std::string | BinaryMapReduceKernel_CL ("__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n") |
| static std::string | TrinaryMapReduceKernel_CL ("__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n") |
| template<typename T , typename UnaryFunc , typename BinaryFunc > | |
| __global__ void | MapReduceKernel1_CU (UnaryFunc mapFunc, BinaryFunc reduceFunc, T *input, T *output, unsigned int n) |
| template<typename T , typename BinaryFunc1 , typename BinaryFunc2 > | |
| __global__ void | MapReduceKernel2_CU (BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T *input1, T *input2, T *output, unsigned int n) |
| template<typename T , typename TrinaryFunc , typename BinaryFunc > | |
| __global__ void | MapReduceKernel3_CU (TrinaryFunc mapFunc, BinaryFunc reduceFunc, T *input1, T *input2, T *input3, T *output, unsigned int n) |
| template<typename BinaryFunc , typename InputIterator > | |
| static void * | reduceThreadFunc_CU (void *_args) |
| static std::string | ReduceKernel_CL ("__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = input[i];\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n") |
| template<typename T , typename BinaryFunc > | |
| __global__ void | ReduceKernel_CU (BinaryFunc reduceFunc, T *input, T *output, unsigned int n) |
| template<typename BinaryFunc , typename T > | |
| static T | scanLargeVectorRecursivelyM_CU (DeviceMemPointer_CU< T > *input, DeviceMemPointer_CU< T > *output, std::vector< DeviceMemPointer_CU< T > * > &blockSums, unsigned int numElements, int level, ScanType type, T init, Device_CU *device, BinaryFunc scanFunc) |
| template<typename BinaryFunc , typename InputIterator , typename OutputIterator > | |
| static void * | scanThreadFunc_CU (void *_args) |
| template<typename BinaryFunc , typename InputIterator , typename OutputIterator > | |
| static void * | scanAddThreadFunc_CU (void *_args) |
| static std::string | ScanKernel_CL ("__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n""{\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int pout = 0;\n"" int pin = 1;\n"" int mem = get_global_id(0);\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"" int offset;\n"" while(blockNr < numBlocks)\n"" {\n"" sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" for(offset = 1; offset < n; offset *=2)\n"" {\n"" pout = 1-pout;\n"" pin = 1-pout;\n"" if(thid >= offset)\n"" sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"" else\n"" sdata[pout*n+thid] = sdata[pin*n+thid];\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n"" if(thid == blockDim - 1)\n"" blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"" if(mem < numElements)\n"" output[mem] = sdata[pout*n+thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n") |
| static std::string | ScanUpdate_CL ("__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n""{\n"" __local TYPE offset;\n"" __local TYPE inc_offset;\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" int mem = get_global_id(0);\n"" unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"" while(blockNr < numBlocks)\n"" {\n"" if(thid == 0)\n"" {\n"" if(isInclusive == 0)\n"" {\n"" offset = init;\n"" if(blockNr > 0)\n"" {\n"" offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"" inc_offset = sums[blockNr-1];\n"" }\n"" }\n"" else\n"" {\n"" if(blockNr > 0)\n"" offset = sums[blockNr-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(isInclusive == 1)\n"" {\n"" if(blockNr > 0)\n"" sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"" else\n"" sdata[thid] = (mem < n) ? data[mem] : 0;\n"" if(mem == n-1)\n"" *ret = sdata[thid];\n"" }\n"" else\n"" {\n"" if(mem == n-1)\n"" *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"" if(thid == 0)\n"" sdata[thid] = offset;\n"" else\n"" sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(mem < n)\n"" data[mem] = sdata[thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n") |
| static std::string | ScanAdd_CL ("__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"" i += gridSize;\n"" }\n""}\n") |
| template<typename T , typename BinaryFunc > | |
| __global__ void | ScanKernel_CU (BinaryFunc scanFunc, T *input, T *output, T *blockSums, unsigned int n, unsigned int numElements) |
| template<typename T , typename BinaryFunc > | |
| __global__ void | ScanUpdate_CU (BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, int n, T *ret) |
| template<typename T , typename BinaryFunc > | |
| __global__ void | ScanAdd_CU (BinaryFunc scanFunc, T *data, T sum, int n) |
The main nemaspace for SkePU library.
All classes and functions in the SkePU library are in this namespace.
| enum skepu::EdgePolicy |
Enumeration of the different edge policies (what happens when a read outside the vector is perfromed) that the map overlap skeletons support.
| enum skepu::ScanType |
Enumeration of the two types of Scan that can be performed: Inclusive and Exclusive.
| static void* skepu::generateThreadFunc_CU | ( | void * | _args | ) | [static] |
When applying the Generate skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
| _args | void pointer to struct containing arguments needed to do generating. |
References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapArrayThreadFunc_CU | ( | void * | _args | ) | [static] |
When applying the MapArray skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on seperate ranges. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
| _args | void pointer to struct containing arguments needed to perform the MapArray. |
References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapOverlapThreadFunc_CU | ( | void * | _args | ) | [static] |
When applying the MapOverlap skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapReduceThreadFunc1_CU | ( | void * | _args | ) | [static] |
Performs the Map on one range of elements and Reduce on the result with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapReduceThreadFunc2_CU | ( | void * | _args | ) | [static] |
Performs the Map on two ranges of elements and Reduce on the result with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapReduceThreadFunc3_CU | ( | void * | _args | ) | [static] |
Performs the Map on three range of elements and Reduce on the result with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapThreadFunc1_CU | ( | void * | _args | ) | [static] |
When applying the Map skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
The skeleton must have been created with a unary user function when using this. Maps only one range of vectors.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapThreadFunc2_CU | ( | void * | _args | ) | [static] |
When applying the Map skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on seperate ranges. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
The skeleton must have been created with a binary user function when using this. Maps two ranges of vectors.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::mapThreadFunc3_CU | ( | void * | _args | ) | [static] |
When applying the Map skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on seperate ranges. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
The skeleton must have been created with a trinary user function when using this. Maps three ranges of vectors.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::reduceThreadFunc_CU | ( | void * | _args | ) | [static] |
Performs the Reduction on a range of elements with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.
| _args | void pointer to struct containing arguments needed to do mapping. |
References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::scanAddThreadFunc_CU | ( | void * | _args | ) | [static] |
When applying the Scan skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA. This function adds the partial results from each device after a multi-GPU CUDA Scan.
| _args | void pointer to struct containing arguments needed for the scan. |
References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

| static T skepu::scanLargeVectorRecursivelyM_CU | ( | DeviceMemPointer_CU< T > * | input, | |
| DeviceMemPointer_CU< T > * | output, | |||
| std::vector< DeviceMemPointer_CU< T > * > & | blockSums, | |||
| unsigned int | numElements, | |||
| int | level, | |||
| ScanType | type, | |||
| T | init, | |||
| Device_CU * | device, | |||
| BinaryFunc | scanFunc | |||
| ) | [static] |
Scans a Vector using the same recursive algorithm as NVIDIA SDK. First the vector is scanned producing partial results for each block. Then the function is called recursively to scan these partial results, which in turn can produce partial results and so on. This continues until only one block with partial results is left. Used by multi-GPU CUDA implementation.
| input | Pointer to the device memory where the input vector resides. | |
| output | Pointer to the device memory where the output vector resides. | |
| blockSums | A Vector of device memory pointers where the partial results for each level is stored. | |
| numElements | The number of elements to scan. | |
| level | The current recursion level. | |
| type | The scan type, either INCLUSIVE or EXCLUSIVE. | |
| init | The initialization value for exclusive scans. | |
| device | Pointer to the device that will be used for the scan. | |
| scanFunc | The user function used in the scan. |
References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

| static void* skepu::scanThreadFunc_CU | ( | void * | _args | ) | [static] |
When applying the Scan skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.
| _args | void pointer to struct containing arguments needed for the scan. |
References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxThreads(), and scanLargeVectorRecursivelyM_CU().

1.7.1