Classes | Enumerations | Functions

skepu Namespace Reference

The main nemaspace for SkePU library. More...

Classes

class  Generate
 A class representing the Generate skeleton. More...
class  Map
 A class representing the Map skeleton. More...
class  MapArray
 A class representing the MapArray skeleton. More...
class  MapOverlap
 A class representing the MapOverlap skeleton. More...
class  MapReduce
 A class representing the MapReduce skeleton. More...
class  Reduce
 A class representing the Reduce skeleton. More...
class  Scan
 A class representing the Scan skeleton. More...
class  DataCollector2D
 A class that can be used to collect 2D data. More...
class  Device_CL
 A class representing an OpenCL device. More...
class  Device_CU
 A class representing a CUDA device. More...
class  DeviceMemPointer_CL
 A class representing an OpenCL device memory allocation. More...
class  DeviceMemPointer_CU
 A class representing a CUDA device memory allocation. More...
struct  openclGenProp
struct  openclDeviceProp
class  Environment
 A class representing a execution environment. More...
class  ExecPlan
 A class that describes an execution plan. More...
struct  generateThreadFuncArgs_CU
struct  mapThreadFunc1Args_CU
struct  mapThreadFunc2Args_CU
struct  mapThreadFunc3Args_CU
struct  mapArrayThreadFuncArgs_CU
struct  mapOverlapThreadFuncArgs_CU
struct  mapReduceThreadFuncArgs1_CU
struct  mapReduceThreadFuncArgs2_CU
struct  mapReduceThreadFuncArgs3_CU
struct  reduceThreadFuncArgs_CU
struct  scanThreadFuncArgs_CU
class  Threads
class  TimerLinux_GTOD
 A class that can be used measure time on Linux systems. More...
class  Vector
 A vector container class, implemented as a wrapper for std::vector. More...

Enumerations

enum  EdgePolicy
enum  ScanType

Functions

template<typename GenerateFunc , typename OutputIterator >
static void * generateThreadFunc_CU (void *_args)
static std::string GenerateKernel_CL ("__kernel void GenerateKernel_KERNELNAME(__global TYPE* output, unsigned int numElements, TYPE const1, unsigned int indexOffset)\n""{\n"" unsigned int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < numElements)\n"" {\n"" output[i] = FUNCTIONNAME(i+indexOffset, const1);\n"" i += gridSize;\n"" }\n""}\n")
template<typename T , typename GenerateFunc >
__global__ void GenerateKernel_CU (GenerateFunc generateFunc, T *output, unsigned int numElements, unsigned int indexOffset)
template<typename UnaryFunc , typename InputIterator , typename OutputIterator >
static void * mapThreadFunc1_CU (void *_args)
template<typename BinaryFunc , typename Input1Iterator , typename Input2Iterator , typename OutputIterator >
static void * mapThreadFunc2_CU (void *_args)
template<typename TrinaryFunc , typename Input1Iterator , typename Input2Iterator , typename Input3Iterator , typename OutputIterator >
static void * mapThreadFunc3_CU (void *_args)
static std::string UnaryMapKernel_CL ("__kernel void UnaryMapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int numElements, TYPE const1)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < numElements)\n"" {\n"" output[i] = FUNCTIONNAME(input[i], const1);\n"" i += gridSize;\n"" }\n""}\n")
static std::string BinaryMapKernel_CL ("__kernel void BinaryMapKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, TYPE const1)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" output[i] = FUNCTIONNAME(input1[i], input2[i], const1);\n"" i += gridSize;\n"" }\n""}\n")
static std::string TrinaryMapKernel_CL ("__kernel void TrinaryMapKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, TYPE const1)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" output[i] = FUNCTIONNAME(input1[i], input2[i], input3[i], const1);\n"" i += gridSize;\n"" }\n""}\n")
template<typename T , typename UnaryFunc >
__global__ void MapKernelUnary_CU (UnaryFunc mapFunc, T *input, T *output, unsigned int n)
template<typename T , typename BinaryFunc >
__global__ void MapKernelBinary_CU (BinaryFunc mapFunc, T *input1, T *input2, T *output, unsigned int n)
template<typename T , typename TrinaryFunc >
__global__ void MapKernelTrinary_CU (TrinaryFunc mapFunc, T *input1, T *input2, T *input3, T *output, unsigned int n)
template<typename ArrayFunc , typename Input1Iterator , typename Input2Iterator , typename OutputIterator >
static void * mapArrayThreadFunc_CU (void *_args)
static std::string MapArrayKernel_CL ("__kernel void MapArrayKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" output[i] = FUNCTIONNAME(&input1[0], input2[i]);\n"" i += gridSize;\n"" }\n""}\n")
template<typename T , typename ArrayFunc >
__global__ void MapArrayKernel_CU (ArrayFunc mapArrayFunc, T *input1, T *input2, T *output, unsigned int n)
template<typename OverlapFunc , typename InputIterator , typename OutputIterator >
static void * mapOverlapThreadFunc_CU (void *_args)
static std::string MapOverlapKernel_CL ("__kernel void MapOverlapKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* wrap, int n, int overlap, int out_offset, int out_numelements, int poly, TYPE pad, __local TYPE* sdata)\n""{\n"" int tid = get_local_id(0);\n"" int i = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"" if(poly == 0)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : pad;\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? pad : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : pad;\n"" }\n"" }\n"" else if(poly == 1)\n"" {\n"" if(i < n)\n"" {\n"" sdata[overlap+tid] = input[i];\n"" }\n"" else if(i-n < overlap)\n"" {\n"" sdata[overlap+tid] = wrap[overlap+(i-n)];\n"" }\n"" else\n"" {\n"" sdata[overlap+tid] = pad;\n"" }\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? wrap[tid] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : wrap[overlap+(i+overlap-n)];\n"" }\n"" }\n"" else if(poly == 2)\n"" {\n"" sdata[overlap+tid] = (i < n) ? input[i] : input[n-1];\n"" if(tid < overlap)\n"" {\n"" sdata[tid] = (get_group_id(0) == 0) ? input[0] : input[i-overlap];\n"" }\n"" if(tid >= (get_local_size(0)-overlap))\n"" {\n"" sdata[tid+2*overlap] = (get_group_id(0) != get_num_groups(0)-1 && i+overlap < n) ? input[i+overlap] : input[n-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if( (i >= out_offset) && (i < out_offset+out_numelements) )\n"" {\n"" output[i-out_offset] = FUNCTIONNAME(&(sdata[tid+overlap]));\n"" }\n""}\n")
template<int poly, typename T , typename OverlapFunc >
__global__ void MapOverlapKernel_CU (OverlapFunc mapOverlapFunc, T *input, T *output, T *wrap, unsigned int n, unsigned int out_offset, unsigned int out_numelements, T pad)
template<typename UnaryFunc , typename BinaryFunc , typename InputIterator >
static void * mapReduceThreadFunc1_CU (void *_args)
template<typename BinaryFunc1 , typename BinaryFunc2 , typename Input1Iterator , typename Input2Iterator >
static void * mapReduceThreadFunc2_CU (void *_args)
template<typename TrinaryFunc , typename BinaryFunc , typename Input1Iterator , typename Input2Iterator , typename Input3Iterator >
static void * mapReduceThreadFunc3_CU (void *_args)
static std::string UnaryMapReduceKernel_CL ("__kernel void UnaryMapReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
static std::string BinaryMapReduceKernel_CL ("__kernel void BinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
static std::string TrinaryMapReduceKernel_CL ("__kernel void TrinaryMapReduceKernel_KERNELNAME(__global TYPE* input1, __global TYPE* input2, __global TYPE* input3, __global TYPE* output, unsigned int n, __local TYPE* sdata, TYPE const1)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" TYPE tempMap;\n"" tempMap = FUNCTIONNAME_MAP(input1[i], input2[i], input3[i], const1);\n"" result = FUNCTIONNAME_REDUCE(result, tempMap, (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME_REDUCE(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
template<typename T , typename UnaryFunc , typename BinaryFunc >
__global__ void MapReduceKernel1_CU (UnaryFunc mapFunc, BinaryFunc reduceFunc, T *input, T *output, unsigned int n)
template<typename T , typename BinaryFunc1 , typename BinaryFunc2 >
__global__ void MapReduceKernel2_CU (BinaryFunc1 mapFunc, BinaryFunc2 reduceFunc, T *input1, T *input2, T *output, unsigned int n)
template<typename T , typename TrinaryFunc , typename BinaryFunc >
__global__ void MapReduceKernel3_CU (TrinaryFunc mapFunc, BinaryFunc reduceFunc, T *input1, T *input2, T *input3, T *output, unsigned int n)
template<typename BinaryFunc , typename InputIterator >
static void * reduceThreadFunc_CU (void *_args)
static std::string ReduceKernel_CL ("__kernel void ReduceKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, unsigned int n, __local TYPE* sdata)\n""{\n"" unsigned int blockSize = get_local_size(0);\n"" unsigned int tid = get_local_id(0);\n"" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"" unsigned int gridSize = blockSize*get_num_groups(0);\n"" TYPE result = 0;\n"" if(i < n)\n"" {\n"" result = input[i];\n"" i += gridSize;\n"" }\n"" while(i < n)\n"" {\n"" result = FUNCTIONNAME(result, input[i], (TYPE)0);\n"" i += gridSize;\n"" }\n"" sdata[tid] = result;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 256], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 128], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 64], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 32], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 16], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 8], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 4], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 2], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] = FUNCTIONNAME(sdata[tid], sdata[tid + 1], (TYPE)0); } barrier(CLK_LOCAL_MEM_FENCE); }\n"" if(tid == 0)\n"" {\n"" output[get_group_id(0)] = sdata[tid];\n"" }\n""}\n")
template<typename T , typename BinaryFunc >
__global__ void ReduceKernel_CU (BinaryFunc reduceFunc, T *input, T *output, unsigned int n)
template<typename BinaryFunc , typename T >
static T scanLargeVectorRecursivelyM_CU (DeviceMemPointer_CU< T > *input, DeviceMemPointer_CU< T > *output, std::vector< DeviceMemPointer_CU< T > * > &blockSums, unsigned int numElements, int level, ScanType type, T init, Device_CU *device, BinaryFunc scanFunc)
template<typename BinaryFunc , typename InputIterator , typename OutputIterator >
static void * scanThreadFunc_CU (void *_args)
template<typename BinaryFunc , typename InputIterator , typename OutputIterator >
static void * scanAddThreadFunc_CU (void *_args)
static std::string ScanKernel_CL ("__kernel void ScanKernel_KERNELNAME(__global TYPE* input, __global TYPE* output, __global TYPE* blockSums, unsigned int n, unsigned int numElements, __local TYPE* sdata)\n""{\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int pout = 0;\n"" int pin = 1;\n"" int mem = get_global_id(0);\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" unsigned int numBlocks = numElements/(blockDim) + (numElements%(blockDim) == 0 ? 0:1);\n"" int offset;\n"" while(blockNr < numBlocks)\n"" {\n"" sdata[pout*n+thid] = (mem < numElements) ? input[mem] : 0;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" for(offset = 1; offset < n; offset *=2)\n"" {\n"" pout = 1-pout;\n"" pin = 1-pout;\n"" if(thid >= offset)\n"" sdata[pout*n+thid] = FUNCTIONNAME(sdata[pin*n+thid], sdata[pin*n+thid-offset], (TYPE)0);\n"" else\n"" sdata[pout*n+thid] = sdata[pin*n+thid];\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n"" if(thid == blockDim - 1)\n"" blockSums[blockNr] = sdata[pout*n+blockDim-1];\n"" if(mem < numElements)\n"" output[mem] = sdata[pout*n+thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
static std::string ScanUpdate_CL ("__kernel void ScanUpdate_KERNELNAME(__global TYPE* data, __global TYPE* sums, int isInclusive, TYPE init, int n, __global TYPE* ret, __local TYPE* sdata)\n""{\n"" __local TYPE offset;\n"" __local TYPE inc_offset;\n"" unsigned int threadIdx = get_local_id(0);\n"" unsigned int blockDim = get_local_size(0);\n"" unsigned int blockIdx = get_group_id(0);\n"" unsigned int gridDim = get_num_groups(0);\n"" int thid = threadIdx;\n"" int blockNr = blockIdx;\n"" unsigned int gridSize = blockDim*gridDim;\n"" int mem = get_global_id(0);\n"" unsigned int numBlocks = n/(blockDim) + (n%(blockDim) == 0 ? 0:1);\n"" while(blockNr < numBlocks)\n"" {\n"" if(thid == 0)\n"" {\n"" if(isInclusive == 0)\n"" {\n"" offset = init;\n"" if(blockNr > 0)\n"" {\n"" offset = FUNCTIONNAME(offset, sums[blockNr-1], (TYPE)0);\n"" inc_offset = sums[blockNr-1];\n"" }\n"" }\n"" else\n"" {\n"" if(blockNr > 0)\n"" offset = sums[blockNr-1];\n"" }\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(isInclusive == 1)\n"" {\n"" if(blockNr > 0)\n"" sdata[thid] = (mem < n) ? FUNCTIONNAME(offset, data[mem], (TYPE)0) : 0;\n"" else\n"" sdata[thid] = (mem < n) ? data[mem] : 0;\n"" if(mem == n-1)\n"" *ret = sdata[thid];\n"" }\n"" else\n"" {\n"" if(mem == n-1)\n"" *ret = FUNCTIONNAME(inc_offset, data[mem], (TYPE)0);\n"" if(thid == 0)\n"" sdata[thid] = offset;\n"" else\n"" sdata[thid] = (mem-1 < n) ? FUNCTIONNAME(offset, data[mem-1], (TYPE)0) : 0;\n"" }\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" if(mem < n)\n"" data[mem] = sdata[thid];\n"" mem += gridSize;\n"" blockNr += gridDim;\n"" barrier(CLK_LOCAL_MEM_FENCE);\n"" }\n""}\n")
static std::string ScanAdd_CL ("__kernel void ScanAdd_KERNELNAME(__global TYPE* data, TYPE sum, int n)\n""{\n"" int i = get_global_id(0);\n"" unsigned int gridSize = get_local_size(0)*get_num_groups(0);\n"" while(i < n)\n"" {\n"" data[i] = FUNCTIONNAME(data[i], sum, (TYPE)0);\n"" i += gridSize;\n"" }\n""}\n")
template<typename T , typename BinaryFunc >
__global__ void ScanKernel_CU (BinaryFunc scanFunc, T *input, T *output, T *blockSums, unsigned int n, unsigned int numElements)
template<typename T , typename BinaryFunc >
__global__ void ScanUpdate_CU (BinaryFunc scanFunc, T *data, T *sums, int isInclusive, T init, int n, T *ret)
template<typename T , typename BinaryFunc >
__global__ void ScanAdd_CU (BinaryFunc scanFunc, T *data, T sum, int n)

Detailed Description

The main nemaspace for SkePU library.

All classes and functions in the SkePU library are in this namespace.


Enumeration Type Documentation

Enumeration of the different edge policies (what happens when a read outside the vector is perfromed) that the map overlap skeletons support.

Enumeration of the two types of Scan that can be performed: Inclusive and Exclusive.


Function Documentation

template<typename GenerateFunc , typename OutputIterator >
static void* skepu::generateThreadFunc_CU ( void *  _args  )  [static]

When applying the Generate skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

Parameters:
_args void pointer to struct containing arguments needed to do generating.

References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename ArrayFunc , typename Input1Iterator , typename Input2Iterator , typename OutputIterator >
static void* skepu::mapArrayThreadFunc_CU ( void *  _args  )  [static]

When applying the MapArray skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on seperate ranges. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

Parameters:
_args void pointer to struct containing arguments needed to perform the MapArray.

References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename OverlapFunc , typename InputIterator , typename OutputIterator >
static void* skepu::mapOverlapThreadFunc_CU ( void *  _args  )  [static]

When applying the MapOverlap skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename UnaryFunc , typename BinaryFunc , typename InputIterator >
static void* skepu::mapReduceThreadFunc1_CU ( void *  _args  )  [static]

Performs the Map on one range of elements and Reduce on the result with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename BinaryFunc1 , typename BinaryFunc2 , typename Input1Iterator , typename Input2Iterator >
static void* skepu::mapReduceThreadFunc2_CU ( void *  _args  )  [static]

Performs the Map on two ranges of elements and Reduce on the result with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename TrinaryFunc , typename BinaryFunc , typename Input1Iterator , typename Input2Iterator , typename Input3Iterator >
static void* skepu::mapReduceThreadFunc3_CU ( void *  _args  )  [static]

Performs the Map on three range of elements and Reduce on the result with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename UnaryFunc , typename InputIterator , typename OutputIterator >
static void* skepu::mapThreadFunc1_CU ( void *  _args  )  [static]

When applying the Map skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

The skeleton must have been created with a unary user function when using this. Maps only one range of vectors.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename BinaryFunc , typename Input1Iterator , typename Input2Iterator , typename OutputIterator >
static void* skepu::mapThreadFunc2_CU ( void *  _args  )  [static]

When applying the Map skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on seperate ranges. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

The skeleton must have been created with a binary user function when using this. Maps two ranges of vectors.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename TrinaryFunc , typename Input1Iterator , typename Input2Iterator , typename Input3Iterator , typename OutputIterator >
static void* skepu::mapThreadFunc3_CU ( void *  _args  )  [static]

When applying the Map skeleton using CUDA and multiple devices, one host thread per device have to be created. This fucntion is run by several host threads, each doing calculations on seperate ranges. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

The skeleton must have been created with a trinary user function when using this. Maps three ranges of vectors.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename BinaryFunc , typename InputIterator >
static void* skepu::reduceThreadFunc_CU ( void *  _args  )  [static]

Performs the Reduction on a range of elements with CUDA as backend. Returns a scalar result. The function is called by the thread library and is used for multi-GPU reduction using CUDA. Each host thread corresponds to one device.

Parameters:
_args void pointer to struct containing arguments needed to do mapping.

References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename BinaryFunc , typename InputIterator , typename OutputIterator >
static void* skepu::scanAddThreadFunc_CU ( void *  _args  )  [static]

When applying the Scan skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA. This function adds the partial results from each device after a multi-GPU CUDA Scan.

Parameters:
_args void pointer to struct containing arguments needed for the scan.

References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename BinaryFunc , typename T >
static T skepu::scanLargeVectorRecursivelyM_CU ( DeviceMemPointer_CU< T > *  input,
DeviceMemPointer_CU< T > *  output,
std::vector< DeviceMemPointer_CU< T > * > &  blockSums,
unsigned int  numElements,
int  level,
ScanType  type,
init,
Device_CU *  device,
BinaryFunc  scanFunc 
) [static]

Scans a Vector using the same recursive algorithm as NVIDIA SDK. First the vector is scanned producing partial results for each block. Then the function is called recursively to scan these partial results, which in turn can produce partial results and so on. This continues until only one block with partial results is left. Used by multi-GPU CUDA implementation.

Parameters:
input Pointer to the device memory where the input vector resides.
output Pointer to the device memory where the output vector resides.
blockSums A Vector of device memory pointers where the partial results for each level is stored.
numElements The number of elements to scan.
level The current recursion level.
type The scan type, either INCLUSIVE or EXCLUSIVE.
init The initialization value for exclusive scans.
device Pointer to the device that will be used for the scan.
scanFunc The user function used in the scan.

References skepu::DeviceMemPointer_CU< T >::changeDeviceData(), skepu::DeviceMemPointer_CU< T >::copyDeviceToHost(), skepu::DeviceMemPointer_CU< T >::getDeviceDataPointer(), skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxBlocks(), and skepu::Device_CU::getMaxThreads().

Here is the call graph for this function:

template<typename BinaryFunc , typename InputIterator , typename OutputIterator >
static void* skepu::scanThreadFunc_CU ( void *  _args  )  [static]

When applying the Scan skeleton using CUDA and multiple devices, one host thread per device have to be created. This function is run by several host threads, each doing calculations on a seperate range. Since the device - host thread binding disappears when the thread has finished, the data is copied back to the host before function returns. The lazy memory copying therefore does not work when using multi-GPU with CUDA.

Parameters:
_args void pointer to struct containing arguments needed for the scan.

References skepu::Device_CU::getDeviceID(), skepu::Device_CU::getMaxThreads(), and scanLargeVectorRecursivelyM_CU().

Here is the call graph for this function:

 All Classes Namespaces Files Functions Enumerations Friends Defines