13 #include "../globals.h"
// Stream handles for this device; the array has MAX_POSSIBLE_CUDA_STREAMS_PER_GPU
// slots, and the constructor fragment further down fills the first
// m_noConcurrKernelsSupported of them via cudaStreamCreate().
34 cudaStream_t m_streams[MAX_POSSIBLE_CUDA_STREAMS_PER_GPU];
// Device identifier that is handed to cudaSetDevice() elsewhere in this listing.
37 unsigned int m_deviceID;
// Property record filled by cudaGetDeviceProperties() in initDeviceProps();
// most accessors in this listing simply return one of its fields.
38 cudaDeviceProp m_deviceProp;
// Concurrent-kernel count computed in initDeviceProps(), where it is capped at
// MAX_POSSIBLE_CUDA_STREAMS_PER_GPU.
42 unsigned int m_noConcurrKernelsSupported;
// Result of ConvertSMVer2Cores_local() for this device, or 1 when the reported
// compute capability is 9999.9999 (see initDeviceProps()).
44 unsigned int m_noCoresSupported;
// Queries the properties of CUDA device `device` into m_deviceProp and derives
// the two cached limits m_noConcurrKernelsSupported and m_noCoresSupported from
// the reported compute capability.
// NOTE(review): this listing is an extract with lines missing (braces, the
// declaration of err, any else-branch), so the exact control flow between the
// visible statements must be confirmed against the full header.
51 void initDeviceProps(
unsigned int device)
// Fill m_deviceProp; any result other than cudaSuccess is reported via SKEPU_ERROR.
54 err = cudaGetDeviceProperties(&m_deviceProp, device);
55 if (err != cudaSuccess)
57 SKEPU_ERROR(
"getDeviceProps failed!\n");
// Compute capability 9999.9999: both limits are set to 1.
// NOTE(review): presumably the "no real CUDA device" sentinel -- confirm.
60 if (m_deviceProp.major == 9999 && m_deviceProp.minor == 9999)
62 m_noConcurrKernelsSupported = 1;
63 m_noCoresSupported = 1;
// Look the concurrent-kernel count up by compute capability.
// NOTE(review): presumably the else-branch of the 9999 check; the `else` line
// itself is not visible in this extract.
67 m_noConcurrKernelsSupported = getMaxConcurKernelsSupported(m_deviceProp.major, m_deviceProp.minor);
// Cap the count at MAX_POSSIBLE_CUDA_STREAMS_PER_GPU, the size of m_streams,
// so that loops indexing m_streams by this count stay inside the array.
68 if(m_noConcurrKernelsSupported > MAX_POSSIBLE_CUDA_STREAMS_PER_GPU)
70 SKEPU_WARNING(
"Potential problem as stream size specified is larger tham what is maximum possible specified in MAX_POSSIBLE_CUDA_STREAMS_PER_GPU.\n");
71 m_noConcurrKernelsSupported = MAX_POSSIBLE_CUDA_STREAMS_PER_GPU;
// NOTE(review): "tham" in the warning text above is a typo for "than"; it is a
// runtime string, so it is deliberately left untouched by this comment-only edit.
// Look the core count up by compute capability.
74 m_noCoresSupported = ConvertSMVer2Cores_local(m_deviceProp.major, m_deviceProp.minor);
// Maps a compute capability (major, minor) to the Cores value of the matching
// entry in the local table nGpuArchCoresPerSM.
// The lookup key compared against each entry's SM field is (major << 4) + minor,
// e.g. major 2 / minor 1 yields 0x21.
// NOTE(review): the table contents, the declaration and increment of `index`,
// and whatever is returned after the warning are not visible in this extract.
87 int ConvertSMVer2Cores_local(
int major,
int minor)
96 sSMtoCores nGpuArchCoresPerSM[] =
// Scan entries until the terminator whose SM field is -1.
109 while (nGpuArchCoresPerSM[index].SM != -1)
111 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
112 return nGpuArchCoresPerSM[index].Cores;
// Warn with the requested version.
// NOTE(review): presumably only reached when no table entry matched -- confirm
// against the full source, since the loop's closing brace is not shown.
116 SKEPU_WARNING(
"MapSMtoCores undefined SMversion " << major <<
"," << minor <<
"\n");
// Maps a compute capability (major, minor) to the totConcurrKernels value of the
// matching entry in a local table, using the same (major << 4) + minor key as
// ConvertSMVer2Cores_local().
// NOTE(review): the table contents, the handling of `index`, and the value
// returned after the warning are not visible in this extract.
129 int getMaxConcurKernelsSupported(
int major,
int minor)
// NOTE(review): presumably a field of a locally declared table-entry struct,
// since the return statement below reads `.totConcurrKernels` from table
// entries -- the surrounding struct lines are missing here; confirm.
135 int totConcurrKernels;
138 sSMtoCores nGpuArchCoresPerSM[] =
// Scan entries until the terminator whose SM field is -1.
156 while (nGpuArchCoresPerSM[index].SM != -1)
158 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
159 return nGpuArchCoresPerSM[index].totConcurrKernels;
// Warn with the requested version.
// NOTE(review): the message still says "MapSMtoCores", identical to the text in
// ConvertSMVer2Cores_local(), so the two warnings cannot be told apart in logs.
163 SKEPU_WARNING(
"MapSMtoCores undefined SMversion " << major <<
"," << minor <<
"\n");
// Body fragment of the Device_CU(unsigned int id) constructor. The index at the
// end of this listing places its definition at device_cu.h:175; the signature
// line and several body lines are not part of this extract.
// Select m_deviceID as the current CUDA device.
// NOTE(review): the cudaError_t result of cudaSetDevice() is discarded.
179 cudaSetDevice(m_deviceID);
// With USE_PINNED_MEMORY defined, create one stream per supported concurrent
// kernel in m_streams[0 .. m_noConcurrKernelsSupported-1].
// NOTE(review): cudaStreamCreate() results are discarded, and the matching
// #endif is not visible here, so the exact extent of the guarded region
// must be confirmed.
183 #ifdef USE_PINNED_MEMORY
184 for(
unsigned int i=0; i<m_noConcurrKernelsSupported; i++)
185 cudaStreamCreate(&(m_streams[i]));
// Special case for compute capability 1.0 / 1.1.
// NOTE(review): the body of this branch (original lines 189-193) is missing.
188 if(m_deviceProp.major == 1 && m_deviceProp.minor < 2)
// Cache the device's maxThreadsPerBlock and the first entry of maxGridSize.
194 m_maxThreads = m_deviceProp.maxThreadsPerBlock;
197 m_maxBlocks = m_deviceProp.maxGridSize[0];
// Body fragment of ~Device_CU() (the index at the end of this listing places the
// destructor at device_cu.h:203): selects m_deviceID as the current CUDA device.
// NOTE(review): the result of cudaSetDevice() is discarded and the rest of the
// destructor is not visible in this extract.
207 cudaSetDevice(m_deviceID);
// Return statements of the simple accessors. Their signature lines are missing
// from this extract; each one is paired below with the accessor that the index
// at the end of this listing ("Definition: device_cu.h:N") places two lines
// above it.
// isOverlapSupported() [device_cu.h:215]
// NOTE(review): the CUDA runtime documentation marks cudaDeviceProp::deviceOverlap
// as deprecated in favour of asyncEngineCount -- confirm for the toolkit in use.
217 return m_deviceProp.deviceOverlap;
// getMaxBlockSize() [device_cu.h:223]
225 return m_deviceProp.maxThreadsPerBlock;
// getMajorVersion() [device_cu.h:231]
233 return m_deviceProp.major;
// getMinorVersion() [device_cu.h:239]
241 return m_deviceProp.minor;
// getSmPerMultiProc() [device_cu.h:247]: the value cached by initDeviceProps().
249 return m_noCoresSupported;
// getDeviceName() [device_cu.h:255]: declared in the index as returning std::string.
257 return m_deviceProp.name;
// getClockRate() [device_cu.h:265]
267 return m_deviceProp.clockRate;
// getAsyncEngineCount() [device_cu.h:274]
276 return m_deviceProp.asyncEngineCount;
// IsConcurrentKernels() [device_cu.h:282]: declared in the index as returning bool.
284 return m_deviceProp.concurrentKernels;
// getNoConcurrentKernels() [device_cu.h:290]: the value cached by initDeviceProps().
292 return m_noConcurrKernelsSupported;
// getNumComputeUnits() [device_cu.h:298]
300 return m_deviceProp.multiProcessorCount;
// getGlobalMemSize() [device_cu.h:306]
308 return m_deviceProp.totalGlobalMem;
// getSharedMemPerBlock() [device_cu.h:314]
316 return m_deviceProp.sharedMemPerBlock;
// Body fragment of getMaxThreads() (index: device_cu.h:322): when the macro
// SKEPU_MAX_GPU_THREADS is defined, its value is returned.
// NOTE(review): the #else / #endif part is not visible in this extract;
// presumably it returns m_maxThreads set in the constructor -- confirm.
324 #ifdef SKEPU_MAX_GPU_THREADS
325 return SKEPU_MAX_GPU_THREADS;
// Body fragment of getMaxBlocks() (index: device_cu.h:334): when the macro
// SKEPU_MAX_GPU_BLOCKS is defined, its value is returned.
// NOTE(review): the #else / #endif part is not visible in this extract;
// presumably it returns m_maxBlocks set in the constructor -- confirm.
336 #ifdef SKEPU_MAX_GPU_BLOCKS
337 return SKEPU_MAX_GPU_BLOCKS;
Device_CU(unsigned int id)
Definition: device_cu.h:175
unsigned int getSmPerMultiProc() const
Definition: device_cu.h:247
bool IsConcurrentKernels() const
Definition: device_cu.h:282
size_t getGlobalMemSize() const
Definition: device_cu.h:306
~Device_CU()
The destructor.
Definition: device_cu.h:203
size_t getMaxThreads() const
Definition: device_cu.h:322
std::string getDeviceName() const
Definition: device_cu.h:255
bool isOverlapSupported()
Definition: device_cu.h:215
unsigned int getNoConcurrentKernels() const
Definition: device_cu.h:290
int getMajorVersion() const
Definition: device_cu.h:231
int getClockRate() const
Definition: device_cu.h:265
size_t getMaxBlocks() const
Definition: device_cu.h:334
size_t getMaxBlockSize() const
Definition: device_cu.h:223
int getNumComputeUnits() const
Definition: device_cu.h:298
size_t getSharedMemPerBlock() const
Definition: device_cu.h:314
unsigned int getDeviceID() const
Definition: device_cu.h:346
A class representing a CUDA device.
Definition: device_cu.h:30
int getAsyncEngineCount() const
Definition: device_cu.h:274
int getMinorVersion() const
Definition: device_cu.h:239