SkePU 1.2
device_cu.h
#ifndef DEVICE_CU_H
#define DEVICE_CU_H

#ifdef SKEPU_CUDA

#include <iostream>
#include <cuda.h>

#include "../globals.h"

namespace skepu
{

/*!
 * A class representing a CUDA device. It caches the device properties queried
 * from the CUDA runtime, such as the number of cores per SM, the number of
 * concurrently supported kernels, and the thread and block limits.
 */
class Device_CU
{

public:
    cudaStream_t m_streams[MAX_POSSIBLE_CUDA_STREAMS_PER_GPU];

private:
    unsigned int m_deviceID;
    cudaDeviceProp m_deviceProp;
    size_t m_maxThreads;
    size_t m_maxBlocks;

    unsigned int m_noConcurrKernelsSupported;

    unsigned int m_noCoresSupported;

    /*!
     * Queries the CUDA device properties for the given device and caches the number
     * of concurrently supported kernels and the number of cores per SM.
     */
    void initDeviceProps(unsigned int device)
    {
        cudaError_t err;
        err = cudaGetDeviceProperties(&m_deviceProp, device);
        if (err != cudaSuccess)
        {
            SKEPU_ERROR("getDeviceProps failed!\n");
        }

        if (m_deviceProp.major == 9999 && m_deviceProp.minor == 9999)
        {
            m_noConcurrKernelsSupported = 1;
            m_noCoresSupported = 1;
        }
        else
        {
            m_noConcurrKernelsSupported = getMaxConcurKernelsSupported(m_deviceProp.major, m_deviceProp.minor);
            if (m_noConcurrKernelsSupported > MAX_POSSIBLE_CUDA_STREAMS_PER_GPU)
            {
                SKEPU_WARNING("Potential problem as the number of streams specified is larger than the maximum possible specified in MAX_POSSIBLE_CUDA_STREAMS_PER_GPU.\n");
                m_noConcurrKernelsSupported = MAX_POSSIBLE_CUDA_STREAMS_PER_GPU; // reset it to the maximum as the stream array is allocated with size MAX_POSSIBLE_CUDA_STREAMS_PER_GPU
            }

            m_noCoresSupported = ConvertSMVer2Cores_local(m_deviceProp.major, m_deviceProp.minor);
        }
    }

    /*!
     * Maps an SM version (major, minor) to the number of CUDA cores per SM.
     */
    int ConvertSMVer2Cores_local(int major, int minor)
    {
        // Defines for GPU architecture types (uses the SM version to determine the number of cores per SM).
        typedef struct
        {
            int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
            int Cores;
        } sSMtoCores;

        sSMtoCores nGpuArchCoresPerSM[] =
        {
            { 0x10, 8 },   // Tesla Generation (SM 1.0) G80 class
            { 0x11, 8 },   // Tesla Generation (SM 1.1) G8x class
            { 0x12, 8 },   // Tesla Generation (SM 1.2) G9x class
            { 0x13, 8 },   // Tesla Generation (SM 1.3) GT200 class
            { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
            { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
            { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
            { -1, -1 }
        };

        int index = 0;
        while (nGpuArchCoresPerSM[index].SM != -1)
        {
            if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
                return nGpuArchCoresPerSM[index].Cores;

            index++;
        }
        SKEPU_WARNING("MapSMtoCores undefined SM version " << major << "," << minor << "\n");
        return -1;
    }

    /*!
     * Maps an SM version (major, minor) to the maximum number of concurrently executing kernels.
     */
    int getMaxConcurKernelsSupported(int major, int minor)
    {
        // Defines for GPU architecture types (uses the SM version to determine the number of concurrent kernels supported).
        typedef struct
        {
            int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
            int totConcurrKernels;
        } sSMtoCores;

        sSMtoCores nGpuArchCoresPerSM[] =
        {
            { 0x10, 1 },  // Tesla Generation (SM 1.0) G80 class
            { 0x11, 1 },  // Tesla Generation (SM 1.1) G8x class
            { 0x12, 1 },  // Tesla Generation (SM 1.2) G9x class
            { 0x13, 1 },  // Tesla Generation (SM 1.3) GT200 class
            { 0x20, 4 },  // Fermi Generation (SM 2.0) GF100 class
            { 0x21, 16 }, // Fermi Generation (SM 2.1) GF10x class
            { 0x30, 16 }, // Kepler Generation (SM 3.0) GK10x class
            { 0x32, 4 },  // Kepler Generation (SM 3.2)
            { 0x35, 32 }, // Kepler Generation (SM 3.5) GK11x class
            { 0x37, 32 }, // Kepler Generation (SM 3.7)
            { 0x50, 32 }, // Maxwell Generation (SM 5.0) GM10x class
            { 0x52, 32 }, // Maxwell Generation (SM 5.2) GM20x class
            { -1, 1 }
        };

        int index = 0;
        while (nGpuArchCoresPerSM[index].SM != -1)
        {
            if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
                return nGpuArchCoresPerSM[index].totConcurrKernels;

            index++;
        }
        SKEPU_WARNING("MapSMtoCores undefined SM version " << major << "," << minor << "\n");
        return 1;
    }

public:

    /*!
     * The constructor. Sets the active CUDA device, initializes its properties,
     * and, when pinned memory is used, creates the CUDA streams.
     */
    Device_CU(unsigned int id)
    {
        m_deviceID = id;

        cudaSetDevice(m_deviceID);

        initDeviceProps(id);

#ifdef USE_PINNED_MEMORY
        for (unsigned int i = 0; i < m_noConcurrKernelsSupported; i++)
            cudaStreamCreate(&(m_streams[i]));
#endif

        if (m_deviceProp.major == 1 && m_deviceProp.minor < 2)
        {
            m_maxThreads = 256;
        }
        else
        {
            m_maxThreads = m_deviceProp.maxThreadsPerBlock;
        }

        m_maxBlocks = m_deviceProp.maxGridSize[0];
    }

    /*!
     * The destructor.
     */
    ~Device_CU()
    {
        // Explicitly destroys and cleans up all resources associated with the current device in the current process.
        // Any subsequent API call to this device will reinitialize the device.
        cudaSetDevice(m_deviceID);
        cudaDeviceReset();
    }

    bool isOverlapSupported()
    {
        return m_deviceProp.deviceOverlap;
    }

    size_t getMaxBlockSize() const
    {
        return m_deviceProp.maxThreadsPerBlock;
    }

    int getMajorVersion() const
    {
        return m_deviceProp.major;
    }

    int getMinorVersion() const
    {
        return m_deviceProp.minor;
    }

    unsigned int getSmPerMultiProc() const
    {
        return m_noCoresSupported;
    }

    std::string getDeviceName() const
    {
        return m_deviceProp.name;
    }

    int getClockRate() const
    {
        return m_deviceProp.clockRate;
    }

    int getAsyncEngineCount() const
    {
        return m_deviceProp.asyncEngineCount;
    }

    bool IsConcurrentKernels() const
    {
        return m_deviceProp.concurrentKernels;
    }

    unsigned int getNoConcurrentKernels() const
    {
        return m_noConcurrKernelsSupported;
    }

    int getNumComputeUnits() const
    {
        return m_deviceProp.multiProcessorCount;
    }

    size_t getGlobalMemSize() const
    {
        return m_deviceProp.totalGlobalMem;
    }

    size_t getSharedMemPerBlock() const
    {
        return m_deviceProp.sharedMemPerBlock;
    }

    size_t getMaxThreads() const
    {
#ifdef SKEPU_MAX_GPU_THREADS
        return SKEPU_MAX_GPU_THREADS;
#else
        return m_maxThreads;
#endif
    }

    size_t getMaxBlocks() const
    {
#ifdef SKEPU_MAX_GPU_BLOCKS
        return SKEPU_MAX_GPU_BLOCKS;
#else
        return m_maxBlocks;
#endif
    }

    unsigned int getDeviceID() const
    {
        return m_deviceID;
    }
};

} // namespace skepu

#endif // SKEPU_CUDA

#endif // DEVICE_CU_H
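The class is a thin wrapper around the CUDA runtime's device queries, so a small host-side program can exercise it directly. The sketch below is not part of SkePU: it assumes SKEPU_CUDA is defined at compile time and that this header (together with SkePU's globals.h) is on the include path, and the printCudaDevices helper is a hypothetical name used only for illustration. It enumerates the available devices and prints a few of the cached properties.

// A minimal sketch (not part of SkePU): enumerate CUDA devices and print the
// properties exposed by skepu::Device_CU. Compile with nvcc and -DSKEPU_CUDA.
#include <iostream>
#include <string>

#include "device_cu.h" // assumed include path; adjust to your SkePU installation

void printCudaDevices() // hypothetical helper, for illustration only
{
    int numDevices = 0;
    cudaGetDeviceCount(&numDevices);

    for (int id = 0; id < numDevices; ++id)
    {
        skepu::Device_CU device(id);

        std::cout << "Device " << device.getDeviceID() << ": " << device.getDeviceName() << "\n"
                  << "  compute capability: " << device.getMajorVersion() << "."
                                              << device.getMinorVersion() << "\n"
                  << "  multiprocessors:    " << device.getNumComputeUnits() << "\n"
                  << "  cores per SM:       " << device.getSmPerMultiProc() << "\n"
                  << "  concurrent kernels: " << device.getNoConcurrentKernels() << "\n"
                  << "  global memory:      " << device.getGlobalMemSize() << " bytes\n"
                  << "  max threads/block:  " << device.getMaxThreads() << "\n"
                  << "  max blocks (dim 0): " << device.getMaxBlocks() << "\n";
    } // note: each Device_CU destructor calls cudaDeviceReset() on its device
}

int main()
{
    printCudaDevices();
    return 0;
}

In SkePU itself these objects are created and owned by the runtime's device-management code, so user programs normally never instantiate Device_CU directly; the loop above only shows what information the accessors expose.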