SkePU 1.2
skepu_cuda_helpers.h
#ifndef SKEPU_CUDA_HELPER_H
#define SKEPU_CUDA_HELPER_H

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>

#include <cufft.h>
#include <curand.h>


// We define these calls here, so the user doesn't need to pass __FILE__ and __LINE__.
// The advantage is that developers get to use the inline functions, so they can debug.
#define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
#define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
#define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
#define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
#define curandSafeCall(err) __curandSafeCall (err, __FILE__, __LINE__)
#define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
#define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
#define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
#define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
#define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
#define cutilExit(argc, argv) __cutilExit (argc, argv)


#define CHECK_CUDA_ERROR(stmt) { cudaError_t err = stmt; __checkCudaError (err, __FILE__, __LINE__); }


// Alternative definition, kept for reference:
// #define CHECK_CUDA_ERROR(stmt) {cudaError_t err = stmt; if(err != cudaSuccess){std::cerr<<"CUDA Error at " << __FILE__ << ":" << __LINE__ << " => " << cudaGetErrorString(err) << "\n"; }}

#ifndef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif


inline cudaError cutilDeviceSynchronize()
{
#if CUDART_VERSION >= 4000
    return cudaDeviceSynchronize();
#else
    return cudaThreadSynchronize();
#endif
}


// CUT bool type.
enum CUTBoolean
{
    CUTFalse = 0,
    CUTTrue = 1
};

#ifdef _WIN32
#define CUTIL_API __stdcall
#else
#define CUTIL_API
#endif



// Give a little more help on Windows: the console window often disappears before we can read the message.
#ifdef _WIN32
# if 1//ndef UNICODE
# ifdef _DEBUG // Do this only in debug mode...
inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
{
    size_t fmt2_sz = 2048;
    char *fmt2 = (char*)malloc(fmt2_sz);
    va_list vlist;
    va_start(vlist, fmt);
    while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't enough room
    {
        fmt2_sz *= 2;
        if(fmt2) free(fmt2);
        fmt2 = (char*)malloc(fmt2_sz);
    }
    va_end(vlist);
    OutputDebugStringA(fmt2);
    fprintf(file, "%s", fmt2);
    free(fmt2);
}
# define FPRINTF(a) VSPrintf a
# else //debug
# define FPRINTF(a) fprintf a
// For other than Win32
# endif //debug
# else //unicode
// Unicode case... let's give up for now and keep basic printf
# define FPRINTF(a) fprintf a
# endif //unicode
#else //win32
# define FPRINTF(a) fprintf a
#endif //win32
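
// FPRINTF takes its argument list in double parentheses so that a variable number of
// arguments can be forwarded to either VSPrintf or fprintf, for example
// (illustrative; 'someValue' is a placeholder):
//
//   FPRINTF((stderr, "value = %d\n", someValue));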


// NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
// when the user double clicks on the error line in the Output pane. Like any compile error.

inline void __checkCudaError( cudaError_t err, const char *file, const int line )
{
    if( cudaSuccess != err)
    {
        FPRINTF((stderr, "CUDA ERROR at %s: %i. Error is %d: %s.\n", file, line, (int)err, cudaGetErrorString(err)));
    }
}



inline void __cudaSafeCallNoSync( cudaError_t err, const char *file, const int line )
{
    if( cudaSuccess != err)
    {
        FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
                 file, line, (int)err, cudaGetErrorString( err ) ));
        exit(-1);
    }
}

inline void __cudaSafeCall( cudaError_t err, const char *file, const int line )
{
    if( cudaSuccess != err)
    {
        FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
                 file, line, (int)err, cudaGetErrorString( err ) ));
        exit(-1);
    }
}



inline void __cudaSafeThreadSync( const char *file, const int line )
{
    cudaError_t err = cutilDeviceSynchronize();
    if ( cudaSuccess != err)
    {
        FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
                 file, line, (int)err, cudaGetErrorString( err ) ));
        exit(-1);
    }
}


inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
{
    if( CUFFT_SUCCESS != err)
    {
        FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error %d: ",
                 file, line, (int)err));
        switch (err)
        {
        case CUFFT_INVALID_PLAN:
            FPRINTF((stderr, "CUFFT_INVALID_PLAN\n"));
            break;
        case CUFFT_ALLOC_FAILED:
            FPRINTF((stderr, "CUFFT_ALLOC_FAILED\n"));
            break;
        case CUFFT_INVALID_TYPE:
            FPRINTF((stderr, "CUFFT_INVALID_TYPE\n"));
            break;
        case CUFFT_INVALID_VALUE:
            FPRINTF((stderr, "CUFFT_INVALID_VALUE\n"));
            break;
        case CUFFT_INTERNAL_ERROR:
            FPRINTF((stderr, "CUFFT_INTERNAL_ERROR\n"));
            break;
        case CUFFT_EXEC_FAILED:
            FPRINTF((stderr, "CUFFT_EXEC_FAILED\n"));
            break;
        case CUFFT_SETUP_FAILED:
            FPRINTF((stderr, "CUFFT_SETUP_FAILED\n"));
            break;
        case CUFFT_INVALID_SIZE:
            FPRINTF((stderr, "CUFFT_INVALID_SIZE\n"));
            break;
        case CUFFT_UNALIGNED_DATA:
            FPRINTF((stderr, "CUFFT_UNALIGNED_DATA\n"));
            break;
        default:
            FPRINTF((stderr, "CUFFT Unknown error code\n"));
        }
        exit(-1);
    }
}

inline void __curandSafeCall( curandStatus_t err, const char *file, const int line )
{
    if( CURAND_STATUS_SUCCESS != err)
    {
        FPRINTF((stderr, "%s(%i) : curandSafeCall() CURAND error %d: ",
                 file, line, (int)err));
        switch (err)
        {
        case CURAND_STATUS_VERSION_MISMATCH:
            FPRINTF((stderr, "CURAND_STATUS_VERSION_MISMATCH\n"));
            break;
        case CURAND_STATUS_NOT_INITIALIZED:
            FPRINTF((stderr, "CURAND_STATUS_NOT_INITIALIZED\n"));
            break;
        case CURAND_STATUS_ALLOCATION_FAILED:
            FPRINTF((stderr, "CURAND_STATUS_ALLOCATION_FAILED\n"));
            break;
        case CURAND_STATUS_TYPE_ERROR:
            FPRINTF((stderr, "CURAND_STATUS_TYPE_ERROR\n"));
            break;
        case CURAND_STATUS_OUT_OF_RANGE:
            FPRINTF((stderr, "CURAND_STATUS_OUT_OF_RANGE\n"));
            break;
        case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
            FPRINTF((stderr, "CURAND_STATUS_LENGTH_NOT_MULTIPLE\n"));
            break;
//      case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
//          FPRINTF((stderr, "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED\n"));
//          break;
        case CURAND_STATUS_LAUNCH_FAILURE:
            FPRINTF((stderr, "CURAND_STATUS_LAUNCH_FAILURE\n"));
            break;
        case CURAND_STATUS_PREEXISTING_FAILURE:
            FPRINTF((stderr, "CURAND_STATUS_PREEXISTING_FAILURE\n"));
            break;
        case CURAND_STATUS_INITIALIZATION_FAILED:
            FPRINTF((stderr, "CURAND_STATUS_INITIALIZATION_FAILED\n"));
            break;
        case CURAND_STATUS_ARCH_MISMATCH:
            FPRINTF((stderr, "CURAND_STATUS_ARCH_MISMATCH\n"));
            break;
        case CURAND_STATUS_INTERNAL_ERROR:
            FPRINTF((stderr, "CURAND_STATUS_INTERNAL_ERROR\n"));
            break;
        default:
            FPRINTF((stderr, "CURAND Unknown error code\n"));
        }
        exit(-1);
    }
}


inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
{
    if( CUTTrue != err)
    {
        FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
                 file, line));
        exit(-1);
    }
}

inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
        exit(-1);
    }
}

inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
        exit(-1);
    }

    err = cutilDeviceSynchronize();
    if( cudaSuccess != err)
    {
        FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
                 file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
        exit(-1);
    }
}
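
// Typical kernel-launch check (illustrative comment; 'someKernel' and its launch
// configuration are placeholders, not part of this header):
//
//   someKernel<<<numBlocks, numThreads>>>(...);
//   cutilCheckMsg("someKernel launch failed");           // reports launch errors
//   cutilCheckMsgAndSync("someKernel execution failed"); // additionally synchronizes and
//                                                        // reports errors raised during the run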

inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
{
    if( !(pointer))
    {
        FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
                 file, line));
        exit(-1);
    }
}
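
// Usage sketch (illustrative only; this helper is not part of the original SkePU API
// and its name is hypothetical). It shows how the wrapper macros above turn CUDA
// runtime return codes into file/line diagnostics.
inline void exampleSafeCallUsage(size_t numElements)
{
    float *d_data = NULL;

    // Aborts via __cudaSafeCall() with file/line information if the allocation fails.
    cutilSafeCall( cudaMalloc((void**)&d_data, numElements * sizeof(float)) );

    // CHECK_CUDA_ERROR only reports the error and lets execution continue.
    CHECK_CUDA_ERROR( cudaMemset(d_data, 0, numElements * sizeof(float)) );

    cutilSafeCallNoSync( cudaFree(d_data) );
}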


// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2Cores_local(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10, 8 },  // Tesla Generation (SM 1.0) G80 class
        { 0x11, 8 },  // Tesla Generation (SM 1.1) G8x class
        { 0x12, 8 },  // Tesla Generation (SM 1.2) G9x class
        { 0x13, 8 },  // Tesla Generation (SM 1.3) GT200 class
        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
        { -1, -1 }
    };

    int index = 0;
    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
        {
            return nGpuArchCoresPerSM[index].Cores;
        }
        index++;
    }
    SKEPU_WARNING("MapSMtoCores undefined SM version " << major << "." << minor << "\n");
    return -1;
}
// end of GPU Architecture definitions
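
// Example (informative): the SM version is packed as 0xMm, so a compute capability 2.1
// device is matched as (2 << 4) + 1 == 0x21 and _ConvertSMVer2Cores_local(2, 1)
// returns 48 cores per multiprocessor.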

// This function returns the best GPU (with maximum GFLOPS)
inline int cutGetMaxGflopsDeviceId()
{
    int current_device = 0, sm_per_multiproc = 0;
    int max_compute_perf = 0, max_perf_device = 0;
    int device_count = 0, best_SM_arch = 0;
    cudaDeviceProp deviceProp;

    cudaGetDeviceCount( &device_count );

    // Find the best major SM architecture among the available GPU devices
    while ( current_device < device_count )
    {
        cudaGetDeviceProperties( &deviceProp, current_device );
        if (deviceProp.major > 0 && deviceProp.major < 9999)
        {
            best_SM_arch = MAX(best_SM_arch, deviceProp.major);
        }
        current_device++;
    }

    // Find the best CUDA capable GPU device
    current_device = 0;
    while( current_device < device_count )
    {
        cudaGetDeviceProperties( &deviceProp, current_device );
        if (deviceProp.major == 9999 && deviceProp.minor == 9999)
        {
            sm_per_multiproc = 1;
        }
        else
        {
            sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
        }

        int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
        if( compute_perf > max_compute_perf )
        {
            // If we found a GPU with SM major > 2, search only among those
            if ( best_SM_arch > 2 )
            {
                // If our device's major version == best_SM_arch, choose it, otherwise skip it
                if (deviceProp.major == best_SM_arch)
                {
                    max_compute_perf = compute_perf;
                    max_perf_device = current_device;
                }
            }
            else
            {
                max_compute_perf = compute_perf;
                max_perf_device = current_device;
            }
        }
        ++current_device;
    }
    return max_perf_device;
}



// General initialization call to pick the best CUDA device
inline int cutilChooseCudaDevice()
{
    cudaDeviceProp deviceProp;
    int devID = 0;

    // Pick the device with the highest Gflops/s
    devID = cutGetMaxGflopsDeviceId();
    cutilSafeCallNoSync( cudaSetDevice( devID ) );
    cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
    DEBUG_TEXT_LEVEL1("Best CUDA device [" << devID << "]: " << deviceProp.name << "\n");

    return devID;
}
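
// Typical start-up usage (illustrative comment; actual SkePU call sites may differ):
//
//   int devID = cutilChooseCudaDevice();  // selects and activates the fastest device
//   // ... subsequent CUDA allocations and kernel launches now run on devID ...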


// ------------------------------------------------------------------------------------------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------------------------------------------------------------------------------------------



// Copies numElements elements of type T from device memory to host memory
// (asynchronously on 'stream' when pinned memory is enabled).
#ifdef USE_PINNED_MEMORY
template <typename T>
void copyDeviceToHost(T *hostPtr, T *devPtr, int numElements, cudaStream_t &stream)
#else
template <typename T>
void copyDeviceToHost(T *hostPtr, T *devPtr, int numElements)
#endif
{
    if(devPtr != NULL && hostPtr != NULL)
    {
        DEBUG_TEXT_LEVEL2("** DEVICE_TO_HOST CUDA: "<< numElements <<"!!!\n")

        size_t sizeVec = numElements*sizeof(T);

#ifdef USE_PINNED_MEMORY
        cutilSafeCallNoSync( cudaMemcpyAsync(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost, stream) );
#else
        cutilSafeCallNoSync( cudaMemcpy(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost) );
#endif
    }
}



// Copies numElements elements of type T from host memory to device memory
// (asynchronously on 'stream' when pinned memory is enabled).
#ifdef USE_PINNED_MEMORY
template <typename T>
void copyHostToDevice(T *hostPtr, T *devPtr, int numElements, cudaStream_t &stream)
#else
template <typename T>
void copyHostToDevice(T *hostPtr, T *devPtr, int numElements)
#endif
{
    if(hostPtr != NULL && devPtr != NULL)
    {
        DEBUG_TEXT_LEVEL2("** HOST_TO_DEVICE CUDA: "<< numElements <<"!!!\n")

        size_t sizeVec = numElements*sizeof(T);

#ifdef USE_PINNED_MEMORY
        cutilSafeCallNoSync( cudaMemcpyAsync(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice, stream) );
#else
        cutilSafeCallNoSync( cudaMemcpy(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice) );
#endif
    }
}


// Allocates device memory for 'size' elements of type T and checks the result.
template <typename T>
inline void allocateCudaMemory(T **devicePointer, unsigned int size)
{
    DEBUG_TEXT_LEVEL2("** ALLOC CUDA: "<< size <<"!!!\n")

    size_t sizeVec = size*sizeof(T);

    cutilSafeCallNoSync( cudaMalloc((void**)devicePointer, sizeVec) );
}


// Frees device memory previously allocated with allocateCudaMemory (no-op for NULL).
template <typename T>
inline void freeCudaMemory(T *d_pointer)
{
    DEBUG_TEXT_LEVEL2("** DE-ALLOC CUDA: !!!\n")

    if(d_pointer!=NULL)
        cutilSafeCallNoSync(cudaFree(d_pointer));
}
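
// Round-trip sketch (illustrative only; this function is not part of the original
// header and its name is hypothetical): allocate device memory, upload, download and
// free it using the helpers above. Shown for the non-pinned, synchronous path only.
#ifndef USE_PINNED_MEMORY
template <typename T>
inline void exampleRoundTrip(T *hostPtr, int numElements)
{
    T *devPtr = NULL;

    allocateCudaMemory<T>(&devPtr, numElements);        // cudaMalloc + error check
    copyHostToDevice<T>(hostPtr, devPtr, numElements);  // synchronous cudaMemcpy to the device
    copyDeviceToHost<T>(hostPtr, devPtr, numElements);  // copy the data back to the host
    freeCudaMemory<T>(devPtr);                          // cudaFree + error check
}
#endif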




#endif // SKEPU_CUDA_HELPER_H