SkePU(integratedwithStarPU)  0.8.1
 All Classes Namespaces Files Functions Enumerations Friends Macros Groups Pages
skepu_cuda_helpers.h
Go to the documentation of this file.
1 
5 #ifndef SKEPU_CUDA_HELPER_H
6 #define SKEPU_CUDA_HELPER_H
7 
8 #include <stdio.h>
9 #include <string.h>
10 #include <stdlib.h>
11 
12 #include <cufft.h>
13 #include <curand.h>
14 
15 
16 
17 
18 // We define these calls here, so the user doesn't need to include __FILE__ and __LINE__
19 // The advantage is the developers gets to use the inline function so they can debug
20 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
21 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
22 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
23 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
24 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
25 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
26 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
27 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
28 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
29 #define cutilExit(argc, argv) __cutilExit (argc, argv)
30 
31 
35 
36 #ifndef MIN
37 #define MIN(a,b) ((a < b) ? a : b)
38 #endif
39 
40 #ifndef MAX
41 #define MAX(a,b) ((a > b) ? a : b)
42 #endif
43 
44 
45 inline cudaError cutilDeviceSynchronize()
46 {
47 #if CUDART_VERSION >= 4000
48  return cudaDeviceSynchronize();
49 #else
50  return cudaThreadSynchronize();
51 #endif
52 }
53 
54 
59 {
60  CUTFalse = 0,
61  CUTTrue = 1
62 };
63 
64 #ifdef _WIN32
65  #define CUTIL_API __stdcall
66 #else
67  #define CUTIL_API
68 #endif
69 
70 
71 
72 
73 
74 // Give a little more for Windows : the console window often disapears before we can read the message
75 #ifdef _WIN32
76 # if 1//ndef UNICODE
77 # ifdef _DEBUG // Do this only in debug mode...
78  inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
79  {
80  size_t fmt2_sz = 2048;
81  char *fmt2 = (char*)malloc(fmt2_sz);
82  va_list vlist;
83  va_start(vlist, fmt);
84  while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0) // means there wasn't anough room
85  {
86  fmt2_sz *= 2;
87  if(fmt2) free(fmt2);
88  fmt2 = (char*)malloc(fmt2_sz);
89  }
90  OutputDebugStringA(fmt2);
91  fprintf(file, fmt2);
92  free(fmt2);
93  }
94 # define FPRINTF(a) VSPrintf a
95 # else //debug
96 # define FPRINTF(a) fprintf a
97 // For other than Win32
98 # endif //debug
99 # else //unicode
100 // Unicode case... let's give-up for now and keep basic printf
101 # define FPRINTF(a) fprintf a
102 # endif //unicode
103 #else //win32
104 # define FPRINTF(a) fprintf a
105 #endif //win32
106 
107 
108 // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
109 // when the user double clicks on the error line in the Output pane. Like any compile error.
110 
111 inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line )
112 {
113  if( cudaSuccess != err) {
114  FPRINTF((stderr, "%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
115  file, line, (int)err, cudaGetErrorString( err ) ));
116  exit(-1);
117  }
118 }
119 
120 inline void __cudaSafeCall( cudaError err, const char *file, const int line )
121 {
122  if( cudaSuccess != err) {
123  FPRINTF((stderr, "%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
124  file, line, (int)err, cudaGetErrorString( err ) ));
125  exit(-1);
126  }
127 }
128 
129 inline void __cudaSafeThreadSync( const char *file, const int line )
130 {
131  cudaError err = cutilDeviceSynchronize();
132  if ( cudaSuccess != err) {
133  FPRINTF((stderr, "%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
134  file, line, (int)err, cudaGetErrorString( err ) ));
135  exit(-1);
136  }
137 }
138 
139 inline void __cufftSafeCall( cufftResult err, const char *file, const int line )
140 {
141  if( CUFFT_SUCCESS != err) {
142  FPRINTF((stderr, "%s(%i) : cufftSafeCall() CUFFT error %d: ",
143  file, line, (int)err));
144  switch (err) {
145  case CUFFT_INVALID_PLAN: FPRINTF((stderr, "CUFFT_INVALID_PLAN\n"));
146  case CUFFT_ALLOC_FAILED: FPRINTF((stderr, "CUFFT_ALLOC_FAILED\n"));
147  case CUFFT_INVALID_TYPE: FPRINTF((stderr, "CUFFT_INVALID_TYPE\n"));
148  case CUFFT_INVALID_VALUE: FPRINTF((stderr, "CUFFT_INVALID_VALUE\n"));
149  case CUFFT_INTERNAL_ERROR: FPRINTF((stderr, "CUFFT_INTERNAL_ERROR\n"));
150  case CUFFT_EXEC_FAILED: FPRINTF((stderr, "CUFFT_EXEC_FAILED\n"));
151  case CUFFT_SETUP_FAILED: FPRINTF((stderr, "CUFFT_SETUP_FAILED\n"));
152  case CUFFT_INVALID_SIZE: FPRINTF((stderr, "CUFFT_INVALID_SIZE\n"));
153  case CUFFT_UNALIGNED_DATA: FPRINTF((stderr, "CUFFT_UNALIGNED_DATA\n"));
154  default: FPRINTF((stderr, "CUFFT Unknown error code\n"));
155  }
156  exit(-1);
157  }
158 }
159 
160 
161 
162 inline void __cutilCheckError( CUTBoolean err, const char *file, const int line )
163 {
164  if( CUTTrue != err) {
165  FPRINTF((stderr, "%s(%i) : CUTIL CUDA error.\n",
166  file, line));
167  exit(-1);
168  }
169 }
170 
171 inline void __cutilGetLastError( const char *errorMessage, const char *file, const int line )
172 {
173  cudaError_t err = cudaGetLastError();
174  if( cudaSuccess != err) {
175  FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
176  file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
177  exit(-1);
178  }
179 }
180 
181 inline void __cutilGetLastErrorAndSync( const char *errorMessage, const char *file, const int line )
182 {
183  cudaError_t err = cudaGetLastError();
184  if( cudaSuccess != err) {
185  FPRINTF((stderr, "%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
186  file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
187  exit(-1);
188  }
189 
190  err = cutilDeviceSynchronize();
191  if( cudaSuccess != err) {
192  FPRINTF((stderr, "%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
193  file, line, errorMessage, (int)err, cudaGetErrorString( err ) ));
194  exit(-1);
195  }
196 }
197 
198 inline void __cutilSafeMalloc( void *pointer, const char *file, const int line )
199 {
200  if( !(pointer)) {
201  FPRINTF((stderr, "%s(%i) : cutilSafeMalloc host malloc failure\n",
202  file, line));
203  exit(-1);
204  }
205 }
206 
207 
208 // Beginning of GPU Architecture definitions
209 inline int _ConvertSMVer2Cores_local(int major, int minor)
210 {
211  // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
212  typedef struct {
213  int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
214  int Cores;
215  } sSMtoCores;
216 
217  sSMtoCores nGpuArchCoresPerSM[] =
218  { { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
219  { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
220  { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
221  { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
222  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
223  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
224  { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
225  { -1, -1 }
226  };
227 
228  int index = 0;
229  while (nGpuArchCoresPerSM[index].SM != -1) {
230  if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
231  return nGpuArchCoresPerSM[index].Cores;
232  }
233  index++;
234  }
235  printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
236  return -1;
237 }
238 // end of GPU Architecture definitions
239 
240 // This function returns the best GPU (with maximum GFLOPS)
241 inline int cutGetMaxGflopsDeviceId()
242 {
243  int current_device = 0, sm_per_multiproc = 0;
244  int max_compute_perf = 0, max_perf_device = 0;
245  int device_count = 0, best_SM_arch = 0;
246  cudaDeviceProp deviceProp;
247 
248  cudaGetDeviceCount( &device_count );
249  // Find the best major SM Architecture GPU device
250  while ( current_device < device_count ) {
251  cudaGetDeviceProperties( &deviceProp, current_device );
252  if (deviceProp.major > 0 && deviceProp.major < 9999) {
253  best_SM_arch = MAX(best_SM_arch, deviceProp.major);
254  }
255  current_device++;
256  }
257 
258  // Find the best CUDA capable GPU device
259  current_device = 0;
260  while( current_device < device_count ) {
261  cudaGetDeviceProperties( &deviceProp, current_device );
262  if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
263  sm_per_multiproc = 1;
264  } else {
265  sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
266  }
267 
268  int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
269  if( compute_perf > max_compute_perf ) {
270  // If we find GPU with SM major > 2, search only these
271  if ( best_SM_arch > 2 ) {
272  // If our device==dest_SM_arch, choose this, or else pass
273  if (deviceProp.major == best_SM_arch) {
274  max_compute_perf = compute_perf;
275  max_perf_device = current_device;
276  }
277  } else {
278  max_compute_perf = compute_perf;
279  max_perf_device = current_device;
280  }
281  }
282  ++current_device;
283  }
284  return max_perf_device;
285 }
286 
287 
288 
289 // General initialization call to pick the best CUDA Device
290 inline int cutilChooseCudaDevice()
291 {
292  cudaDeviceProp deviceProp;
293  int devID = 0;
294 
295  // Otherwise pick the device with highest Gflops/s
296  devID = cutGetMaxGflopsDeviceId();
297  cutilSafeCallNoSync( cudaSetDevice( devID ) );
298  cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
299  printf("> Best CUDA device [%d]: %s\n", devID, deviceProp.name);
300 
301  return devID;
302 }
303 
304 
305 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
306 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
307 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
308 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
309 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
310 
311 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
312 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
313 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
314 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
315 // ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ \\
316 
317 
318 
319 #ifdef USE_PINNED_MEMORY
320 template <typename T>
321 void copyDeviceToHost(T *hostPtr, T *devPtr, int numElements, cudaStream_t &stream)
322 #else
323 template <typename T>
324 void copyDeviceToHost(T *hostPtr, T *devPtr, int numElements)
325 #endif
326 {
327  if(devPtr != NULL && hostPtr != NULL)
328  {
329  DEBUG_TEXT_LEVEL2("** DEVICE_TO_HOST CUDA: "<< numElements <<"!!!\n")
330 
331  size_t sizeVec;
332 
333  sizeVec = numElements*sizeof(T);
334 
335  #ifdef USE_PINNED_MEMORY
336  cutilSafeCallNoSync( cudaMemcpyAsync(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost, stream) );
337  #else
338  cutilSafeCallNoSync( cudaMemcpy(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost) );
339  #endif
340  }
341 }
342 
343 
344 
345 #ifdef USE_PINNED_MEMORY
346 template <typename T>
347 void copyHostToDevice(T *hostPtr, T *devPtr, int numElements, cudaStream_t &stream)
348 #else
349 template <typename T>
350 void copyHostToDevice(T *hostPtr, T *devPtr, int numElements)
351 #endif
352 {
353  if(hostPtr != NULL && devPtr != NULL)
354  {
355  DEBUG_TEXT_LEVEL2("** HOST_TO_DEVICE CUDA: "<< numElements <<"!!!\n")
356 
357  size_t sizeVec;
358 
359  sizeVec = numElements*sizeof(T);
360 
361  #ifdef USE_PINNED_MEMORY
362  cutilSafeCallNoSync( cudaMemcpyAsync(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice, stream) );
363  #else
364  cutilSafeCallNoSync( cudaMemcpy(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice) );
365  #endif
366  }
367 }
368 
369 
370 template <typename T>
371 inline void allocateCudaMemory(T **devicePointer, unsigned int size)
372 {
373  DEBUG_TEXT_LEVEL2("** ALLOC CUDA: "<< size <<"!!!\n")
374 
375  size_t sizeVec = size*sizeof(T);
376 
377  cutilSafeCallNoSync( cudaMalloc((void**)devicePointer, sizeVec) );
378 }
379 
380 
381 template <typename T>
382 inline void freeCudaMemory(T *d_pointer)
383 {
384  DEBUG_TEXT_LEVEL2("** DE-ALLOC CUDA: !!!\n")
385 
386  if(d_pointer!=NULL)
387  cutilSafeCallNoSync(cudaFree(d_pointer));
388 }
389 
390 
391 
392 
393 
394 
395 
396 #endif
397 
CUTBoolean
CUT bool type.
Definition: skepu_cuda_helpers.h:58