5 #ifndef SKEPU_CUDA_HELPER_H
6 #define SKEPU_CUDA_HELPER_H
20 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
21 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
22 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
23 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
24 #define curandSafeCall(err) __curandSafeCall (err, __FILE__, __LINE__)
25 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
26 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
27 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
28 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
29 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
30 #define cutilExit(argc, argv) __cutilExit (argc, argv)
33 #define CHECK_CUDA_ERROR(stmt) { cudaError_t err = stmt; __checkCudaError (err, __FILE__, __LINE__); }
42 #define MIN(a,b) ((a < b) ? a : b)
46 #define MAX(a,b) ((a > b) ? a : b)
50 inline cudaError cutilDeviceSynchronize()
52 #if CUDART_VERSION >= 4000
53 return cudaDeviceSynchronize();
55 return cudaThreadSynchronize();
70 #define CUTIL_API __stdcall
82 # ifdef _DEBUG // Do this only in debug mode...
83 inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
85 size_t fmt2_sz = 2048;
86 char *fmt2 = (
char*)malloc(fmt2_sz);
89 while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0)
93 fmt2 = (
char*)malloc(fmt2_sz);
95 OutputDebugStringA(fmt2);
99 # define FPRINTF(a) VSPrintf a
101 # define FPRINTF(a) fprintf a
106 # define FPRINTF(a) fprintf a
109 # define FPRINTF(a) fprintf a
116 inline void __checkCudaError( cudaError_t err,
const char *file,
const int line )
118 if( cudaSuccess != err)
120 FPRINTF((stderr,
"CUDA ERROR at %s: %i. Error is %d: %s.\n",file, line, (
int)err, cudaGetErrorString(err)));
126 inline void __cudaSafeCallNoSync( cudaError_t err,
const char *file,
const int line )
128 if( cudaSuccess != err)
130 FPRINTF((stderr,
"%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
131 file, line, (
int)err, cudaGetErrorString( err ) ));
136 inline void __cudaSafeCall( cudaError_t err,
const char *file,
const int line )
138 if( cudaSuccess != err)
140 FPRINTF((stderr,
"%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
141 file, line, (
int)err, cudaGetErrorString( err ) ));
148 inline void __cudaSafeThreadSync(
const char *file,
const int line )
150 cudaError_t err = cutilDeviceSynchronize();
151 if ( cudaSuccess != err)
153 FPRINTF((stderr,
"%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
154 file, line, (
int)err, cudaGetErrorString( err ) ));
159 inline void __cufftSafeCall( cufftResult err,
const char *file,
const int line )
161 if( CUFFT_SUCCESS != err)
163 FPRINTF((stderr,
"%s(%i) : cufftSafeCall() CUFFT error %d: ",
164 file, line, (
int)err));
167 case CUFFT_INVALID_PLAN:
168 FPRINTF((stderr,
"CUFFT_INVALID_PLAN\n"));
169 case CUFFT_ALLOC_FAILED:
170 FPRINTF((stderr,
"CUFFT_ALLOC_FAILED\n"));
171 case CUFFT_INVALID_TYPE:
172 FPRINTF((stderr,
"CUFFT_INVALID_TYPE\n"));
173 case CUFFT_INVALID_VALUE:
174 FPRINTF((stderr,
"CUFFT_INVALID_VALUE\n"));
175 case CUFFT_INTERNAL_ERROR:
176 FPRINTF((stderr,
"CUFFT_INTERNAL_ERROR\n"));
177 case CUFFT_EXEC_FAILED:
178 FPRINTF((stderr,
"CUFFT_EXEC_FAILED\n"));
179 case CUFFT_SETUP_FAILED:
180 FPRINTF((stderr,
"CUFFT_SETUP_FAILED\n"));
181 case CUFFT_INVALID_SIZE:
182 FPRINTF((stderr,
"CUFFT_INVALID_SIZE\n"));
183 case CUFFT_UNALIGNED_DATA:
184 FPRINTF((stderr,
"CUFFT_UNALIGNED_DATA\n"));
186 FPRINTF((stderr,
"CUFFT Unknown error code\n"));
192 inline void __curandSafeCall( curandStatus_t err,
const char *file,
const int line )
194 if( CURAND_STATUS_SUCCESS != err)
196 FPRINTF((stderr,
"%s(%i) : curandSafeCall() CURAND error %d: ",
197 file, line, (
int)err));
200 case CURAND_STATUS_VERSION_MISMATCH:
201 FPRINTF((stderr,
"CURAND_STATUS_VERSION_MISMATCH"));
202 case CURAND_STATUS_NOT_INITIALIZED:
203 FPRINTF((stderr,
"CURAND_STATUS_NOT_INITIALIZED"));
204 case CURAND_STATUS_ALLOCATION_FAILED:
205 FPRINTF((stderr,
"CURAND_STATUS_ALLOCATION_FAILED"));
206 case CURAND_STATUS_TYPE_ERROR:
207 FPRINTF((stderr,
"CURAND_STATUS_TYPE_ERROR"));
208 case CURAND_STATUS_OUT_OF_RANGE:
209 FPRINTF((stderr,
"CURAND_STATUS_OUT_OF_RANGE"));
210 case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
211 FPRINTF((stderr,
"CURAND_STATUS_LENGTH_NOT_MULTIPLE"));
214 case CURAND_STATUS_LAUNCH_FAILURE:
215 FPRINTF((stderr,
"CURAND_STATUS_LAUNCH_FAILURE"));
216 case CURAND_STATUS_PREEXISTING_FAILURE:
217 FPRINTF((stderr,
"CURAND_STATUS_PREEXISTING_FAILURE"));
218 case CURAND_STATUS_INITIALIZATION_FAILED:
219 FPRINTF((stderr,
"CURAND_STATUS_INITIALIZATION_FAILED"));
220 case CURAND_STATUS_ARCH_MISMATCH:
221 FPRINTF((stderr,
"CURAND_STATUS_ARCH_MISMATCH"));
222 case CURAND_STATUS_INTERNAL_ERROR:
223 FPRINTF((stderr,
"CURAND_STATUS_INTERNAL_ERROR"));
225 FPRINTF((stderr,
"CURAND Unknown error code\n"));
232 inline void __cutilCheckError(
CUTBoolean err,
const char *file,
const int line )
236 FPRINTF((stderr,
"%s(%i) : CUTIL CUDA error.\n",
242 inline void __cutilGetLastError(
const char *errorMessage,
const char *file,
const int line )
244 cudaError_t err = cudaGetLastError();
245 if( cudaSuccess != err)
247 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
248 file, line, errorMessage, (
int)err, cudaGetErrorString( err ) ));
253 inline void __cutilGetLastErrorAndSync(
const char *errorMessage,
const char *file,
const int line )
255 cudaError_t err = cudaGetLastError();
256 if( cudaSuccess != err)
258 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
259 file, line, errorMessage, (
int)err, cudaGetErrorString( err ) ));
263 err = cutilDeviceSynchronize();
264 if( cudaSuccess != err)
266 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
267 file, line, errorMessage, (
int)err, cudaGetErrorString( err ) ));
272 inline void __cutilSafeMalloc(
void *pointer,
const char *file,
const int line )
276 FPRINTF((stderr,
"%s(%i) : cutilSafeMalloc host malloc failure\n",
284 inline int _ConvertSMVer2Cores_local(
int major,
int minor)
293 sSMtoCores nGpuArchCoresPerSM[] =
306 while (nGpuArchCoresPerSM[index].SM != -1)
308 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) )
310 return nGpuArchCoresPerSM[index].Cores;
314 SKEPU_WARNING(
"MapSMtoCores undefined SMversion " << major <<
"." << minor <<
"\n");
320 inline int cutGetMaxGflopsDeviceId()
322 int current_device = 0, sm_per_multiproc = 0;
323 int max_compute_perf = 0, max_perf_device = 0;
324 int device_count = 0, best_SM_arch = 0;
325 cudaDeviceProp deviceProp;
327 cudaGetDeviceCount( &device_count );
329 while ( current_device < device_count )
331 cudaGetDeviceProperties( &deviceProp, current_device );
332 if (deviceProp.major > 0 && deviceProp.major < 9999)
334 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
341 while( current_device < device_count )
343 cudaGetDeviceProperties( &deviceProp, current_device );
344 if (deviceProp.major == 9999 && deviceProp.minor == 9999)
346 sm_per_multiproc = 1;
350 sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
353 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
354 if( compute_perf > max_compute_perf )
357 if ( best_SM_arch > 2 )
360 if (deviceProp.major == best_SM_arch)
362 max_compute_perf = compute_perf;
363 max_perf_device = current_device;
368 max_compute_perf = compute_perf;
369 max_perf_device = current_device;
374 return max_perf_device;
380 inline int cutilChooseCudaDevice()
382 cudaDeviceProp deviceProp;
386 devID = cutGetMaxGflopsDeviceId();
387 cutilSafeCallNoSync( cudaSetDevice( devID ) );
388 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
389 DEBUG_TEXT_LEVEL1(
"Best CUDA device [" << devID <<
"]: " << deviceProp.name <<
"\n");
409 #ifdef USE_PINNED_MEMORY
410 template <
typename T>
411 void copyDeviceToHost(T *hostPtr, T *devPtr,
int numElements, cudaStream_t &stream)
413 template <
typename T>
414 void copyDeviceToHost(T *hostPtr, T *devPtr,
int numElements)
417 if(devPtr != NULL && hostPtr != NULL)
419 DEBUG_TEXT_LEVEL2(
"** DEVICE_TO_HOST CUDA: "<< numElements <<
"!!!\n")
423 sizeVec = numElements*sizeof(T);
425 #ifdef USE_PINNED_MEMORY
426 cutilSafeCallNoSync( cudaMemcpyAsync(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost, stream) );
428 cutilSafeCallNoSync( cudaMemcpy(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost) );
435 #ifdef USE_PINNED_MEMORY
436 template <
typename T>
437 void copyHostToDevice(T *hostPtr, T *devPtr,
int numElements, cudaStream_t &stream)
439 template <
typename T>
440 void copyHostToDevice(T *hostPtr, T *devPtr,
int numElements)
443 if(hostPtr != NULL && devPtr != NULL)
445 DEBUG_TEXT_LEVEL2(
"** HOST_TO_DEVICE CUDA: "<< numElements <<
"!!!\n")
449 sizeVec = numElements*sizeof(T);
451 #ifdef USE_PINNED_MEMORY
452 cutilSafeCallNoSync( cudaMemcpyAsync(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice, stream) );
454 cutilSafeCallNoSync( cudaMemcpy(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice) );
460 template <
typename T>
461 inline void allocateCudaMemory(T **devicePointer,
unsigned int size)
463 DEBUG_TEXT_LEVEL2(
"** ALLOC CUDA: "<< size <<
"!!!\n")
465 size_t sizeVec = size*sizeof(T);
467 cutilSafeCallNoSync( cudaMalloc((
void**)devicePointer, sizeVec) );
471 template <typename T>
472 inline
void freeCudaMemory(T *d_pointer)
474 DEBUG_TEXT_LEVEL2(
"** DE-ALLOC CUDA: !!!\n")
477 cutilSafeCallNoSync(cudaFree(d_pointer));
CUTBoolean
CUT bool type.
Definition: skepu_cuda_helpers.h:63