5 #ifndef SKEPU_CUDA_HELPER_H
6 #define SKEPU_CUDA_HELPER_H
20 #define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
21 #define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
22 #define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
23 #define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
24 #define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
25 #define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
26 #define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
27 #define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
28 #define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
29 #define cutilExit(argc, argv) __cutilExit (argc, argv)
37 #define MIN(a,b) ((a < b) ? a : b)
41 #define MAX(a,b) ((a > b) ? a : b)
45 inline cudaError cutilDeviceSynchronize()
47 #if CUDART_VERSION >= 4000
48 return cudaDeviceSynchronize();
50 return cudaThreadSynchronize();
65 #define CUTIL_API __stdcall
77 # ifdef _DEBUG // Do this only in debug mode...
78 inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
80 size_t fmt2_sz = 2048;
81 char *fmt2 = (
char*)malloc(fmt2_sz);
84 while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0)
88 fmt2 = (
char*)malloc(fmt2_sz);
90 OutputDebugStringA(fmt2);
94 # define FPRINTF(a) VSPrintf a
96 # define FPRINTF(a) fprintf a
101 # define FPRINTF(a) fprintf a
104 # define FPRINTF(a) fprintf a
111 inline void __cudaSafeCallNoSync( cudaError err,
const char *file,
const int line )
113 if( cudaSuccess != err) {
114 FPRINTF((stderr,
"%s(%i) : cudaSafeCallNoSync() Runtime API error %d : %s.\n",
115 file, line, (
int)err, cudaGetErrorString( err ) ));
120 inline void __cudaSafeCall( cudaError err,
const char *file,
const int line )
122 if( cudaSuccess != err) {
123 FPRINTF((stderr,
"%s(%i) : cudaSafeCall() Runtime API error %d: %s.\n",
124 file, line, (
int)err, cudaGetErrorString( err ) ));
129 inline void __cudaSafeThreadSync(
const char *file,
const int line )
131 cudaError err = cutilDeviceSynchronize();
132 if ( cudaSuccess != err) {
133 FPRINTF((stderr,
"%s(%i) : cudaDeviceSynchronize() Runtime API error %d: %s.\n",
134 file, line, (
int)err, cudaGetErrorString( err ) ));
139 inline void __cufftSafeCall( cufftResult err,
const char *file,
const int line )
141 if( CUFFT_SUCCESS != err) {
142 FPRINTF((stderr,
"%s(%i) : cufftSafeCall() CUFFT error %d: ",
143 file, line, (
int)err));
145 case CUFFT_INVALID_PLAN: FPRINTF((stderr,
"CUFFT_INVALID_PLAN\n"));
146 case CUFFT_ALLOC_FAILED: FPRINTF((stderr,
"CUFFT_ALLOC_FAILED\n"));
147 case CUFFT_INVALID_TYPE: FPRINTF((stderr,
"CUFFT_INVALID_TYPE\n"));
148 case CUFFT_INVALID_VALUE: FPRINTF((stderr,
"CUFFT_INVALID_VALUE\n"));
149 case CUFFT_INTERNAL_ERROR: FPRINTF((stderr,
"CUFFT_INTERNAL_ERROR\n"));
150 case CUFFT_EXEC_FAILED: FPRINTF((stderr,
"CUFFT_EXEC_FAILED\n"));
151 case CUFFT_SETUP_FAILED: FPRINTF((stderr,
"CUFFT_SETUP_FAILED\n"));
152 case CUFFT_INVALID_SIZE: FPRINTF((stderr,
"CUFFT_INVALID_SIZE\n"));
153 case CUFFT_UNALIGNED_DATA: FPRINTF((stderr,
"CUFFT_UNALIGNED_DATA\n"));
154 default: FPRINTF((stderr,
"CUFFT Unknown error code\n"));
162 inline void __cutilCheckError(
CUTBoolean err,
const char *file,
const int line )
164 if( CUTTrue != err) {
165 FPRINTF((stderr,
"%s(%i) : CUTIL CUDA error.\n",
171 inline void __cutilGetLastError(
const char *errorMessage,
const char *file,
const int line )
173 cudaError_t err = cudaGetLastError();
174 if( cudaSuccess != err) {
175 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
176 file, line, errorMessage, (
int)err, cudaGetErrorString( err ) ));
181 inline void __cutilGetLastErrorAndSync(
const char *errorMessage,
const char *file,
const int line )
183 cudaError_t err = cudaGetLastError();
184 if( cudaSuccess != err) {
185 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : (%d) %s.\n",
186 file, line, errorMessage, (
int)err, cudaGetErrorString( err ) ));
190 err = cutilDeviceSynchronize();
191 if( cudaSuccess != err) {
192 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : (%d) %s.\n",
193 file, line, errorMessage, (
int)err, cudaGetErrorString( err ) ));
198 inline void __cutilSafeMalloc(
void *pointer,
const char *file,
const int line )
201 FPRINTF((stderr,
"%s(%i) : cutilSafeMalloc host malloc failure\n",
209 inline int _ConvertSMVer2Cores_local(
int major,
int minor)
217 sSMtoCores nGpuArchCoresPerSM[] =
229 while (nGpuArchCoresPerSM[index].SM != -1) {
230 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
231 return nGpuArchCoresPerSM[index].Cores;
235 printf(
"MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
241 inline int cutGetMaxGflopsDeviceId()
243 int current_device = 0, sm_per_multiproc = 0;
244 int max_compute_perf = 0, max_perf_device = 0;
245 int device_count = 0, best_SM_arch = 0;
246 cudaDeviceProp deviceProp;
248 cudaGetDeviceCount( &device_count );
250 while ( current_device < device_count ) {
251 cudaGetDeviceProperties( &deviceProp, current_device );
252 if (deviceProp.major > 0 && deviceProp.major < 9999) {
253 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
260 while( current_device < device_count ) {
261 cudaGetDeviceProperties( &deviceProp, current_device );
262 if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
263 sm_per_multiproc = 1;
265 sm_per_multiproc = _ConvertSMVer2Cores_local(deviceProp.major, deviceProp.minor);
268 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
269 if( compute_perf > max_compute_perf ) {
271 if ( best_SM_arch > 2 ) {
273 if (deviceProp.major == best_SM_arch) {
274 max_compute_perf = compute_perf;
275 max_perf_device = current_device;
278 max_compute_perf = compute_perf;
279 max_perf_device = current_device;
284 return max_perf_device;
290 inline int cutilChooseCudaDevice()
292 cudaDeviceProp deviceProp;
296 devID = cutGetMaxGflopsDeviceId();
297 cutilSafeCallNoSync( cudaSetDevice( devID ) );
298 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
299 printf(
"> Best CUDA device [%d]: %s\n", devID, deviceProp.name);
319 #ifdef USE_PINNED_MEMORY
320 template <
typename T>
321 void copyDeviceToHost(T *hostPtr, T *devPtr,
int numElements, cudaStream_t &stream)
323 template <
typename T>
324 void copyDeviceToHost(T *hostPtr, T *devPtr,
int numElements)
327 if(devPtr != NULL && hostPtr != NULL)
329 DEBUG_TEXT_LEVEL2(
"** DEVICE_TO_HOST CUDA: "<< numElements <<
"!!!\n")
333 sizeVec = numElements*sizeof(T);
335 #ifdef USE_PINNED_MEMORY
336 cutilSafeCallNoSync( cudaMemcpyAsync(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost, stream) );
338 cutilSafeCallNoSync( cudaMemcpy(hostPtr, devPtr, sizeVec, cudaMemcpyDeviceToHost) );
345 #ifdef USE_PINNED_MEMORY
346 template <
typename T>
347 void copyHostToDevice(T *hostPtr, T *devPtr,
int numElements, cudaStream_t &stream)
349 template <
typename T>
350 void copyHostToDevice(T *hostPtr, T *devPtr,
int numElements)
353 if(hostPtr != NULL && devPtr != NULL)
355 DEBUG_TEXT_LEVEL2(
"** HOST_TO_DEVICE CUDA: "<< numElements <<
"!!!\n")
359 sizeVec = numElements*sizeof(T);
361 #ifdef USE_PINNED_MEMORY
362 cutilSafeCallNoSync( cudaMemcpyAsync(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice, stream) );
364 cutilSafeCallNoSync( cudaMemcpy(devPtr, hostPtr, sizeVec, cudaMemcpyHostToDevice) );
370 template <
typename T>
371 inline void allocateCudaMemory(T **devicePointer,
unsigned int size)
373 DEBUG_TEXT_LEVEL2(
"** ALLOC CUDA: "<< size <<
"!!!\n")
375 size_t sizeVec = size*sizeof(T);
377 cutilSafeCallNoSync( cudaMalloc((
void**)devicePointer, sizeVec) );
381 template <typename T>
382 inline
void freeCudaMemory(T *d_pointer)
384 DEBUG_TEXT_LEVEL2(
"** DE-ALLOC CUDA: !!!\n")
387 cutilSafeCallNoSync(cudaFree(d_pointer));
CUTBoolean
CUT bool type.
Definition: skepu_cuda_helpers.h:58