1 #undef _GLIBCXX_USE_INT128
2 #undef _GLIBCXX_ATOMIC_BUILTINS
17 #define BW_MEASURE_SIZE_IN_BYTES (32*1024*1024*sizeof(char))
24 #define MEMCOPY_ITERATIONS 128 //10
27 #define CACHE_CLEAR_SIZE (1 << 24) //16 M
39 enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
40 enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
42 enum memoryMode { PINNED, PAGEABLE };
46 const std::string BW_FILE_PATH(
"bandwidthMeasures.dat");
48 #ifndef CUDA_SAFE_CALL
49 #define CUDA_SAFE_CALL( call) { \
50 cudaError err = call; \
51 if( SKEPU_UNLIKELY(err) ) { \
52 fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString( err) ); \
60 double testDeviceToHostTransfer(
unsigned int memSize, memoryMode memMode,
bool wc)
62 float elapsedTimeInMs = 0.0f;
63 unsigned char *h_idata = NULL;
64 unsigned char *h_odata = NULL;
65 cudaEvent_t start, stop;
67 CUDA_SAFE_CALL( cudaEventCreate( &start ) );
68 CUDA_SAFE_CALL( cudaEventCreate( &stop ) );
71 if( PINNED == memMode )
74 CUDA_SAFE_CALL( cudaHostAlloc( (
void**)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0 ) );
75 CUDA_SAFE_CALL( cudaHostAlloc( (
void**)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0 ) );
80 h_idata = (
unsigned char *)malloc( memSize );
81 h_odata = (
unsigned char *)malloc( memSize );
83 if( h_idata == 0 || h_odata == 0 )
85 fprintf(stderr,
"Not enough memory avaialable on host to run test!\n" );
90 for(
unsigned int i = 0; i < memSize/
sizeof(
unsigned char); i++)
92 h_idata[i] = (
unsigned char) (i & 0xff);
96 unsigned char* d_idata;
97 CUDA_SAFE_CALL( cudaMalloc( (
void**) &d_idata, memSize));
100 CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize, cudaMemcpyHostToDevice) );
103 CUDA_SAFE_CALL( cudaEventRecord( start, 0 ) );
104 if( PINNED == memMode )
106 for(
unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
108 CUDA_SAFE_CALL( cudaMemcpyAsync( h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0) );
109 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
114 for(
unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
116 CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_idata, memSize, cudaMemcpyDeviceToHost) );
117 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
120 CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) );
123 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
126 CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
128 double elapsedTimeInMicroSec = ((double)elapsedTimeInMs*1000)/MEMCOPY_ITERATIONS;
134 CUDA_SAFE_CALL( cudaEventDestroy(stop) );
135 CUDA_SAFE_CALL( cudaEventDestroy(start) );
137 if( PINNED == memMode )
139 CUDA_SAFE_CALL( cudaFreeHost(h_idata) );
140 CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
148 CUDA_SAFE_CALL(cudaFree(d_idata));
150 return elapsedTimeInMicroSec;
158 float elapsedTimeInMs = 0.0f;
159 cudaEvent_t start, stop;
160 CUDA_SAFE_CALL( cudaEventCreate( &start ) );
161 CUDA_SAFE_CALL( cudaEventCreate( &stop ) );
164 unsigned char *h_odata = NULL;
165 if( PINNED == memMode )
168 CUDA_SAFE_CALL( cudaHostAlloc( (
void**)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0 ) );
173 h_odata = (
unsigned char *)malloc( memSize );
177 fprintf(stderr,
"Not enough memory avaialable on host to run test!\n" );
182 unsigned char *h_cacheClear1 = (
unsigned char *)malloc( CACHE_CLEAR_SIZE );
183 unsigned char *h_cacheClear2 = (
unsigned char *)malloc( CACHE_CLEAR_SIZE );
185 if( h_cacheClear1 == 0 || h_cacheClear1 == 0 )
187 fprintf(stderr,
"Not enough memory avaialable on host to run test!\n" );
192 for(
unsigned int i = 0; i < memSize/
sizeof(
unsigned char); i++)
194 h_odata[i] = (
unsigned char) (i & 0xff);
196 for(
unsigned int i = 0; i < CACHE_CLEAR_SIZE /
sizeof(
unsigned char); i++)
198 h_cacheClear1[i] = (
unsigned char) (i & 0xff);
199 h_cacheClear2[i] = (
unsigned char) (0xff - (i & 0xff));
203 unsigned char* d_idata;
204 CUDA_SAFE_CALL( cudaMalloc( (
void**) &d_idata, memSize));
206 CUDA_SAFE_CALL( cudaEventRecord( start, 0 ) );
209 if( PINNED == memMode )
211 for(
unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
213 CUDA_SAFE_CALL( cudaMemcpyAsync( d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0) );
214 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
219 for(
unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
221 CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_odata, memSize, cudaMemcpyHostToDevice) );
222 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
226 CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) );
227 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
230 CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
232 double elapsedTimeInMicroSec = ((double)elapsedTimeInMs*1000)/MEMCOPY_ITERATIONS;
238 CUDA_SAFE_CALL( cudaEventDestroy(stop) );
239 CUDA_SAFE_CALL( cudaEventDestroy(start) );
241 if( PINNED == memMode )
243 CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
253 CUDA_SAFE_CALL(cudaFree(d_idata));
255 return elapsedTimeInMicroSec;
263 float elapsedTimeInMs = 0.0f;
264 cudaEvent_t start, stop;
266 CUDA_SAFE_CALL( cudaEventCreate( &start ) );
267 CUDA_SAFE_CALL( cudaEventCreate( &stop ) );
270 unsigned char *h_idata = (
unsigned char *)malloc( memSize );
273 fprintf(stderr,
"Not enough memory avaialable on host to run test!\n" );
278 for(
unsigned int i = 0; i < memSize/
sizeof(
unsigned char); i++)
280 h_idata[i] = (
unsigned char) (i & 0xff);
284 unsigned char *d_idata;
285 CUDA_SAFE_CALL( cudaMalloc( (
void**) &d_idata, memSize));
287 unsigned char *d_odata;
288 CUDA_SAFE_CALL( cudaMalloc( (
void**) &d_odata, memSize));
291 CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
292 cudaMemcpyHostToDevice) );
295 CUDA_SAFE_CALL( cudaEventRecord( start, 0 ) );
296 for(
unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
298 CUDA_SAFE_CALL( cudaMemcpy( d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice) );
299 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
301 CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) );
306 CUDA_SAFE_CALL( cudaDeviceSynchronize() );
309 CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
311 double elapsedTimeInMicroSec = ((double)elapsedTimeInMs*1000)/MEMCOPY_ITERATIONS;
318 CUDA_SAFE_CALL(cudaEventDestroy(stop));
319 CUDA_SAFE_CALL(cudaEventDestroy(start));
320 CUDA_SAFE_CALL(cudaFree(d_idata));
321 CUDA_SAFE_CALL(cudaFree(d_odata));
323 return elapsedTimeInMicroSec;
329 void printResultsReadable(
unsigned int *memSizes,
double* bandwidths,
unsigned int count, memcpyKind kind, memoryMode memMode,
int deviceId,
bool wc)
332 if (kind == DEVICE_TO_DEVICE)
334 printf(
" Device to Device Bandwidth, Device %i, \n", deviceId);
338 if (kind == DEVICE_TO_HOST)
340 printf(
" Device to Host Bandwidth, Device %i, ", deviceId);
342 else if (kind == HOST_TO_DEVICE)
344 printf(
" Host to Device Bandwidth, Device %i, ", deviceId);
347 if(memMode == PAGEABLE)
349 printf(
"Paged memory\n");
351 else if (memMode == PINNED)
353 printf(
"Pinned memory");
356 printf(
", Write-Combined Memory Enabled");
362 printf(
" Transfer Size (Bytes)\tBandwidth(MB/s)\n");
364 for(i = 0; i < (count - 1); i++)
366 printf(
" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000)?
"\t" :
"", bandwidths[i]);
368 printf(
" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000)?
"\t" :
"", bandwidths[i]);
381 DevTimingStruct measurebandwidth(memoryMode memMode = PAGEABLE,
int deviceId = 0,
bool wc =
false)
383 DevTimingStruct devTiming;
385 int size = BW_MEASURE_SIZE_IN_BYTES;
389 cudaSetDevice(deviceId);
392 devTiming.timing_dth = testDeviceToHostTransfer( size, memMode, wc);
396 devTiming.latency_dth = testDeviceToHostTransfer( 1, memMode, wc);
401 int sizeOf1024 = BW_MEASURE_SIZE_IN_BYTES/1024;
402 devTiming.timing_dth /= sizeOf1024;
403 devTiming.timing_htd /= sizeOf1024;
404 devTiming.timing_dtd /= sizeOf1024;
407 cudaSetDevice(deviceId);
421 DevTimingStruct devBW;
422 bool readFromFile =
false;
425 std::ifstream infile(BW_FILE_PATH.c_str());
430 std::istringstream iss;
433 getline(infile, strLine);
438 iss >> devBW.timing_htd >> devBW.timing_dth >> devBW.timing_dtd >> devBW.latency_htd >> devBW.latency_dth >> devBW.latency_dtd;
444 while(infile.good());
451 memoryMode memMode = (pinnedMemory ? PINNED : PAGEABLE);
453 devBW = measurebandwidth(memMode, gpuId, wc);
456 std::ofstream outfile(BW_FILE_PATH.c_str(), std::ios_base::app | std::ios_base::ate);
458 outfile.open(BW_FILE_PATH.c_str());
460 if(!(outfile.good()))
462 SKEPU_ERROR(
"Could not open file for writing/appending. filename = " << BW_FILE_PATH);
465 outfile << gpuId <<
" " << devBW.timing_htd <<
" " << devBW.timing_dth <<
" " << devBW.timing_dtd <<
" " << devBW.latency_htd <<
" " << devBW.latency_dth <<
" " << devBW.latency_dtd <<
"\n";
471 devBW.timing_dth /= 1024;
472 devBW.timing_htd /= 1024;
473 devBW.timing_dtd /= 1024;
DevTimingStruct measureOrLoadCUDABandwidth(int gpuId, bool pinnedMemory=false)
Definition: bandwidthMeasure.h:419
double testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
test the bandwidth of a host to device memcopy of a specific size
Definition: bandwidthMeasure.h:156
double testDeviceToDeviceTransfer(unsigned int memSize)
test the bandwidth of a device to device memcopy of a specific size
Definition: bandwidthMeasure.h:261