SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
bandwidthMeasure.h
1 #undef _GLIBCXX_USE_INT128
2 #undef _GLIBCXX_ATOMIC_BUILTINS
3 /*
4  * This file is based on source code by NVIDIA.
5  * Copyright 1993-2011 NVIDIA Corporation. All rights reserved.
6  *
7  * Please refer to the NVIDIA end user license agreement (EULA) associated
8  * with this source code for terms and conditions that govern your use of
9  * this software. Any use, reproduction, disclosure, or distribution of
10  * this software and related documentation outside the terms of the EULA
11  * is strictly prohibited.
12  *
13  */
14 
15 
16 // StarPU settings...
17 #define BW_MEASURE_SIZE_IN_BYTES (32*1024*1024*sizeof(char))
18 #define NITER 128
19 
20 
21 
22 
23 // defines, project
24 #define MEMCOPY_ITERATIONS 128 //10
25 // #define DEFAULT_SIZE ( 32 * ( 1 << 20 ) ) //32 M
26 // #define DEFAULT_INCREMENT (1 << 22) //4 M
27 #define CACHE_CLEAR_SIZE (1 << 24) //16 M
28 
29 
30 
31 
32 #ifdef SKEPU_CUDA
33 
34 namespace skepu
35 {
36 
37 
38 //enums, project
39 enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
40 enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
41 // enum printMode { USER_READABLE, CSV };
42 enum memoryMode { PINNED, PAGEABLE };
43 
44 
45 
46 const std::string BW_FILE_PATH("bandwidthMeasures.dat");
47 
48 #ifndef CUDA_SAFE_CALL
49 #define CUDA_SAFE_CALL( call) { \
50  cudaError err = call; \
51  if( SKEPU_UNLIKELY(err) ) { \
52  fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", __FILE__, __LINE__, cudaGetErrorString( err) ); \
53  exit(EXIT_FAILURE); \
54  } }
55 #endif
56 
58 // test the bandwidth of a device to host memcopy of a specific size
60 double testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
61 {
62  float elapsedTimeInMs = 0.0f;
63  unsigned char *h_idata = NULL;
64  unsigned char *h_odata = NULL;
65  cudaEvent_t start, stop;
66 
67  CUDA_SAFE_CALL( cudaEventCreate( &start ) );
68  CUDA_SAFE_CALL( cudaEventCreate( &stop ) );
69 
70  //allocate host memory
71  if( PINNED == memMode )
72  {
73  //pinned memory mode - use special function to get OS-pinned memory
74  CUDA_SAFE_CALL( cudaHostAlloc( (void**)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0 ) );
75  CUDA_SAFE_CALL( cudaHostAlloc( (void**)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0 ) );
76  }
77  else
78  {
79  //pageable memory mode - use malloc
80  h_idata = (unsigned char *)malloc( memSize );
81  h_odata = (unsigned char *)malloc( memSize );
82 
83  if( h_idata == 0 || h_odata == 0 )
84  {
85  fprintf(stderr, "Not enough memory avaialable on host to run test!\n" );
86  exit(-1);
87  }
88  }
89  //initialize the memory
90  for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
91  {
92  h_idata[i] = (unsigned char) (i & 0xff);
93  }
94 
95  // allocate device memory
96  unsigned char* d_idata;
97  CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));
98 
99  //initialize the device memory
100  CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize, cudaMemcpyHostToDevice) );
101 
102  //copy data from GPU to Host
103  CUDA_SAFE_CALL( cudaEventRecord( start, 0 ) );
104  if( PINNED == memMode )
105  {
106  for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
107  {
108  CUDA_SAFE_CALL( cudaMemcpyAsync( h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0) );
109  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
110  }
111  }
112  else
113  {
114  for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
115  {
116  CUDA_SAFE_CALL( cudaMemcpy( h_odata, d_idata, memSize, cudaMemcpyDeviceToHost) );
117  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
118  }
119  }
120  CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) );
121 
122  // make sure GPU has finished copying
123  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
124 
125  //get the the total elapsed time in ms
126  CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
127 
128  double elapsedTimeInMicroSec = ((double)elapsedTimeInMs*1000)/MEMCOPY_ITERATIONS; // /memSize;
129 
130  //calculate bandwidth in MB/s
131 // bandwidthInMBs = (1e3f * memSize * (double)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (double)(1 << 20));
132 
133  //clean up memory
134  CUDA_SAFE_CALL( cudaEventDestroy(stop) );
135  CUDA_SAFE_CALL( cudaEventDestroy(start) );
136 
137  if( PINNED == memMode )
138  {
139  CUDA_SAFE_CALL( cudaFreeHost(h_idata) );
140  CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
141  }
142  else
143  {
144  free(h_idata);
145  free(h_odata);
146  }
147 
148  CUDA_SAFE_CALL(cudaFree(d_idata));
149 
150  return elapsedTimeInMicroSec;
151 }
152 
156 double testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
157 {
158  float elapsedTimeInMs = 0.0f;
159  cudaEvent_t start, stop;
160  CUDA_SAFE_CALL( cudaEventCreate( &start ) );
161  CUDA_SAFE_CALL( cudaEventCreate( &stop ) );
162 
163  //allocate host memory
164  unsigned char *h_odata = NULL;
165  if( PINNED == memMode )
166  {
167  //pinned memory mode - use special function to get OS-pinned memory
168  CUDA_SAFE_CALL( cudaHostAlloc( (void**)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0 ) );
169  }
170  else
171  {
172  //pageable memory mode - use malloc
173  h_odata = (unsigned char *)malloc( memSize );
174 
175  if( h_odata == 0 )
176  {
177  fprintf(stderr, "Not enough memory avaialable on host to run test!\n" );
178  exit(-1);
179  }
180  }
181 
182  unsigned char *h_cacheClear1 = (unsigned char *)malloc( CACHE_CLEAR_SIZE );
183  unsigned char *h_cacheClear2 = (unsigned char *)malloc( CACHE_CLEAR_SIZE );
184 
185  if( h_cacheClear1 == 0 || h_cacheClear1 == 0 )
186  {
187  fprintf(stderr, "Not enough memory avaialable on host to run test!\n" );
188  exit(-1);
189  }
190 
191  //initialize the memory
192  for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
193  {
194  h_odata[i] = (unsigned char) (i & 0xff);
195  }
196  for(unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++)
197  {
198  h_cacheClear1[i] = (unsigned char) (i & 0xff);
199  h_cacheClear2[i] = (unsigned char) (0xff - (i & 0xff));
200  }
201 
202  //allocate device memory
203  unsigned char* d_idata;
204  CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));
205 
206  CUDA_SAFE_CALL( cudaEventRecord( start, 0 ) );
207 
208  //copy host memory to device memory
209  if( PINNED == memMode )
210  {
211  for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
212  {
213  CUDA_SAFE_CALL( cudaMemcpyAsync( d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0) );
214  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
215  }
216  }
217  else
218  {
219  for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++)
220  {
221  CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_odata, memSize, cudaMemcpyHostToDevice) );
222  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
223  }
224  }
225 
226  CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) );
227  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
228 
229  //total elapsed time in ms
230  CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
231 
232  double elapsedTimeInMicroSec = ((double)elapsedTimeInMs*1000)/MEMCOPY_ITERATIONS; // /memSize;
233 
234  //calculate bandwidth in MB/s
235 // bandwidthInMBs = (1e3f * memSize * (double)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (double)(1 << 20));
236 
237  //clean up memory
238  CUDA_SAFE_CALL( cudaEventDestroy(stop) );
239  CUDA_SAFE_CALL( cudaEventDestroy(start) );
240 
241  if( PINNED == memMode )
242  {
243  CUDA_SAFE_CALL( cudaFreeHost(h_odata) );
244  }
245  else
246  {
247  free(h_odata);
248  }
249 
250  free(h_cacheClear1);
251  free(h_cacheClear2);
252 
253  CUDA_SAFE_CALL(cudaFree(d_idata));
254 
255  return elapsedTimeInMicroSec;
256 }
257 
261 double testDeviceToDeviceTransfer(unsigned int memSize)
262 {
263  float elapsedTimeInMs = 0.0f;
264  cudaEvent_t start, stop;
265 
266  CUDA_SAFE_CALL( cudaEventCreate( &start ) );
267  CUDA_SAFE_CALL( cudaEventCreate( &stop ) );
268 
269  //allocate host memory
270  unsigned char *h_idata = (unsigned char *)malloc( memSize );
271  if( h_idata == 0 )
272  {
273  fprintf(stderr, "Not enough memory avaialable on host to run test!\n" );
274  exit(-1);
275  }
276 
277  //initialize the host memory
278  for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++)
279  {
280  h_idata[i] = (unsigned char) (i & 0xff);
281  }
282 
283  //allocate device memory
284  unsigned char *d_idata;
285  CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, memSize));
286 
287  unsigned char *d_odata;
288  CUDA_SAFE_CALL( cudaMalloc( (void**) &d_odata, memSize));
289 
290  //initialize memory
291  CUDA_SAFE_CALL( cudaMemcpy( d_idata, h_idata, memSize,
292  cudaMemcpyHostToDevice) );
293 
294  //run the memcopy
295  CUDA_SAFE_CALL( cudaEventRecord( start, 0 ) );
296  for( unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++ )
297  {
298  CUDA_SAFE_CALL( cudaMemcpy( d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice) );
299  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
300  }
301  CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) );
302 
303  //Since device to device memory copies are non-blocking,
304  //cudaDeviceSynchronize() is required in order to get
305  //proper timing.
306  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
307 
308  //get the the total elapsed time in ms
309  CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTimeInMs, start, stop ) );
310 
311  double elapsedTimeInMicroSec = ((double)elapsedTimeInMs*1000)/MEMCOPY_ITERATIONS; // /memSize;
312 
313  //calculate bandwidth in MB/s
314 // bandwidthInMBs = 2.0f * (1e3f * memSize * (double)MEMCOPY_ITERATIONS) / (elapsedTimeInMs * (double)(1 << 20));
315 
316  //clean up memory
317  free(h_idata);
318  CUDA_SAFE_CALL(cudaEventDestroy(stop));
319  CUDA_SAFE_CALL(cudaEventDestroy(start));
320  CUDA_SAFE_CALL(cudaFree(d_idata));
321  CUDA_SAFE_CALL(cudaFree(d_odata));
322 
323  return elapsedTimeInMicroSec;
324 }
325 
327 //print results in an easily read format
329 void printResultsReadable(unsigned int *memSizes, double* bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, int deviceId, bool wc)
330 {
331  // log config information
332  if (kind == DEVICE_TO_DEVICE)
333  {
334  printf(" Device to Device Bandwidth, Device %i, \n", deviceId);
335  }
336  else
337  {
338  if (kind == DEVICE_TO_HOST)
339  {
340  printf(" Device to Host Bandwidth, Device %i, ", deviceId);
341  }
342  else if (kind == HOST_TO_DEVICE)
343  {
344  printf(" Host to Device Bandwidth, Device %i, ", deviceId);
345  }
346 
347  if(memMode == PAGEABLE)
348  {
349  printf("Paged memory\n");
350  }
351  else if (memMode == PINNED)
352  {
353  printf("Pinned memory");
354  if (wc)
355  {
356  printf(", Write-Combined Memory Enabled");
357  }
358  printf("\n");
359  }
360  }
361 
362  printf(" Transfer Size (Bytes)\tBandwidth(MB/s)\n");
363  unsigned int i;
364  for(i = 0; i < (count - 1); i++)
365  {
366  printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]);
367  }
368  printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000)? "\t" : "", bandwidths[i]);
369 }
370 
371 
372 
373 
374 
375 
376 
377 
379 // Run a bandwidth test
381 DevTimingStruct measurebandwidth(memoryMode memMode = PAGEABLE, int deviceId = 0, bool wc = false)
382 {
383  DevTimingStruct devTiming;
384 
385  int size = BW_MEASURE_SIZE_IN_BYTES;
386 
387 
388  // Use the device asked by the user
389  cudaSetDevice(deviceId);
390 
391  //run each of the copies
392  devTiming.timing_dth = testDeviceToHostTransfer( size, memMode, wc);
393  devTiming.timing_htd = testHostToDeviceTransfer( size, memMode, wc);
394  devTiming.timing_dtd = testDeviceToDeviceTransfer( size );
395 
396  devTiming.latency_dth = testDeviceToHostTransfer( 1, memMode, wc);
397  devTiming.latency_htd = testHostToDeviceTransfer( 1, memMode, wc);
398  devTiming.latency_dtd = testDeviceToDeviceTransfer( 1 );
399 
400  // now divide the measure bandwidth time to be time for 1024 bytes...
401  int sizeOf1024 = BW_MEASURE_SIZE_IN_BYTES/1024;
402  devTiming.timing_dth /= sizeOf1024;
403  devTiming.timing_htd /= sizeOf1024;
404  devTiming.timing_dtd /= sizeOf1024;
405 
406  // Ensure that we reset the CUDA Device in question
407  cudaSetDevice(deviceId);
408  cudaDeviceReset();
409 
410  return devTiming;
411 }
412 
419 DevTimingStruct measureOrLoadCUDABandwidth(int gpuId, bool pinnedMemory = false)
420 {
421  DevTimingStruct devBW;
422  bool readFromFile = false;
423 
424  // first try to load from a file if it exists...
425  std::ifstream infile(BW_FILE_PATH.c_str());
426  if(infile.good())
427  {
428  std::string strLine;
429  int id;
430  std::istringstream iss;
431  do
432  {
433  getline(infile, strLine);
434  iss.str(strLine);
435  iss >> id;
436  if(id == gpuId)
437  {
438  iss >> devBW.timing_htd >> devBW.timing_dth >> devBW.timing_dtd >> devBW.latency_htd >> devBW.latency_dth >> devBW.latency_dtd;
439  readFromFile = true;
440  infile.close();
441  break;
442  }
443  }
444  while(infile.good());
445  }
446 
447  if(!readFromFile)
448  {
449  bool wc = false;
450 
451  memoryMode memMode = (pinnedMemory ? PINNED : PAGEABLE);
452 
453  devBW = measurebandwidth(memMode, gpuId, wc);
454 
455  // now try to save it in the file for next usage... first try to append may be measurements does not exist for this gpuid else write it...
456  std::ofstream outfile(BW_FILE_PATH.c_str(), std::ios_base::app | std::ios_base::ate);
457  if(!outfile.good())
458  outfile.open(BW_FILE_PATH.c_str());
459 
460  if(!(outfile.good()))
461  {
462  SKEPU_ERROR("Could not open file for writing/appending. filename = " << BW_FILE_PATH);
463  }
464 
465  outfile << gpuId << " " << devBW.timing_htd << " " << devBW.timing_dth << " " << devBW.timing_dtd << " " << devBW.latency_htd << " " << devBW.latency_dth << " " << devBW.latency_dtd << "\n";
466  outfile.close();
467  }
468 
469  // IN file when saving as well as when measuring we measure bandwidth for 1024 bytes as number becomes too small if consider for 1 byte (precision issue when saving/loading)
470  // however when giving it to the application we give it per byte form...
471  devBW.timing_dth /= 1024;
472  devBW.timing_htd /= 1024;
473  devBW.timing_dtd /= 1024;
474 
475  return devBW;
476 }
477 
478 
479 
480 }
481 
482 #endif
483 
484 
DevTimingStruct measureOrLoadCUDABandwidth(int gpuId, bool pinnedMemory=false)
Definition: bandwidthMeasure.h:419
double testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
test the bandwidth of a host to device memcopy of a specific size
Definition: bandwidthMeasure.h:156
double testDeviceToDeviceTransfer(unsigned int memSize)
test the bandwidth of a device to device memcopy of a specific size
Definition: bandwidthMeasure.h:261