SkePU  1.2
 All Classes Namespaces Files Functions Variables Enumerations Friends Macros Groups Pages
device_mem_pointer_cu.h
Go to the documentation of this file.
1 
5 #ifndef DEVICE_MEM_POINTER_CU_H
6 #define DEVICE_MEM_POINTER_CU_H
7 
8 #ifdef SKEPU_CUDA
9 
10 #include <iostream>
11 #include <cuda.h>
12 
13 
14 #include "device_cu.h"
15 #include "mem_pointer_base.h"
16 
17 namespace skepu
18 {
19 
26 template<typename T>
27 struct UpdateInf
28 {
29  int srcDevId;
30  T* src;
31  int srcOffset;
32  int dstOffset;
33  size_t copySize;
34  bool srcIsHost;
35 };
36 
38 #define MAX_RANGES 10
39 
43 #define MAX_COPYINF_SIZE 33
44 
57 template <typename T>
58 class DeviceMemPointer_CU : public MemPointerBase
59 {
60 
61 public:
62  DeviceMemPointer_CU(T* start, size_t numElements, Device_CU *device, std::string name="");
63  DeviceMemPointer_CU(T* start, size_t rows, size_t cols, Device_CU *device, bool usePitch=false, std::string name="");
64 
65 // DeviceMemPointer_CU(T* root, T* start, size_t numElements, Device_CU *device);
67 
68  void copyHostToDevice(size_t numElements = 0) const;
69  void copyDeviceToHost(size_t numElements = 0) const;
70 
71  void copiesOverlapInf(DeviceMemPointer_CU<T> *otherCopy, UpdateInf<T>* updateStruct, size_t &sizeUpdStr);
72 
73  size_t getMemSize();
74  void clearDevicePointer();
75 
76  T* getDeviceDataPointer() const;
77  unsigned int getDeviceID() const;
78 
79  void changeDeviceData();
80  bool deviceDataHasChanged() const;
81 
82  void markCopyInvalid();
83  bool isCopyValid() const;
84 
85 // #if SKEPU_DEBUG>0
86  std::string m_nameVerbose;
87 // #endif
88 
89  size_t m_pitch;
90 
91  size_t m_rows;
92 
93  size_t m_cols;
94 
95  bool doCopiesOverlap(DeviceMemPointer_CU<T> *otherCopy, bool oneUnitCheck = false);
97  bool doRangeOverlap(T *hostPtr, size_t numElements);
98  void copyInfFromHostToDevice(UpdateInf<T>* updateStruct, size_t &sizeUpdStr);
99  void copyAllRangesToDevice(UpdateInf<T>* updateStruct, const size_t sizeUpdStr, size_t streamID = 0);
100 
101  std::pair<T*, int> m_rangesToCompare[MAX_RANGES];
102  size_t m_numOfRanges;
103 
104  T* m_hostDataPointer;
105  T* m_deviceDataPointer;
106  size_t m_numElements;
107 
108  void resetRanges()
109  {
111  m_numOfRanges = 0;
112  m_rangesToCompare[m_numOfRanges] = std::make_pair(m_hostDataPointer, m_numElements);
113  m_numOfRanges++;
114  }
115 
116 private:
117  unsigned int m_deviceID;
118  Device_CU *m_dev;
119 
120  bool m_usePitch;
121 
126  mutable bool m_valid;
127 
128  mutable bool m_deviceDataHasChanged;
129 
130  bool freeUpDeviceMem();
131 };
132 
136 template <typename T>
138 {
139  if(m_hostDataPointer <= otherCopy->m_hostDataPointer && (m_hostDataPointer+m_numElements) >= (otherCopy->m_hostDataPointer+otherCopy->m_numElements))
140  return true;
141 
142  return false;
143 }
144 
149 template <typename T>
151 {
152  if(oneUnitCheck)
153  assert(m_numOfRanges == 1 && otherCopy->m_numOfRanges == 1);
154 
155  if(m_numOfRanges < 1)
156  return false;
157 
158  for(size_t i=0; i<m_numOfRanges; ++i)
159  {
160  T *hostDataPointer = m_rangesToCompare[i].first;
161  size_t numElements = m_rangesToCompare[i].second;
162 
163  if( hostDataPointer >= otherCopy->m_hostDataPointer && hostDataPointer < (otherCopy->m_hostDataPointer + otherCopy->m_numElements) )
164  return true;
165 
166  if( otherCopy->m_hostDataPointer >= hostDataPointer && otherCopy->m_hostDataPointer < (hostDataPointer + numElements) )
167  return true;
168  }
169  return false;
170 }
171 
175 template <typename T>
176 bool DeviceMemPointer_CU<T>::doRangeOverlap(T *hostDataPointer, size_t numElements)
177 {
178  if( (m_hostDataPointer+m_numElements) <= hostDataPointer )
179  return false;
180 
181  if( (hostDataPointer+numElements) <= m_hostDataPointer )
182  return false;
183 
184  return true;
185 }
186 
195 template <typename T>
196 DeviceMemPointer_CU<T>::DeviceMemPointer_CU(T* start, size_t numElements, Device_CU *device, std::string name) : m_hostDataPointer(start), m_numElements(numElements), m_dev(device), m_rows(1), m_cols(numElements), m_pitch(numElements), m_valid(false), m_usePitch(false), m_numOfRanges(0)
197 {
198  m_nameVerbose = name;
199 
200  size_t sizeVec = m_numElements*sizeof(T);
201 
202  m_deviceID = m_dev->getDeviceID();
203  CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
204 
205  DEBUG_TEXT_LEVEL1(m_nameVerbose + " Alloc: " <<m_numElements << ", GPU_" << m_deviceID << "\n")
206 
207  cudaError_t er;
209  er = cudaMalloc((void**)&m_deviceDataPointer, sizeVec);
210  if (er == cudaErrorMemoryAllocation)
211  {
212  freeUpDeviceMem();
213  }
214  else
215  CHECK_CUDA_ERROR(er);
216  // add to list of device allocations
217  dev_alloc->addAllocation((void*)m_deviceDataPointer,this,m_deviceID);
218 
220  m_rangesToCompare[m_numOfRanges] = std::make_pair(m_hostDataPointer, m_numElements);
221  m_numOfRanges++;
222 
223  m_deviceDataHasChanged = false;
224 }
225 
226 
227 
238 template <typename T>
239 DeviceMemPointer_CU<T>::DeviceMemPointer_CU(T* start, size_t rows, size_t cols, Device_CU *device, bool usePitch, std::string name) : m_hostDataPointer(start), m_numElements(rows*cols), m_rows(rows), m_cols(cols), m_dev(device), m_valid(false), m_usePitch(usePitch), m_numOfRanges(0)
240 {
241  m_nameVerbose = name;
242 
243  size_t sizeVec = m_numElements*sizeof(T);
244 
245  m_deviceID = m_dev->getDeviceID();
246  CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
247 
248  DEBUG_TEXT_LEVEL1(m_nameVerbose + " Alloc: " <<m_numElements << ", GPU_" << m_deviceID << "\n")
249 
250  if(m_usePitch)
251  {
252  CHECK_CUDA_ERROR(cudaMallocPitch((void**)&m_deviceDataPointer, &m_pitch, cols * sizeof(T), rows));
253  m_pitch = (m_pitch)/sizeof(T);
254  }
255  else
256  {
257  cudaError_t er;
259  er = cudaMalloc((void**)&m_deviceDataPointer, sizeVec);
260  if (er == cudaErrorMemoryAllocation)
261  {
262  freeUpDeviceMem();
263  }
264  else
265  CHECK_CUDA_ERROR(er);
266  // add to list of device allocations
267  dev_alloc->addAllocation((void*)m_deviceDataPointer,this,m_deviceID);
268  m_pitch = cols;
269  }
270 
271 
273  m_rangesToCompare[m_numOfRanges] = std::make_pair(m_hostDataPointer, m_numElements);
274  m_numOfRanges++;
275 
276  m_deviceDataHasChanged = false;
277 }
278 
279 
280 
284 template <typename T>
286 {
287  DEBUG_TEXT_LEVEL1(m_nameVerbose + " DeAlloc: " <<m_numElements <<", GPU_" << m_deviceID << "\n")
288 
289  CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
290 
291  DeviceAllocations_CU<int>::getInstance()->removeAllocation(m_deviceDataPointer,this,m_deviceID);
292 
293  cudaFree(m_deviceDataPointer);
294  m_deviceDataPointer = NULL;
295 }
296 
303 template <typename T>
304 void DeviceMemPointer_CU<T>::copyInfFromHostToDevice(UpdateInf<T>* updateStruct, size_t &sizeUpdStr)
305 {
306  for(size_t i=0; i<m_numOfRanges; ++i)
307  {
309  T *hostDataPointer = m_rangesToCompare[i].first;
310  size_t numElements = m_rangesToCompare[i].second;
311 
312  assert((hostDataPointer - m_hostDataPointer)>=0);
313  int offset = hostDataPointer - m_hostDataPointer;
314 
315  size_t sizeVec = numElements*sizeof(T);
316 
317  updateStruct[sizeUpdStr].srcDevId = -1;
318  updateStruct[sizeUpdStr].src = m_hostDataPointer;
319 // updateStruct[sizeUpdStr].dst = this;
320  updateStruct[sizeUpdStr].srcOffset = offset;
321  updateStruct[sizeUpdStr].dstOffset = offset;
322  updateStruct[sizeUpdStr].copySize = sizeVec;
323  updateStruct[sizeUpdStr].srcIsHost = true;
324  sizeUpdStr++;
325  assert(sizeUpdStr < MAX_COPYINF_SIZE);
326 
328  if(i!=m_numOfRanges-1) // if not last then shift elements back...
329  {
330  for(size_t j=i+1; j<m_numOfRanges; ++j)
331  {
332  m_rangesToCompare[j-1] = m_rangesToCompare[j];
333  }
334  }
335  --i;
336  --m_numOfRanges; // decremement total size
337  }
338 }
339 
340 
349 template <typename T>
350 void DeviceMemPointer_CU<T>::copiesOverlapInf(DeviceMemPointer_CU<T> *otherCopy, UpdateInf<T>* updateStruct, size_t &sizeUpdStr)
351 {
352  for(size_t i=0; i<m_numOfRanges; ++i)
353  {
355  T *hostDataPointer = m_rangesToCompare[i].first;
356  size_t numElements = m_rangesToCompare[i].second;
357 
359  if(otherCopy->doRangeOverlap(hostDataPointer, numElements) == false)
360  continue;
361 
362  assert((hostDataPointer - m_hostDataPointer)>=0);
363  int offset = hostDataPointer - m_hostDataPointer;
364 
365 
367  if( hostDataPointer >= otherCopy->m_hostDataPointer && (hostDataPointer + numElements) <= (otherCopy->m_hostDataPointer + otherCopy->m_numElements) )
368  {
369  size_t sizeVec = numElements*sizeof(T);
370  int srcoffset = hostDataPointer - otherCopy->m_hostDataPointer;
371 
372  updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
373  updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
374 // updateStruct[sizeUpdStr].dst = this;
375  updateStruct[sizeUpdStr].srcOffset = srcoffset;
376  updateStruct[sizeUpdStr].dstOffset = offset;
377  updateStruct[sizeUpdStr].copySize = sizeVec;
378  updateStruct[sizeUpdStr].srcIsHost = false;
379  sizeUpdStr++;
380  assert(sizeUpdStr < MAX_COPYINF_SIZE);
381 
383  if(i!=m_numOfRanges-1) // if not last then shift elements back...
384  {
385  for(int j=i+1; j<m_numOfRanges; ++j)
386  {
387  m_rangesToCompare[j-1] = m_rangesToCompare[j];
388  }
389  }
390  --i;
391  --m_numOfRanges; // decremement total size
392  }
394  else if( otherCopy->m_hostDataPointer >= hostDataPointer && (otherCopy->m_hostDataPointer + otherCopy->m_numElements) <= (hostDataPointer + numElements) )
395  {
396  size_t sizeVec = otherCopy->m_numElements*sizeof(T);
397  int dstoffset = otherCopy->m_hostDataPointer - hostDataPointer;
398 
399  updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
400  updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
401 // updateStruct[sizeUpdStr].dst = this;
402  updateStruct[sizeUpdStr].srcOffset = 0;
403  updateStruct[sizeUpdStr].dstOffset = offset + dstoffset;
404  updateStruct[sizeUpdStr].copySize = sizeVec;
405  updateStruct[sizeUpdStr].srcIsHost = false;
406  sizeUpdStr++;
407  assert(sizeUpdStr < MAX_COPYINF_SIZE);
408 
410  if(i!=m_numOfRanges-1) // if not last then shift elements back...
411  {
412  for(size_t j=i+1; j<m_numOfRanges; ++j)
413  {
414  m_rangesToCompare[j-1] = m_rangesToCompare[j];
415  }
416  }
417  --i;
418  --m_numOfRanges; // decremement total size
419 
421  if(dstoffset>0) // its posisble that the dstoffset is 0
422  {
423  m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer, dstoffset);
424  m_numOfRanges++;
425  assert(m_numOfRanges<MAX_RANGES);
426  }
427 
429  int size = ( (hostDataPointer+numElements) - (otherCopy->m_hostDataPointer + otherCopy->m_numElements) );
430  if(size>0)
431  {
432  int tmpoffset = (otherCopy->m_hostDataPointer + otherCopy->m_numElements) - hostDataPointer;
433  m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer+tmpoffset, size);
434  m_numOfRanges++;
435  assert(m_numOfRanges<MAX_RANGES);
436  }
437  }
439  else if( otherCopy->m_hostDataPointer >= hostDataPointer && (otherCopy->m_hostDataPointer) < (hostDataPointer + numElements) )
440  {
441  size_t sizeVec = ( (hostDataPointer + numElements) - otherCopy->m_hostDataPointer) * sizeof(T);
442  int dstoffset = otherCopy->m_hostDataPointer - hostDataPointer;
443 
444  updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
445  updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
446 // updateStruct[sizeUpdStr].dst = this;
447  updateStruct[sizeUpdStr].srcOffset = 0;
448  updateStruct[sizeUpdStr].dstOffset = offset + dstoffset;
449  updateStruct[sizeUpdStr].copySize = sizeVec;
450  updateStruct[sizeUpdStr].srcIsHost = false;
451  sizeUpdStr++;
452  assert(sizeUpdStr < MAX_COPYINF_SIZE);
453 
455  if(i!=m_numOfRanges-1) // if not last then shift elements back...
456  {
457  for(size_t j=i+1; j<m_numOfRanges; ++j)
458  {
459  m_rangesToCompare[j-1] = m_rangesToCompare[j];
460  }
461  }
462  --i;
463  --m_numOfRanges; // decremement total size
464 
466  if(dstoffset>0) // its posisble that the dstoffset is 0
467  {
468  m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer, dstoffset);
469  m_numOfRanges++;
470  assert(m_numOfRanges<MAX_RANGES);
471  }
472  }
474  else if( hostDataPointer >= otherCopy->m_hostDataPointer && hostDataPointer < (otherCopy->m_hostDataPointer + otherCopy->m_numElements) )
475  {
476  size_t sizeVec = ( (otherCopy->m_hostDataPointer + otherCopy->m_numElements) - hostDataPointer) * sizeof(T);
477  int srcoffset = hostDataPointer - otherCopy->m_hostDataPointer;
478 
479  updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
480  updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
481 // updateStruct[sizeUpdStr].dst = this;
482  updateStruct[sizeUpdStr].srcOffset = srcoffset;
483  updateStruct[sizeUpdStr].dstOffset = offset;
484  updateStruct[sizeUpdStr].copySize = sizeVec;
485  updateStruct[sizeUpdStr].srcIsHost = false;
486  sizeUpdStr++;
487  assert(sizeUpdStr < MAX_COPYINF_SIZE);
488 
490  if(i!=m_numOfRanges-1) // if not last then shift elements back...
491  {
492  for(size_t j=i+1; j<m_numOfRanges; ++j)
493  {
494  m_rangesToCompare[j-1] = m_rangesToCompare[j];
495  }
496  }
497  --i;
498  --m_numOfRanges; // decremement total size
499 
501  int size = ( (hostDataPointer+numElements) - (otherCopy->m_hostDataPointer + otherCopy->m_numElements) );
502  if(size > 0)
503  {
504  int tmpoffset = (otherCopy->m_hostDataPointer + otherCopy->m_numElements) - hostDataPointer;
505  m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer+tmpoffset, size);
506  m_numOfRanges++;
507  assert(m_numOfRanges<MAX_RANGES);
508  }
509  }
510  else
511  assert(false);
512  }
513 }
514 
515 template <typename T>
517 {
518  return sizeof(T)*m_numElements;
519 }
520 
521 template <typename T>
522 void DeviceMemPointer_CU<T>::clearDevicePointer()
523 {
524  cudaFree(m_deviceDataPointer);
525  m_deviceDataPointer = NULL;
526 }
527 
528 
537 template <typename T>
538 void DeviceMemPointer_CU<T>::copyAllRangesToDevice(UpdateInf<T>* updateStruct, const size_t sizeUpdStr, size_t streamID)
539 {
540  assert(m_valid == false);
541 
542  // reallocate if datapointer has been cleared
543  if (m_deviceDataPointer == NULL)
544  {
545  cudaError_t er;
546  er = cudaMalloc((void**)&m_deviceDataPointer, m_numElements*sizeof(T));
547  if (er == cudaErrorMemoryAllocation)
548  {
549  freeUpDeviceMem();
550  }
551  }
552 
554  assert(m_deviceDataHasChanged == false);
555 
556  size_t sizeVec;
557 
558  for(int i=0; i<sizeUpdStr; ++i)
559  {
560  if(updateStruct[i].srcIsHost)
561  {
562  DEBUG_TEXT_LEVEL1(m_nameVerbose + " HOST_TO_DEVICE: Host -> GPU_" << m_deviceID << ", size: " << (updateStruct[i].copySize/sizeof(T)) << " # " << updateStruct[i].srcOffset << " -- " << updateStruct[i].srcOffset + (updateStruct[i].copySize/sizeof(T)) <<"!!!\n")
563  }
564  else
565  {
566  DEBUG_TEXT_LEVEL1(m_nameVerbose + " DEVICE_TO_DEVICE: From GPU_" << updateStruct[i].srcDevId << " -> GPU_" << m_deviceID << ", size: " << (updateStruct[i].copySize/sizeof(T)) << " # " << updateStruct[i].srcOffset << " -- " << updateStruct[i].srcOffset + (updateStruct[i].copySize/sizeof(T)) <<"!!!\n")
567  }
568 
573  enum cudaMemcpyKind memKind = ((updateStruct[i].srcIsHost)? cudaMemcpyHostToDevice : ((updateStruct[i].srcDevId == m_deviceID)? cudaMemcpyDeviceToDevice: cudaMemcpyDefault));
574 
575  sizeVec = updateStruct[i].copySize;
576  CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
577 
578 #ifdef USE_PINNED_MEMORY
579  if(m_usePitch)
580  assert(false);
581  else if(updateStruct[i].srcIsHost || updateStruct[i].srcDevId == m_deviceID)
582  CHECK_CUDA_ERROR(cudaMemcpyAsync(m_deviceDataPointer + updateStruct[i].dstOffset, updateStruct[i].src + updateStruct[i].srcOffset, sizeVec, memKind, (m_dev->m_streams[streamID])))
583  else
584  CHECK_CUDA_ERROR(cudaMemcpyPeer(m_deviceDataPointer + updateStruct[i].dstOffset, m_deviceID, updateStruct[i].src + updateStruct[i].srcOffset, updateStruct[i].srcDevId, sizeVec));
585 #else
586  if(m_usePitch)
587  assert(false);
588  else if(updateStruct[i].srcIsHost || updateStruct[i].srcDevId == m_deviceID)
589  CHECK_CUDA_ERROR(cudaMemcpy(m_deviceDataPointer + updateStruct[i].dstOffset, updateStruct[i].src + updateStruct[i].srcOffset, sizeVec, memKind))
590  else
591  CHECK_CUDA_ERROR(cudaMemcpyPeer(m_deviceDataPointer + updateStruct[i].dstOffset, m_deviceID, updateStruct[i].src + updateStruct[i].srcOffset, updateStruct[i].srcDevId, sizeVec));
592 #endif
593  }
594 #ifdef VERBOSE
595  T *h_ptr;
596  h_ptr = (T*)malloc(m_numElements*sizeof(T));
597  CHECK_CUDA_ERROR(cudaMemcpy(h_ptr, m_deviceDataPointer, m_numElements*sizeof(T), cudaMemcpyDeviceToHost));
598  std::cerr << "%% printing '" << m_nameVerbose << "' contents at GPU_" << m_deviceID << "\n";
599  for(int i=0; i<m_numElements; ++i)
600  {
601  std::cerr << h_ptr[i] << " ";
602  }
603  std::cerr << "\n-----------------------------\n";
604  free(h_ptr);
605 #endif
606 
607  m_valid = true;
608 }
609 
610 
616 template <typename T>
617 void DeviceMemPointer_CU<T>::copyHostToDevice(size_t numElements) const
618 {
619  if(m_hostDataPointer != NULL)
620  {
621  DEBUG_TEXT_LEVEL1(m_nameVerbose + " HOST_TO_DEVICE: Host -> GPU_" << m_deviceID << ", size: " << ((numElements<1)? m_numElements: numElements)<<" !!!\n")
622 
623  if(m_valid == true)
624  {
625  SKEPU_ERROR("Data copy is already valid.. copying data from host to device failed\n");
626  }
627 
628  size_t sizeVec;
629 
630  // used for pitch allocation.
631  int _rows, _cols;
632 
633  if(numElements < 1)
634  {
635  numElements = m_numElements;
636  }
637 
638  if(m_usePitch)
639  {
640  if( (numElements%m_cols)!=0 || (numElements/m_cols)<1 ) // using pitch option, memory copy must be proper, respecting rows and cols
641  {
642  SKEPU_ERROR("Cannot copy data using pitch option when size mismatches with rows and columns. numElements: "<<numElements<<", rows:"<< m_rows <<", m_cols: "<<m_cols<<"\n");
643  }
644 
645  _rows = numElements/m_cols;
646  _cols = m_cols;
647  }
648 
649  sizeVec = numElements*sizeof(T);
650 
651  CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
652 
653 #ifdef USE_PINNED_MEMORY
654  if(m_usePitch)
655  CHECK_CUDA_ERROR(cudaMemcpy2DAsync(m_deviceDataPointer,m_pitch*sizeof(T),m_hostDataPointer,_cols*sizeof(T), _cols*sizeof(T), _rows, cudaMemcpyHostToDevice, (m_dev->m_streams[0])))
656  else
657  CHECK_CUDA_ERROR(cudaMemcpyAsync(m_deviceDataPointer, m_hostDataPointer, sizeVec, cudaMemcpyHostToDevice, (m_dev->m_streams[0])));
658 #else
659  if(m_usePitch)
660  CHECK_CUDA_ERROR(cudaMemcpy2D(m_deviceDataPointer,m_pitch*sizeof(T),m_hostDataPointer,_cols*sizeof(T), _cols*sizeof(T), _rows, cudaMemcpyHostToDevice))
661  else
662  CHECK_CUDA_ERROR(cudaMemcpy(m_deviceDataPointer, m_hostDataPointer, sizeVec, cudaMemcpyHostToDevice));
663 #endif
664 
666  m_valid = true;
667 
668  m_deviceDataHasChanged = false;
669  }
670 }
671 
677 template <typename T>
678 void DeviceMemPointer_CU<T>::copyDeviceToHost(size_t numElements) const
679 {
680  if(m_valid == false)
681  {
682  SKEPU_ERROR("Data copy is not valid.. copying data from device to host failed: " << ((numElements<1)? m_numElements: numElements) << "\n");
683  }
684 
685  if(m_deviceDataHasChanged && m_hostDataPointer != NULL)
686  {
687  DEBUG_TEXT_LEVEL1(m_nameVerbose + " DEVICE_TO_HOST: GPU_" << m_deviceID << " -> Host, size: " << ((numElements<1)? m_numElements: numElements)<<" !!!\n")
688 
689  size_t sizeVec;
690 
691  // used for pitch allocation.
692  size_t _rows, _cols;
693 
694  if(numElements < 1)
695  {
696  numElements = m_numElements;
697  }
698  if(m_usePitch)
699  {
700  if( (numElements%m_cols)!=0 || (numElements/m_cols)<1 ) // using pitch option, memory copy must be proper, respecting rows and cols
701  {
702  SKEPU_ERROR("Cannot copy data using pitch option when size mismatches with rows and columns. numElements: "<<numElements<<", rows:"<< m_rows <<", m_cols: "<<m_cols<<"\n");
703  }
704 
705  _rows = numElements/m_cols;
706  _cols = m_cols;
707  }
708 
709  sizeVec = numElements*sizeof(T);
710 
711  CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
712 
713 #ifdef USE_PINNED_MEMORY
714  if(m_usePitch)
715  {
716  CHECK_CUDA_ERROR(cudaMemcpy2DAsync(m_hostDataPointer,_cols*sizeof(T),m_deviceDataPointer,m_pitch*sizeof(T), _cols*sizeof(T), _rows, cudaMemcpyDeviceToHost, (m_dev->m_streams[0])));
717  }
718  else
719  {
720  CHECK_CUDA_ERROR(cudaMemcpyAsync(m_hostDataPointer, m_deviceDataPointer, sizeVec, cudaMemcpyDeviceToHost, (m_dev->m_streams[0])));
721  }
722 #else
723  if(m_usePitch)
724  {
725  CHECK_CUDA_ERROR(cudaMemcpy2D(m_hostDataPointer,_cols*sizeof(T),m_deviceDataPointer,m_pitch*sizeof(T), _cols*sizeof(T), _rows, cudaMemcpyDeviceToHost));
726  }
727  else
728  {
729  CHECK_CUDA_ERROR(cudaMemcpy(m_hostDataPointer, m_deviceDataPointer, sizeVec, cudaMemcpyDeviceToHost));
730  }
731 #endif
732 
733  m_deviceDataHasChanged = false;
734  }
735 }
736 
740 template <typename T>
742 {
743  return m_deviceDataPointer;
744 }
745 
749 template <typename T>
751 {
752  return m_deviceID;
753 }
754 
758 template <typename T>
760 {
761  DEBUG_TEXT_LEVEL2(m_nameVerbose + " DEVICE_DATA_CHANGED: GPU_" << m_deviceID << ", size: " << m_numElements <<" !!!\n")
762  if(m_valid == false) // this is for data that is directly written on gpu....
763  {
764  DEBUG_TEXT_LEVEL2(m_nameVerbose + " DEVICE_DATA_MARKED_VALID: GPU_" << m_deviceID << ", size: " << m_numElements <<" !!!\n")
765  m_valid = true;
766  }
767  m_deviceDataHasChanged = true;
768 }
769 
773 template <typename T>
775 {
776  return m_deviceDataHasChanged;
777 }
778 
779 
784 template <typename T>
786 {
787  if(m_valid)
788  {
789  DEBUG_TEXT_LEVEL2(m_nameVerbose + " DEVICE_DATA_MARKED_INVALID: GPU_" << m_deviceID << ", size: " << m_numElements <<" !!!\n")
790  m_valid = false;
791  m_deviceDataHasChanged = false;
792 
793  DeviceAllocations_CU<int>::getInstance()->removeAllocation(m_deviceDataPointer,this,m_deviceID);
794  }
795 }
796 
800 template <typename T>
802 {
803  return m_valid;
804 }
805 
806 // used to free up memory on device after encountering cuda out of memory error
807 template <typename T>
809 {
811  bool b = dev_alloc->freeAllocation(m_numElements*sizeof(T), m_deviceID);
812  if (b)
813  {
814  cudaMalloc((void**)&m_deviceDataPointer, m_numElements*sizeof(T));
815  }
816  else
817  printf("device out of memory, didnt find a container to free \n");
818  return b;
819 }
820 
821 }
822 
823 #endif
824 
825 #endif
826 
827 
void addAllocation(void *datapointer, MemPointerBase *device_mem_pointer, int deviceid)
Definition: device_allocations_cu.inl:58
~DeviceMemPointer_CU()
Definition: device_mem_pointer_cu.h:285
void removeAllocation(void *datapointer, MemPointerBase *device_mem_pointer, int deviceid)
Definition: device_allocations_cu.inl:71
bool deviceDataHasChanged() const
Definition: device_mem_pointer_cu.h:774
T * getDeviceDataPointer() const
Definition: device_mem_pointer_cu.h:741
void copyHostToDevice(size_t numElements=0) const
Definition: device_mem_pointer_cu.h:617
bool doRangeOverlap(T *hostPtr, size_t numElements)
Definition: device_mem_pointer_cu.h:176
Definition: device_mem_pointer_cu.h:27
DeviceMemPointer_CU(T *start, size_t numElements, Device_CU *device, std::string name="")
Definition: device_mem_pointer_cu.h:196
bool doCopiesOverlap(DeviceMemPointer_CU< T > *otherCopy, bool oneUnitCheck=false)
Definition: device_mem_pointer_cu.h:150
bool freeAllocation(size_t minsize, int deviceid)
Definition: device_allocations_cu.inl:93
void changeDeviceData()
Definition: device_mem_pointer_cu.h:759
#define MAX_RANGES
Definition: device_mem_pointer_cu.h:38
unsigned int getDeviceID() const
Definition: device_mem_pointer_cu.h:750
bool isCopyValid() const
Definition: device_mem_pointer_cu.h:801
void copyAllRangesToDevice(UpdateInf< T > *updateStruct, const size_t sizeUpdStr, size_t streamID=0)
Definition: device_mem_pointer_cu.h:538
void copyDeviceToHost(size_t numElements=0) const
Definition: device_mem_pointer_cu.h:678
Contains a class declaration for the object that represents a CUDA device.
void copyInfFromHostToDevice(UpdateInf< T > *updateStruct, size_t &sizeUpdStr)
Definition: device_mem_pointer_cu.h:304
A class representing a CUDA device memory allocation for container.
Definition: device_mem_pointer_cu.h:58
static DeviceAllocations_CU * getInstance()
Definition: device_allocations_cu.inl:15
unsigned int getDeviceID() const
Definition: device_cu.h:346
void markCopyInvalid()
Definition: device_mem_pointer_cu.h:785
A class representing a CUDA device.
Definition: device_cu.h:30
Definition: device_allocations_cu.h:20
bool doOverlapAndCoverFully(DeviceMemPointer_CU< T > *otherCopy)
Definition: device_mem_pointer_cu.h:137
void copiesOverlapInf(DeviceMemPointer_CU< T > *otherCopy, UpdateInf< T > *updateStruct, size_t &sizeUpdStr)
Definition: device_mem_pointer_cu.h:350
#define MAX_COPYINF_SIZE
Definition: device_mem_pointer_cu.h:43
void resetRanges()
Definition: device_mem_pointer_cu.h:108