5 #ifndef DEVICE_MEM_POINTER_CU_H
6 #define DEVICE_MEM_POINTER_CU_H
15 #include "mem_pointer_base.h"
43 #define MAX_COPYINF_SIZE 33
74 void clearDevicePointer();
86 std::string m_nameVerbose;
101 std::pair<T*, int> m_rangesToCompare[
MAX_RANGES];
102 size_t m_numOfRanges;
104 T* m_hostDataPointer;
105 T* m_deviceDataPointer;
106 size_t m_numElements;
112 m_rangesToCompare[m_numOfRanges] = std::make_pair(m_hostDataPointer, m_numElements);
117 unsigned int m_deviceID;
126 mutable bool m_valid;
128 mutable bool m_deviceDataHasChanged;
130 bool freeUpDeviceMem();
136 template <
typename T>
139 if(m_hostDataPointer <= otherCopy->m_hostDataPointer && (m_hostDataPointer+m_numElements) >= (otherCopy->m_hostDataPointer+otherCopy->m_numElements))
149 template <
typename T>
153 assert(m_numOfRanges == 1 && otherCopy->m_numOfRanges == 1);
155 if(m_numOfRanges < 1)
158 for(
size_t i=0; i<m_numOfRanges; ++i)
160 T *hostDataPointer = m_rangesToCompare[i].first;
161 size_t numElements = m_rangesToCompare[i].second;
163 if( hostDataPointer >= otherCopy->m_hostDataPointer && hostDataPointer < (otherCopy->m_hostDataPointer + otherCopy->m_numElements) )
166 if( otherCopy->m_hostDataPointer >= hostDataPointer && otherCopy->m_hostDataPointer < (hostDataPointer + numElements) )
175 template <
typename T>
178 if( (m_hostDataPointer+m_numElements) <= hostDataPointer )
181 if( (hostDataPointer+numElements) <= m_hostDataPointer )
195 template <
typename T>
196 DeviceMemPointer_CU<T>::DeviceMemPointer_CU(T* start,
size_t numElements,
Device_CU *device, std::string name) : m_hostDataPointer(start), m_numElements(numElements), m_dev(device), m_rows(1), m_cols(numElements), m_pitch(numElements), m_valid(false), m_usePitch(false), m_numOfRanges(0)
198 m_nameVerbose = name;
200 size_t sizeVec = m_numElements*
sizeof(T);
203 CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
205 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" Alloc: " <<m_numElements <<
", GPU_" << m_deviceID <<
"\n")
209 er = cudaMalloc((
void**)&m_deviceDataPointer, sizeVec);
210 if (er == cudaErrorMemoryAllocation)
215 CHECK_CUDA_ERROR(er);
217 dev_alloc->
addAllocation((
void*)m_deviceDataPointer,
this,m_deviceID);
220 m_rangesToCompare[m_numOfRanges] = std::make_pair(m_hostDataPointer, m_numElements);
223 m_deviceDataHasChanged =
false;
238 template <
typename T>
239 DeviceMemPointer_CU<T>::DeviceMemPointer_CU(T* start,
size_t rows,
size_t cols,
Device_CU *device,
bool usePitch, std::string name) : m_hostDataPointer(start), m_numElements(rows*cols), m_rows(rows), m_cols(cols), m_dev(device), m_valid(false), m_usePitch(usePitch), m_numOfRanges(0)
241 m_nameVerbose = name;
243 size_t sizeVec = m_numElements*
sizeof(T);
246 CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
248 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" Alloc: " <<m_numElements <<
", GPU_" << m_deviceID <<
"\n")
252 CHECK_CUDA_ERROR(cudaMallocPitch((
void**)&m_deviceDataPointer, &m_pitch, cols *
sizeof(T), rows));
253 m_pitch = (m_pitch)/
sizeof(T);
259 er = cudaMalloc((
void**)&m_deviceDataPointer, sizeVec);
260 if (er == cudaErrorMemoryAllocation)
265 CHECK_CUDA_ERROR(er);
267 dev_alloc->
addAllocation((
void*)m_deviceDataPointer,
this,m_deviceID);
273 m_rangesToCompare[m_numOfRanges] = std::make_pair(m_hostDataPointer, m_numElements);
276 m_deviceDataHasChanged =
false;
284 template <
typename T>
287 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" DeAlloc: " <<m_numElements <<
", GPU_" << m_deviceID <<
"\n")
289 CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
293 cudaFree(m_deviceDataPointer);
294 m_deviceDataPointer = NULL;
303 template <
typename T>
306 for(
size_t i=0; i<m_numOfRanges; ++i)
309 T *hostDataPointer = m_rangesToCompare[i].first;
310 size_t numElements = m_rangesToCompare[i].second;
312 assert((hostDataPointer - m_hostDataPointer)>=0);
313 int offset = hostDataPointer - m_hostDataPointer;
315 size_t sizeVec = numElements*
sizeof(T);
317 updateStruct[sizeUpdStr].srcDevId = -1;
318 updateStruct[sizeUpdStr].src = m_hostDataPointer;
320 updateStruct[sizeUpdStr].srcOffset = offset;
321 updateStruct[sizeUpdStr].dstOffset = offset;
322 updateStruct[sizeUpdStr].copySize = sizeVec;
323 updateStruct[sizeUpdStr].srcIsHost =
true;
328 if(i!=m_numOfRanges-1)
330 for(
size_t j=i+1; j<m_numOfRanges; ++j)
332 m_rangesToCompare[j-1] = m_rangesToCompare[j];
349 template <
typename T>
352 for(
size_t i=0; i<m_numOfRanges; ++i)
355 T *hostDataPointer = m_rangesToCompare[i].first;
356 size_t numElements = m_rangesToCompare[i].second;
359 if(otherCopy->
doRangeOverlap(hostDataPointer, numElements) ==
false)
362 assert((hostDataPointer - m_hostDataPointer)>=0);
363 int offset = hostDataPointer - m_hostDataPointer;
367 if( hostDataPointer >= otherCopy->m_hostDataPointer && (hostDataPointer + numElements) <= (otherCopy->m_hostDataPointer + otherCopy->m_numElements) )
369 size_t sizeVec = numElements*
sizeof(T);
370 int srcoffset = hostDataPointer - otherCopy->m_hostDataPointer;
372 updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
373 updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
375 updateStruct[sizeUpdStr].srcOffset = srcoffset;
376 updateStruct[sizeUpdStr].dstOffset = offset;
377 updateStruct[sizeUpdStr].copySize = sizeVec;
378 updateStruct[sizeUpdStr].srcIsHost =
false;
383 if(i!=m_numOfRanges-1)
385 for(
int j=i+1; j<m_numOfRanges; ++j)
387 m_rangesToCompare[j-1] = m_rangesToCompare[j];
394 else if( otherCopy->m_hostDataPointer >= hostDataPointer && (otherCopy->m_hostDataPointer + otherCopy->m_numElements) <= (hostDataPointer + numElements) )
396 size_t sizeVec = otherCopy->m_numElements*
sizeof(T);
397 int dstoffset = otherCopy->m_hostDataPointer - hostDataPointer;
399 updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
400 updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
402 updateStruct[sizeUpdStr].srcOffset = 0;
403 updateStruct[sizeUpdStr].dstOffset = offset + dstoffset;
404 updateStruct[sizeUpdStr].copySize = sizeVec;
405 updateStruct[sizeUpdStr].srcIsHost =
false;
410 if(i!=m_numOfRanges-1)
412 for(
size_t j=i+1; j<m_numOfRanges; ++j)
414 m_rangesToCompare[j-1] = m_rangesToCompare[j];
423 m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer, dstoffset);
429 int size = ( (hostDataPointer+numElements) - (otherCopy->m_hostDataPointer + otherCopy->m_numElements) );
432 int tmpoffset = (otherCopy->m_hostDataPointer + otherCopy->m_numElements) - hostDataPointer;
433 m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer+tmpoffset, size);
439 else if( otherCopy->m_hostDataPointer >= hostDataPointer && (otherCopy->m_hostDataPointer) < (hostDataPointer + numElements) )
441 size_t sizeVec = ( (hostDataPointer + numElements) - otherCopy->m_hostDataPointer) *
sizeof(T);
442 int dstoffset = otherCopy->m_hostDataPointer - hostDataPointer;
444 updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
445 updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
447 updateStruct[sizeUpdStr].srcOffset = 0;
448 updateStruct[sizeUpdStr].dstOffset = offset + dstoffset;
449 updateStruct[sizeUpdStr].copySize = sizeVec;
450 updateStruct[sizeUpdStr].srcIsHost =
false;
455 if(i!=m_numOfRanges-1)
457 for(
size_t j=i+1; j<m_numOfRanges; ++j)
459 m_rangesToCompare[j-1] = m_rangesToCompare[j];
468 m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer, dstoffset);
474 else if( hostDataPointer >= otherCopy->m_hostDataPointer && hostDataPointer < (otherCopy->m_hostDataPointer + otherCopy->m_numElements) )
476 size_t sizeVec = ( (otherCopy->m_hostDataPointer + otherCopy->m_numElements) - hostDataPointer) *
sizeof(T);
477 int srcoffset = hostDataPointer - otherCopy->m_hostDataPointer;
479 updateStruct[sizeUpdStr].srcDevId = otherCopy->m_deviceID;
480 updateStruct[sizeUpdStr].src = otherCopy->m_deviceDataPointer;
482 updateStruct[sizeUpdStr].srcOffset = srcoffset;
483 updateStruct[sizeUpdStr].dstOffset = offset;
484 updateStruct[sizeUpdStr].copySize = sizeVec;
485 updateStruct[sizeUpdStr].srcIsHost =
false;
490 if(i!=m_numOfRanges-1)
492 for(
size_t j=i+1; j<m_numOfRanges; ++j)
494 m_rangesToCompare[j-1] = m_rangesToCompare[j];
501 int size = ( (hostDataPointer+numElements) - (otherCopy->m_hostDataPointer + otherCopy->m_numElements) );
504 int tmpoffset = (otherCopy->m_hostDataPointer + otherCopy->m_numElements) - hostDataPointer;
505 m_rangesToCompare[m_numOfRanges] = std::make_pair(hostDataPointer+tmpoffset, size);
515 template <
typename T>
518 return sizeof(T)*m_numElements;
521 template <
typename T>
522 void DeviceMemPointer_CU<T>::clearDevicePointer()
524 cudaFree(m_deviceDataPointer);
525 m_deviceDataPointer = NULL;
537 template <
typename T>
540 assert(m_valid ==
false);
543 if (m_deviceDataPointer == NULL)
546 er = cudaMalloc((
void**)&m_deviceDataPointer, m_numElements*
sizeof(T));
547 if (er == cudaErrorMemoryAllocation)
554 assert(m_deviceDataHasChanged ==
false);
558 for(
int i=0; i<sizeUpdStr; ++i)
560 if(updateStruct[i].srcIsHost)
562 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" HOST_TO_DEVICE: Host -> GPU_" << m_deviceID <<
", size: " << (updateStruct[i].copySize/
sizeof(T)) <<
" # " << updateStruct[i].srcOffset <<
" -- " << updateStruct[i].srcOffset + (updateStruct[i].copySize/
sizeof(T)) <<
"!!!\n")
566 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" DEVICE_TO_DEVICE: From GPU_" << updateStruct[i].srcDevId <<
" -> GPU_" << m_deviceID <<
", size: " << (updateStruct[i].copySize/
sizeof(T)) <<
" # " << updateStruct[i].srcOffset <<
" -- " << updateStruct[i].srcOffset + (updateStruct[i].copySize/
sizeof(T)) <<
"!!!\n")
573 enum cudaMemcpyKind memKind = ((updateStruct[i].srcIsHost)? cudaMemcpyHostToDevice : ((updateStruct[i].srcDevId == m_deviceID)? cudaMemcpyDeviceToDevice: cudaMemcpyDefault));
575 sizeVec = updateStruct[i].copySize;
576 CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
578 #ifdef USE_PINNED_MEMORY
581 else if(updateStruct[i].srcIsHost || updateStruct[i].srcDevId == m_deviceID)
582 CHECK_CUDA_ERROR(cudaMemcpyAsync(m_deviceDataPointer + updateStruct[i].dstOffset, updateStruct[i].src + updateStruct[i].srcOffset, sizeVec, memKind, (m_dev->m_streams[streamID])))
584 CHECK_CUDA_ERROR(cudaMemcpyPeer(m_deviceDataPointer + updateStruct[i].dstOffset, m_deviceID, updateStruct[i].src + updateStruct[i].srcOffset, updateStruct[i].srcDevId, sizeVec));
588 else if(updateStruct[i].srcIsHost || updateStruct[i].srcDevId == m_deviceID)
589 CHECK_CUDA_ERROR(cudaMemcpy(m_deviceDataPointer + updateStruct[i].dstOffset, updateStruct[i].src + updateStruct[i].srcOffset, sizeVec, memKind))
591 CHECK_CUDA_ERROR(cudaMemcpyPeer(m_deviceDataPointer + updateStruct[i].dstOffset, m_deviceID, updateStruct[i].src + updateStruct[i].srcOffset, updateStruct[i].srcDevId, sizeVec));
596 h_ptr = (T*)malloc(m_numElements*
sizeof(T));
597 CHECK_CUDA_ERROR(cudaMemcpy(h_ptr, m_deviceDataPointer, m_numElements*
sizeof(T), cudaMemcpyDeviceToHost));
598 std::cerr <<
"%% printing '" << m_nameVerbose <<
"' contents at GPU_" << m_deviceID <<
"\n";
599 for(
int i=0; i<m_numElements; ++i)
601 std::cerr << h_ptr[i] <<
" ";
603 std::cerr <<
"\n-----------------------------\n";
616 template <
typename T>
619 if(m_hostDataPointer != NULL)
621 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" HOST_TO_DEVICE: Host -> GPU_" << m_deviceID <<
", size: " << ((numElements<1)? m_numElements: numElements)<<
" !!!\n")
625 SKEPU_ERROR(
"Data copy is already valid.. copying data from host to device failed\n");
635 numElements = m_numElements;
640 if( (numElements%m_cols)!=0 || (numElements/m_cols)<1 )
642 SKEPU_ERROR(
"Cannot copy data using pitch option when size mismatches with rows and columns. numElements: "<<numElements<<
", rows:"<< m_rows <<
", m_cols: "<<m_cols<<
"\n");
645 _rows = numElements/m_cols;
649 sizeVec = numElements*
sizeof(T);
651 CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
653 #ifdef USE_PINNED_MEMORY
655 CHECK_CUDA_ERROR(cudaMemcpy2DAsync(m_deviceDataPointer,m_pitch*
sizeof(T),m_hostDataPointer,_cols*
sizeof(T), _cols*
sizeof(T), _rows, cudaMemcpyHostToDevice, (m_dev->m_streams[0])))
657 CHECK_CUDA_ERROR(cudaMemcpyAsync(m_deviceDataPointer, m_hostDataPointer, sizeVec, cudaMemcpyHostToDevice, (m_dev->m_streams[0])));
660 CHECK_CUDA_ERROR(cudaMemcpy2D(m_deviceDataPointer,m_pitch*
sizeof(T),m_hostDataPointer,_cols*
sizeof(T), _cols*
sizeof(T), _rows, cudaMemcpyHostToDevice))
662 CHECK_CUDA_ERROR(cudaMemcpy(m_deviceDataPointer, m_hostDataPointer, sizeVec, cudaMemcpyHostToDevice));
668 m_deviceDataHasChanged =
false;
677 template <
typename T>
682 SKEPU_ERROR(
"Data copy is not valid.. copying data from device to host failed: " << ((numElements<1)? m_numElements: numElements) <<
"\n");
685 if(m_deviceDataHasChanged && m_hostDataPointer != NULL)
687 DEBUG_TEXT_LEVEL1(m_nameVerbose +
" DEVICE_TO_HOST: GPU_" << m_deviceID <<
" -> Host, size: " << ((numElements<1)? m_numElements: numElements)<<
" !!!\n")
696 numElements = m_numElements;
700 if( (numElements%m_cols)!=0 || (numElements/m_cols)<1 )
702 SKEPU_ERROR(
"Cannot copy data using pitch option when size mismatches with rows and columns. numElements: "<<numElements<<
", rows:"<< m_rows <<
", m_cols: "<<m_cols<<
"\n");
705 _rows = numElements/m_cols;
709 sizeVec = numElements*
sizeof(T);
711 CHECK_CUDA_ERROR(cudaSetDevice(m_deviceID));
713 #ifdef USE_PINNED_MEMORY
716 CHECK_CUDA_ERROR(cudaMemcpy2DAsync(m_hostDataPointer,_cols*
sizeof(T),m_deviceDataPointer,m_pitch*
sizeof(T), _cols*
sizeof(T), _rows, cudaMemcpyDeviceToHost, (m_dev->m_streams[0])));
720 CHECK_CUDA_ERROR(cudaMemcpyAsync(m_hostDataPointer, m_deviceDataPointer, sizeVec, cudaMemcpyDeviceToHost, (m_dev->m_streams[0])));
725 CHECK_CUDA_ERROR(cudaMemcpy2D(m_hostDataPointer,_cols*
sizeof(T),m_deviceDataPointer,m_pitch*
sizeof(T), _cols*
sizeof(T), _rows, cudaMemcpyDeviceToHost));
729 CHECK_CUDA_ERROR(cudaMemcpy(m_hostDataPointer, m_deviceDataPointer, sizeVec, cudaMemcpyDeviceToHost));
733 m_deviceDataHasChanged =
false;
740 template <
typename T>
743 return m_deviceDataPointer;
749 template <
typename T>
758 template <
typename T>
761 DEBUG_TEXT_LEVEL2(m_nameVerbose +
" DEVICE_DATA_CHANGED: GPU_" << m_deviceID <<
", size: " << m_numElements <<
" !!!\n")
764 DEBUG_TEXT_LEVEL2(m_nameVerbose +
" DEVICE_DATA_MARKED_VALID: GPU_" << m_deviceID <<
", size: " << m_numElements <<
" !!!\n")
767 m_deviceDataHasChanged =
true;
773 template <
typename T>
776 return m_deviceDataHasChanged;
784 template <
typename T>
789 DEBUG_TEXT_LEVEL2(m_nameVerbose +
" DEVICE_DATA_MARKED_INVALID: GPU_" << m_deviceID <<
", size: " << m_numElements <<
" !!!\n")
791 m_deviceDataHasChanged =
false;
800 template <
typename T>
807 template <
typename T>
811 bool b = dev_alloc->
freeAllocation(m_numElements*
sizeof(T), m_deviceID);
814 cudaMalloc((
void**)&m_deviceDataPointer, m_numElements*
sizeof(T));
817 printf(
"device out of memory, didnt find a container to free \n");
void addAllocation(void *datapointer, MemPointerBase *device_mem_pointer, int deviceid)
Definition: device_allocations_cu.inl:58
~DeviceMemPointer_CU()
Definition: device_mem_pointer_cu.h:285
void removeAllocation(void *datapointer, MemPointerBase *device_mem_pointer, int deviceid)
Definition: device_allocations_cu.inl:71
bool deviceDataHasChanged() const
Definition: device_mem_pointer_cu.h:774
T * getDeviceDataPointer() const
Definition: device_mem_pointer_cu.h:741
void copyHostToDevice(size_t numElements=0) const
Definition: device_mem_pointer_cu.h:617
bool doRangeOverlap(T *hostPtr, size_t numElements)
Definition: device_mem_pointer_cu.h:176
Definition: device_mem_pointer_cu.h:27
DeviceMemPointer_CU(T *start, size_t numElements, Device_CU *device, std::string name="")
Definition: device_mem_pointer_cu.h:196
bool doCopiesOverlap(DeviceMemPointer_CU< T > *otherCopy, bool oneUnitCheck=false)
Definition: device_mem_pointer_cu.h:150
bool freeAllocation(size_t minsize, int deviceid)
Definition: device_allocations_cu.inl:93
void changeDeviceData()
Definition: device_mem_pointer_cu.h:759
#define MAX_RANGES
Definition: device_mem_pointer_cu.h:38
unsigned int getDeviceID() const
Definition: device_mem_pointer_cu.h:750
bool isCopyValid() const
Definition: device_mem_pointer_cu.h:801
void copyAllRangesToDevice(UpdateInf< T > *updateStruct, const size_t sizeUpdStr, size_t streamID=0)
Definition: device_mem_pointer_cu.h:538
void copyDeviceToHost(size_t numElements=0) const
Definition: device_mem_pointer_cu.h:678
Contains a class declaration for the object that represents a CUDA device.
void copyInfFromHostToDevice(UpdateInf< T > *updateStruct, size_t &sizeUpdStr)
Definition: device_mem_pointer_cu.h:304
A class representing a CUDA device memory allocation for container.
Definition: device_mem_pointer_cu.h:58
static DeviceAllocations_CU * getInstance()
Definition: device_allocations_cu.inl:15
unsigned int getDeviceID() const
Definition: device_cu.h:346
void markCopyInvalid()
Definition: device_mem_pointer_cu.h:785
A class representing a CUDA device.
Definition: device_cu.h:30
Definition: device_allocations_cu.h:20
bool doOverlapAndCoverFully(DeviceMemPointer_CU< T > *otherCopy)
Definition: device_mem_pointer_cu.h:137
void copiesOverlapInf(DeviceMemPointer_CU< T > *otherCopy, UpdateInf< T > *updateStruct, size_t &sizeUpdStr)
Definition: device_mem_pointer_cu.h:350
#define MAX_COPYINF_SIZE
Definition: device_mem_pointer_cu.h:43
void resetRanges()
Definition: device_mem_pointer_cu.h:108