5 #ifndef DEVICE_MEM_POINTER_CU_H
6 #define DEVICE_MEM_POINTER_CU_H
43 void copyDeviceToDevice(T* copyToPointer,
int numElements,
int dstOffset = 0,
int srcOffset = 0)
const;
52 void copyHostToDevice_internal(T* srcPtr, T* destPtr,
int numElements,
int dstOffset=0)
const;
55 mutable std::vector<bool> vecMask;
58 T* m_rootHostDataPointer;
59 T* m_effectiveHostDataPointer;
63 T* m_deviceDataPointer;
64 T* m_effectiveDeviceDataPointer;
67 int m_effectiveNumElements;
70 mutable bool deviceDataHasChanged;
82 DeviceMemPointer_CU<T>::DeviceMemPointer_CU(T* root, T* start,
int numElements,
int deviceID,
int totalVecSize) : m_rootHostDataPointer(root), m_effectiveHostDataPointer(start), m_hostDataPointer(start), m_numElements(numElements), m_effectiveNumElements(numElements), m_deviceID(deviceID)
85 size_t sizeVec = numElements*
sizeof(T);
87 DEBUG_TEXT_LEVEL2(
"Alloc: " <<numElements <<
"\n")
89 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
90 devMemAllocTimer.start();
92 err = cudaMalloc((
void**)&m_deviceDataPointer, sizeVec);
93 if(err != cudaSuccess){std::cerr<<
"Error allocating memory on device\n";}
95 m_effectiveDeviceDataPointer = m_deviceDataPointer;
98 #ifdef USE_PESSIMISTIC_LMC
99 emptyRanges = std::vector<std::pair<int, int> >();
101 vecMask = std::vector<bool>( ((totalVecSize == -1) ? numElements: totalVecSize),
false);
104 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
105 devMemAllocTimer.stop();
107 deviceDataHasChanged =
false;
113 template <
typename T>
116 DEBUG_TEXT_LEVEL2(
"DeAlloc: " <<m_numElements <<
"\n")
118 cudaFree(m_deviceDataPointer);
127 template <
typename T>
130 if(m_hostDataPointer != NULL)
132 DEBUG_TEXT_LEVEL2(
"HOST_TO_DEVICE!!!\n")
138 if(numElements == -1)
140 totElements = m_numElements;
142 totElements = m_effectiveNumElements;
144 totElements = numElements;
146 sizeVec = totElements*
sizeof(T);
148 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
149 #ifdef SKEPU_MEASURE_ONLY_COPY
150 cudaThreadSynchronize();
157 err = cudaMemcpy(m_deviceDataPointer, m_hostDataPointer, sizeVec, cudaMemcpyHostToDevice);
158 tmpPointer = m_hostDataPointer;
162 err = cudaMemcpy(m_effectiveDeviceDataPointer, m_effectiveHostDataPointer, sizeVec, cudaMemcpyHostToDevice);
163 tmpPointer = m_effectiveHostDataPointer;
166 if(err != cudaSuccess){std::cerr<<
"Error copying data to device\n" <<cudaGetErrorString(err) <<
"\n";}
167 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
168 #ifdef SKEPU_MEASURE_ONLY_COPY
169 cudaThreadSynchronize();
174 #ifndef DEFAULT_LMC // reset flag for bits copied back
175 int _limit= (tmpPointer+totElements)-m_rootHostDataPointer;
176 for(
int i=(tmpPointer-m_rootHostDataPointer); i<_limit ; i++)
182 deviceDataHasChanged =
false;
198 template <
typename T>
201 if(m_hostDataPointer != NULL)
203 DEBUG_TEXT_LEVEL2(
"DEVICE_TO_DEVICE!!!\n")
208 if(numElements == -1)
209 sizeVec = m_numElements*
sizeof(T);
211 sizeVec = numElements*
sizeof(T);
213 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
214 #ifdef SKEPU_MEASURE_ONLY_COPY
215 cudaThreadSynchronize();
220 err = cudaMemcpy(copyToPointer+dstOffset, m_effectiveDeviceDataPointer+srcOffset, sizeVec, cudaMemcpyDeviceToDevice);
222 if(err != cudaSuccess){std::cerr<<
"Error copying data device to device\n" <<cudaGetErrorString(err) <<
"\n";}
223 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
224 #ifdef SKEPU_MEASURE_ONLY_COPY
225 cudaThreadSynchronize();
240 template <
typename T>
243 if(deviceDataHasChanged && m_hostDataPointer != NULL)
245 DEBUG_TEXT_LEVEL2(
"DEVICE_TO_HOST!!!\n")
250 #ifndef DEFAULT_LMC // ignore flags set by user especially copyLast
254 int _limit = ( (m_effectiveHostDataPointer+m_effectiveNumElements)-m_rootHostDataPointer);
256 for(
int i=(m_effectiveHostDataPointer-m_rootHostDataPointer); i< _limit;i++)
258 if(!isWrite && vecMask[i])
260 lower = (m_rootHostDataPointer + i) - m_effectiveHostDataPointer;
264 else if( (isWrite && !vecMask[i]) || (isWrite && vecMask[i] && i==(_limit-1) ) )
266 upper = (m_rootHostDataPointer + i) - m_effectiveHostDataPointer;
267 if(isWrite && vecMask[i] && i==(_limit-1))
274 if(lower!=-1 && upper!=-1)
276 sizeVec = (upper-lower)*
sizeof(T);
278 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
279 #ifdef SKEPU_MEASURE_ONLY_COPY
280 cudaThreadSynchronize();
282 copyDownTimer.start();
285 err = cudaMemcpy(m_effectiveHostDataPointer+lower, m_effectiveDeviceDataPointer+lower, sizeVec, cudaMemcpyDeviceToHost);
286 if(err != cudaSuccess){std::cerr<<
"Error copying data from device: " <<cudaGetErrorString(err) <<
"\n";}
288 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
289 #ifdef SKEPU_MEASURE_ONLY_COPY
290 cudaThreadSynchronize();
292 copyDownTimer.stop();
301 deviceDataHasChanged =
false;
305 if(numElements == -1)
307 totElements = m_numElements;
309 totElements = m_effectiveNumElements;
311 totElements = numElements;
313 sizeVec = totElements*
sizeof(T);
315 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
316 #ifdef SKEPU_MEASURE_ONLY_COPY
317 cudaThreadSynchronize();
319 copyDownTimer.start();
322 err = cudaMemcpy(m_hostDataPointer, m_deviceDataPointer, sizeVec, cudaMemcpyDeviceToHost);
324 err = cudaMemcpy(m_effectiveHostDataPointer, m_effectiveDeviceDataPointer, sizeVec, cudaMemcpyDeviceToHost);
326 if(err != cudaSuccess){std::cerr<<
"Error copying data from device: " <<cudaGetErrorString(err) <<
"\n";}
328 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
329 #ifdef SKEPU_MEASURE_ONLY_COPY
330 cudaThreadSynchronize();
332 copyDownTimer.stop();
335 deviceDataHasChanged =
false;
350 template <
typename T>
353 if(m_hostDataPointer != NULL)
355 DEBUG_TEXT_LEVEL2(
"DEVICE_TO_DEVICE!!!\n")
358 size_t sizeVec = numElements*sizeof(T);
360 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
361 #ifdef SKEPU_MEASURE_ONLY_COPY
362 cudaThreadSynchronize();
367 err = cudaMemcpy(dstPointer+dstOffset, srcPointer, sizeVec, cudaMemcpyHostToDevice);
369 if(err != cudaSuccess){std::cerr<<
"Error copying data device to device\n" <<cudaGetErrorString(err) <<
"\n";}
370 #ifdef SKEPU_MEASURE_TIME_DISTRIBUTION
371 #ifdef SKEPU_MEASURE_ONLY_COPY
372 cudaThreadSynchronize();
385 template <
typename T>
388 return m_deviceDataPointer;
394 template <
typename T>
403 template <
typename T>
406 deviceDataHasChanged =
true;
410 int _limit = (m_hostDataPointer+m_numElements)-m_rootHostDataPointer;
411 for(
int i=(m_hostDataPointer-m_rootHostDataPointer); i< _limit; i++)
~DeviceMemPointer_CU()
Definition: device_mem_pointer_cu.h:114
int getDeviceID() const
Definition: device_mem_pointer_cu.h:395
DeviceMemPointer_CU(T *root, T *start, int numElements, int deviceID, int totalVecSize=-1)
Definition: device_mem_pointer_cu.h:82
Defines a few macros that can be used to output text when debugging. The macros use std::cerr...
T * getDeviceDataPointer() const
Definition: device_mem_pointer_cu.h:386
void copyDeviceToHost(int numElements=-1, bool copyLast=true) const
Definition: device_mem_pointer_cu.h:241
void copyHostToDevice(int numElements=-1, bool copyLast=true) const
Definition: device_mem_pointer_cu.h:128
void changeDeviceData()
Definition: device_mem_pointer_cu.h:404
void copyDeviceToDevice(T *copyToPointer, int numElements, int dstOffset=0, int srcOffset=0) const
Definition: device_mem_pointer_cu.h:199
A class representing a CUDA device memory allocation.
Definition: device_mem_pointer_cu.h:34