SkePU 1.2
device_cu.h
#ifndef DEVICE_CU_H
#define DEVICE_CU_H

#ifdef SKEPU_CUDA

#include <iostream>
#include <cuda.h>

#include "../globals.h"

namespace skepu
{

/*!
 * A class representing a CUDA device. It caches the device properties queried
 * from the CUDA runtime, such as the number of cores per SM, the number of
 * concurrently supported kernels, and the thread and block limits.
 */
class Device_CU
{

public:
    cudaStream_t m_streams[MAX_POSSIBLE_CUDA_STREAMS_PER_GPU];

private:
    unsigned int m_deviceID;
    cudaDeviceProp m_deviceProp;
    size_t m_maxThreads;
    size_t m_maxBlocks;

    unsigned int m_noConcurrKernelsSupported;

    unsigned int m_noCoresSupported;

    /*!
     * Queries the CUDA device properties for the given device and caches the number
     * of concurrently supported kernels and the number of cores per SM.
     */
    void initDeviceProps(unsigned int device)
    {
        cudaError_t err;
        err = cudaGetDeviceProperties(&m_deviceProp, device);
        if (err != cudaSuccess)
        {
            SKEPU_ERROR("getDeviceProps failed!\n");
        }

        if (m_deviceProp.major == 9999 && m_deviceProp.minor == 9999)
        {
            m_noConcurrKernelsSupported = 1;
            m_noCoresSupported = 1;
        }
        else
        {
            m_noConcurrKernelsSupported = getMaxConcurKernelsSupported(m_deviceProp.major, m_deviceProp.minor);
            if (m_noConcurrKernelsSupported > MAX_POSSIBLE_CUDA_STREAMS_PER_GPU)
            {
                SKEPU_WARNING("Potential problem as the number of streams specified is larger than the maximum possible specified in MAX_POSSIBLE_CUDA_STREAMS_PER_GPU.\n");
                m_noConcurrKernelsSupported = MAX_POSSIBLE_CUDA_STREAMS_PER_GPU; // reset it to the maximum as the stream array is allocated with size MAX_POSSIBLE_CUDA_STREAMS_PER_GPU
            }

            m_noCoresSupported = ConvertSMVer2Cores_local(m_deviceProp.major, m_deviceProp.minor);
        }
    }

    /*!
     * Maps an SM version (major, minor) to the number of CUDA cores per SM.
     */
    int ConvertSMVer2Cores_local(int major, int minor)
    {
        // Defines for GPU architecture types (uses the SM version to determine the number of cores per SM).
        typedef struct
        {
            int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
            int Cores;
        } sSMtoCores;

        sSMtoCores nGpuArchCoresPerSM[] =
        {
            { 0x10, 8 },   // Tesla Generation (SM 1.0) G80 class
            { 0x11, 8 },   // Tesla Generation (SM 1.1) G8x class
            { 0x12, 8 },   // Tesla Generation (SM 1.2) G9x class
            { 0x13, 8 },   // Tesla Generation (SM 1.3) GT200 class
            { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
            { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
            { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
            { -1, -1 }
        };

        int index = 0;
        while (nGpuArchCoresPerSM[index].SM != -1)
        {
            if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
                return nGpuArchCoresPerSM[index].Cores;

            index++;
        }
        SKEPU_WARNING("MapSMtoCores undefined SM version " << major << "," << minor << "\n");
        return -1;
    }

    /*!
     * Maps an SM version (major, minor) to the maximum number of concurrently executing kernels.
     */
    int getMaxConcurKernelsSupported(int major, int minor)
    {
        // Defines for GPU architecture types (uses the SM version to determine the number of concurrent kernels supported).
        typedef struct
        {
            int SM; // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
            int totConcurrKernels;
        } sSMtoCores;

        sSMtoCores nGpuArchCoresPerSM[] =
        {
            { 0x10, 1 },  // Tesla Generation (SM 1.0) G80 class
            { 0x11, 1 },  // Tesla Generation (SM 1.1) G8x class
            { 0x12, 1 },  // Tesla Generation (SM 1.2) G9x class
            { 0x13, 1 },  // Tesla Generation (SM 1.3) GT200 class
            { 0x20, 4 },  // Fermi Generation (SM 2.0) GF100 class
            { 0x21, 16 }, // Fermi Generation (SM 2.1) GF10x class
            { 0x30, 16 }, // Kepler Generation (SM 3.0) GK10x class
            { 0x32, 4 },  // Kepler Generation (SM 3.2)
            { 0x35, 32 }, // Kepler Generation (SM 3.5) GK11x class
            { 0x37, 32 }, // Kepler Generation (SM 3.7)
            { 0x50, 32 }, // Maxwell Generation (SM 5.0) GM10x class
            { 0x52, 32 }, // Maxwell Generation (SM 5.2) GM20x class
            { -1, 1 }
        };

        int index = 0;
        while (nGpuArchCoresPerSM[index].SM != -1)
        {
            if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
                return nGpuArchCoresPerSM[index].totConcurrKernels;

            index++;
        }
        SKEPU_WARNING("MapSMtoCores undefined SM version " << major << "," << minor << "\n");
        return 1;
    }

public:

    /*!
     * The constructor. Sets the active CUDA device, initializes its properties,
     * and, when pinned memory is used, creates the CUDA streams.
     */
    Device_CU(unsigned int id)
    {
        m_deviceID = id;

        cudaSetDevice(m_deviceID);

        initDeviceProps(id);

#ifdef USE_PINNED_MEMORY
        for (unsigned int i = 0; i < m_noConcurrKernelsSupported; i++)
            cudaStreamCreate(&(m_streams[i]));
#endif

        if (m_deviceProp.major == 1 && m_deviceProp.minor < 2)
        {
            m_maxThreads = 256;
        }
        else
        {
            m_maxThreads = m_deviceProp.maxThreadsPerBlock;
        }

        m_maxBlocks = m_deviceProp.maxGridSize[0];
    }

    /*!
     * The destructor.
     */
    ~Device_CU()
    {
        // Explicitly destroys and cleans up all resources associated with the current device in the current process.
        // Any subsequent API call to this device will reinitialize the device.
        cudaSetDevice(m_deviceID);
        cudaDeviceReset();
    }

    bool isOverlapSupported()
    {
        return m_deviceProp.deviceOverlap;
    }

    size_t getMaxBlockSize() const
    {
        return m_deviceProp.maxThreadsPerBlock;
    }

    int getMajorVersion() const
    {
        return m_deviceProp.major;
    }

    int getMinorVersion() const
    {
        return m_deviceProp.minor;
    }

    unsigned int getSmPerMultiProc() const
    {
        return m_noCoresSupported;
    }

    std::string getDeviceName() const
    {
        return m_deviceProp.name;
    }

    int getClockRate() const
    {
        return m_deviceProp.clockRate;
    }

    int getAsyncEngineCount() const
    {
        return m_deviceProp.asyncEngineCount;
    }

    bool IsConcurrentKernels() const
    {
        return m_deviceProp.concurrentKernels;
    }

    unsigned int getNoConcurrentKernels() const
    {
        return m_noConcurrKernelsSupported;
    }

    int getNumComputeUnits() const
    {
        return m_deviceProp.multiProcessorCount;
    }

    size_t getGlobalMemSize() const
    {
        return m_deviceProp.totalGlobalMem;
    }

    size_t getSharedMemPerBlock() const
    {
        return m_deviceProp.sharedMemPerBlock;
    }

    size_t getMaxThreads() const
    {
#ifdef SKEPU_MAX_GPU_THREADS
        return SKEPU_MAX_GPU_THREADS;
#else
        return m_maxThreads;
#endif
    }

    size_t getMaxBlocks() const
    {
#ifdef SKEPU_MAX_GPU_BLOCKS
        return SKEPU_MAX_GPU_BLOCKS;
#else
        return m_maxBlocks;
#endif
    }

    unsigned int getDeviceID() const
    {
        return m_deviceID;
    }
};

} // namespace skepu

#endif // SKEPU_CUDA

#endif // DEVICE_CU_H
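The class is a thin wrapper around the CUDA runtime's device queries, so a small host-side program can exercise it directly. The sketch below is not part of SkePU: it assumes SKEPU_CUDA is defined at compile time and that this header (together with SkePU's globals.h) is on the include path, and the printCudaDevices helper is a hypothetical name used only for illustration. It enumerates the available devices and prints a few of the cached properties.

// A minimal sketch (not part of SkePU): enumerate CUDA devices and print the
// properties exposed by skepu::Device_CU. Compile with nvcc and -DSKEPU_CUDA.
#include <iostream>
#include <string>

#include "device_cu.h" // assumed include path; adjust to your SkePU installation

void printCudaDevices() // hypothetical helper, for illustration only
{
    int numDevices = 0;
    cudaGetDeviceCount(&numDevices);

    for (int id = 0; id < numDevices; ++id)
    {
        skepu::Device_CU device(id);

        std::cout << "Device " << device.getDeviceID() << ": " << device.getDeviceName() << "\n"
                  << "  compute capability: " << device.getMajorVersion() << "."
                                              << device.getMinorVersion() << "\n"
                  << "  multiprocessors:    " << device.getNumComputeUnits() << "\n"
                  << "  cores per SM:       " << device.getSmPerMultiProc() << "\n"
                  << "  concurrent kernels: " << device.getNoConcurrentKernels() << "\n"
                  << "  global memory:      " << device.getGlobalMemSize() << " bytes\n"
                  << "  max threads/block:  " << device.getMaxThreads() << "\n"
                  << "  max blocks (dim 0): " << device.getMaxBlocks() << "\n";
    } // note: each Device_CU destructor calls cudaDeviceReset() on its device
}

int main()
{
    printCudaDevices();
    return 0;
}

In SkePU itself these objects are created and owned by the runtime's device-management code, so user programs normally never instantiate Device_CU directly; the loop above only shows what information the accessors expose.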