10 #include "StringTools.h"
18 /*----------------------------------------------------------------------*\
20 \*---------------------------------------------------------------------*/
22 /*--------------------------------------*\
24 \*-------------------------------------*/
26 /*--------------------------------------*\
28 \*-------------------------------------*/
30 static void cout01(int isTrue
);
31 static void coutBool(bool isFlag
);
32 static int dim(const dim3
& dim
);
34 /*----------------------------------------------------------------------*\
36 \*---------------------------------------------------------------------*/
38 /*--------------------------------------*\
40 \*-------------------------------------*/
46 void Device::synchronize(void)
48 cudaDeviceSynchronize();
55 void Device::checkKernelError(const char *message
)
57 cudaError_t error
= cudaGetLastError();
58 if (error
!= cudaSuccess
)
62 fprintf(stderr
, "\n[CUDA ERROR] : Kernel Execution Failed : %s: %s\n\n", message
, cudaGetErrorString(error
));
66 fprintf(stderr
, "\n[CUDA ERROR] : Kernel Execution Failed : %s\n\n", cudaGetErrorString(error
));
72 void Device::checkDimError(const dim3
& dg
, const dim3
& db
)
75 checkDimOptimiser(dg
, db
);
78 void Device::checkDimOptimiser(const dim3
& dg
, const dim3
& db
)
80 int mpCount
= Device::getMPCount();
81 int warpSize
= Device::getWarpSize();
83 int nbBlock
= dim(dg
);
84 int nbThreadBlock
= dim(db
);
87 if (nbBlock
< mpCount
|| nbBlock
% mpCount
!= 0)
89 string messageGrid
= "nbBlock = " + StringTools::toString(nbBlock
) + " : Heuristic : For best performance, for nbBlock, try a multiple of #MP="
90 + StringTools::toString(mpCount
);
92 cout
<< "dg" << "(" << dg
.x
<< "," << dg
.y
<< "," << dg
.z
<< ") " << messageGrid
<< endl
;
96 if (nbThreadBlock
< warpSize
|| nbThreadBlock
% warpSize
!= 0) // TODO kepler
98 string messageBlock
= "nbThreadBlock = " + StringTools::toString(nbThreadBlock
)
99 + " : Heuristic : For best performance, for nbThreadBlock, try a multiple of |warp|=" + StringTools::toString(warpSize
);
101 cout
<< "db" << "(" << db
.x
<< "," << db
.y
<< "," << db
.z
<< ") " << messageBlock
<< endl
;
105 void Device::assertDim(const dim3
& dg
, const dim3
& db
)
109 dim3 dimGridMax
= Device::getMaxGridDim();
111 assert(dg
.x
<= dimGridMax
.x
);
112 assert(dg
.y
<= dimGridMax
.y
);
113 assert(dg
.z
<= dimGridMax
.z
);
118 dim3 dimBlockMax
= Device::getMaxBlockDim();
120 assert(db
.x
<= dimBlockMax
.x
);
121 assert(db
.y
<= dimBlockMax
.y
);
122 assert(db
.z
<= dimBlockMax
.z
);
126 assert(dim(db
) <= getMaxThreadPerBlock());
129 void Device::print(const dim3
& dg
, const dim3
& db
)
131 cout
<< "dg" << "(" << dg
.x
<< "," << dg
.y
<< "," << dg
.z
<< ") " << endl
;
132 cout
<< "db" << "(" << db
.x
<< "," << db
.y
<< "," << db
.z
<< ") " << endl
;
135 int Device::dim(const dim3
& dim
)
137 return dim
.x
* dim
.y
* dim
.z
;
140 int Device::nbThread(const dim3
& dg
, const dim3
& db
)
142 return dim(dg
) * dim(db
);
149 int Device::getDeviceId(void)
152 HANDLE_ERROR(cudaGetDevice(&deviceId
));
157 int Device::getDeviceCount(void)
160 HANDLE_ERROR(cudaGetDeviceCount(&nbDevice
));
165 cudaDeviceProp
Device::getDeviceProp(int idDevice
)
168 HANDLE_ERROR(cudaGetDeviceProperties(&prop
, idDevice
));
172 cudaDeviceProp
Device::getDeviceProp(void)
174 return getDeviceProp(getDeviceId());
177 /*--------------------------------------*\
179 \*-------------------------------------*/
185 string
Device::getNameSimple(int idDevice
)
187 return getDeviceProp(idDevice
).name
;
190 string
Device::getNameSimple()
192 return getNameSimple(getDeviceId());
195 string
Device::getName(int idDevice
)
197 string id
= StringTools::toString(idDevice
);
198 string a
= StringTools::toString(getCapacityMajor(idDevice
));
199 string b
= StringTools::toString(getCapacityMinor(idDevice
));
201 return "[" + getNameSimple(idDevice
) + "] id = " + id
+ " sm=" + a
+ "" + b
;
204 string
Device::getName()
206 return getName(getDeviceId());
209 dim3
Device::getMaxGridDim(int idDevice
)
211 cudaDeviceProp prop
= getDeviceProp(idDevice
);
213 return dim3(prop
.maxGridSize
[0], prop
.maxGridSize
[1], prop
.maxGridSize
[2]);
216 dim3
Device::getMaxGridDim()
218 return getMaxGridDim(getDeviceId());
221 dim3
Device::getMaxBlockDim(int idDevice
)
223 cudaDeviceProp prop
= getDeviceProp(idDevice
);
225 return dim3(prop
.maxThreadsDim
[0], prop
.maxThreadsDim
[1], prop
.maxThreadsDim
[2]);
228 dim3
Device::getMaxBlockDim()
230 return getMaxBlockDim(getDeviceId());
233 int Device::getMaxThreadPerBlock(int idDevice
)
235 return getDeviceProp(idDevice
).maxThreadsPerBlock
;
238 int Device::getMaxThreadPerBlock()
240 return getMaxThreadPerBlock(getDeviceId());
243 int Device::getWarpSize(int idDevice
)
245 return getDeviceProp(idDevice
).warpSize
;
248 int Device::getWarpSize(void)
250 return getWarpSize(getDeviceId());
253 int Device::getMPCount(int idDevice
)
255 return getDeviceProp(idDevice
).multiProcessorCount
;
258 int Device::getMPCount(void)
260 return getMPCount(getDeviceId());
263 int Device::getCapacityMajor(int idDevice
)
265 return getDeviceProp(idDevice
).major
;
268 int Device::getCapacityMajor()
270 return getCapacityMajor(getDeviceId());
273 int Device::getCapacityMinor(int idDevice
)
275 return getDeviceProp(idDevice
).minor
;
278 int Device::getCapacityMinor()
280 return getCapacityMinor(getDeviceId());
283 int Device::getRuntimeVersion()
286 cudaRuntimeGetVersion(&version
);
289 // return CUDART_VERSION;
292 int Device::getDriverVersion(void)
295 cudaDriverGetVersion(&version
);
299 int Device::getAsyncEngineCount(int idDevice
)
301 return getDeviceProp(idDevice
).asyncEngineCount
;
304 int Device::getAsyncEngineCount()
306 return getAsyncEngineCount(getDeviceId());
313 bool Device::isCuda(void)
315 return getDeviceCount() >= 1;
318 bool Device::isFermi(int idDevice
)
320 int c
= getCapacityMajor(idDevice
);
322 return c
>= 2 && c
< 3;
325 bool Device::isFermi()
327 return isFermi(getDeviceId());
330 bool Device::isKepler(int idDevice
)
332 int c
= getCapacityMajor(idDevice
);
334 return c
>= 3 && c
< 4;
337 bool Device::isKepler()
339 return isKepler(getDeviceId());
345 bool Device::isUVAEnable(int idDevice
)
347 return getCapacityMajor() >= 2.0 && getRuntimeVersion() >= 4000;
353 bool Device::isUVAEnable()
355 return isUVAEnable(getDeviceId());
358 bool Device::isAtomicShareMemoryEnable(int idDevice
)
360 return (getCapacityMajor(idDevice
) == 1 && Device::getCapacityMinor(idDevice
) >= 2) || getCapacityMajor(idDevice
) >= 2;
363 bool Device::isAtomicShareMemoryEnable()
365 return isAtomicShareMemoryEnable(getDeviceId());
368 bool Device::isHostMapMemoryEnable(int idDevice
)
370 return getDeviceProp(idDevice
).canMapHostMemory
;
373 bool Device::isHostMapMemoryEnable()
375 return isHostMapMemoryEnable(getDeviceId());
378 bool Device::isECCEnable(int idDevice
)
380 return getDeviceProp(idDevice
).ECCEnabled
;
383 bool Device::isECCEnable(void)
385 return isECCEnable(getDeviceId());
388 bool Device::isAsyncEngine(int idDevice
)
390 return getDeviceProp(idDevice
).deviceOverlap
;
393 bool Device::isAsyncEngine(void)
395 return isAsyncEngine(getDeviceId());
398 /*--------------------------------------*\
400 \*-------------------------------------*/
402 void Device::print(int idDevice
)
404 cudaDeviceProp prop
= getDeviceProp(idDevice
);
407 cout
<< "===========================================" << endl
;
408 cout
<< " " << prop
.name
<< " : id=" << idDevice
<< " : sm=" << prop
.major
<< "" << prop
.minor
<< " :" << endl
;
409 cout
<< "===========================================" << endl
;
412 cout
<< "Device id : " << idDevice
<< endl
;
413 cout
<< "Name : " << prop
.name
<< endl
;
414 cout
<< "GPU capability : " << prop
.major
<< "." << prop
.minor
<< "" << endl
;
415 cout
<< "is Fermi : ";
416 coutBool(isFermi(idDevice
));
417 cout
<< "is Kepler : ";
418 coutBool(isKepler(idDevice
));
419 cout
<< "Clock rate : " << prop
.clockRate
/ 1000 << " MHZ" << endl
;
420 cout
<< "GPU integrated on MB : ";
421 cout01(prop
.integrated
);
422 // cout << "ComputeMode : " << prop.computeMode << endl;
425 cout
<< "Kernel : " << endl
;
426 cout
<< "Limit execution (timeout): ";
427 cout01(prop
.kernelExecTimeoutEnabled
);
430 cout
<< "Memory : " << endl
;
431 cout
<< "Global Memory : " << prop
.totalGlobalMem
/ 1024 / 1024 << " MB" << endl
;
432 cout
<< "Constant Memory : " << prop
.totalConstMem
/ 1024 << " KB" << endl
;
433 cout
<< "Texture1D max size : (" << prop
.maxTexture1D
<< ")" << endl
;
434 cout
<< "Texture2D max size : (" << prop
.maxTexture2D
[0] << "," << prop
.maxTexture2D
[1] << ")" << endl
;
435 cout
<< "Texture3D max size : (" << prop
.maxTexture3D
[0] << "," << prop
.maxTexture3D
[1] << "," << prop
.maxTexture3D
[2] << ")" << endl
;
436 //cout << "Texture2D Array max Size : (" << ptrProp.maxTexture2DArray[0] << "," << ptrProp.maxTexture2DArray[1] << "," << ptrProp.maxTexture2DArray[2] << ")"<< endl;
437 cout
<< "Texture Alignment : " << prop
.textureAlignment
<< " B" << endl
;
438 cout
<< "Max mem pitch : " << prop
.memPitch
<< endl
;
441 cout
<< "Multiprocesseur(MP) : " << endl
;
442 cout
<< "MP count : " << prop
.multiProcessorCount
<< endl
;
443 cout
<< "Shared memory per block : " << prop
.sharedMemPerBlock
/ 1024 << " KB " << endl
;
444 cout
<< "Register memory per block: " << prop
.regsPerBlock
/ 1024 << " KB " << endl
;
445 cout
<< "Max threads per block : " << prop
.maxThreadsPerBlock
<< endl
;
446 cout
<< "Max block dim : (" << prop
.maxThreadsDim
[0] << "," << prop
.maxThreadsDim
[1] << "," << prop
.maxThreadsDim
[2] << ")" << endl
;
447 cout
<< "Max grid dim : (" << prop
.maxGridSize
[0] << "," << prop
.maxGridSize
[1] << "," << prop
.maxGridSize
[2] << ")" << endl
;
448 cout
<< "Threads in warp : " << prop
.warpSize
<< endl
;
451 cout
<< "GPU Capacity : " << endl
;
452 cout
<< "MapHostMemory : ";
453 cout01(isHostMapMemoryEnable());
454 cout
<< "AtomicOperation sharedMemory : ";
455 cout01(isAtomicShareMemoryEnable());
457 cout
<< "UVA (Unified Virtual Addressing) : ";
458 cout01(isUVAEnable());
460 cout
<< "ECCEnabled : ";
461 cout01(prop
.ECCEnabled
);
463 cout
<< "Concurrent bidirectional copy (DMA only!) : ";
464 cout01(prop
.deviceOverlap
);
465 if (prop
.deviceOverlap
)
467 cout
<< "Concurrent bidirectional copy : host->device // device->host : " << getAsyncEngineCount(idDevice
) << endl
;
470 cout
<< "Concurrent Kernels (fermi 16x) : ";
471 cout01(prop
.concurrentKernels
);
474 if (Device::getDeviceCount() >= 2)
476 printP2PmatrixCompatibility();
479 cout
<< "Cuda Runtime version : " << getRuntimeVersion() << endl
;
480 cout
<< "Cuda Driver version : " << getDriverVersion() << endl
;
482 // cout << "=================== end ========================" << endl;
485 // cout << "===========================================" << endl;
486 // cout << "Cuda Runtime version : " << getRuntimeVersion() << endl;
487 // cout << "===========================================" << endl;
493 print(getDeviceId());
496 void Device::printAll()
498 cout
<< "\nList of all GPU available :" << endl
;
501 cout
<< endl
<< "Details :" << endl
;
502 int deviceCount
= getDeviceCount();
504 for (int id
= 0; id
< deviceCount
; id
++)
512 void Device::printAllSimple()
515 cout
<< "==============================================================" << endl
;
516 cout
<< "[CUDA] : List GPU Available : cuda version = " << getRuntimeVersion() <<endl
;
517 cout
<< "==============================================================" << endl
;
520 int current
=getDeviceId();
521 for (int id
= 0; id
< getDeviceCount(); id
++)
523 cout
<< getName(id
) ;
526 cout
<< " : [CURRENT]";
534 void Device::printCurrent()
536 cout
<< "==============================================================" << endl
;
537 cout
<< "[Cuda] : current device : " << getName() << endl
;
538 cout
<< "==============================================================" << endl
;
 * Linux : `nvidia-smi -pm 1` (persistence mode) — useful here? TODO
 * Note: does not work with OpenGL
549 void Device::loadCudaDriver(int deviceID
, bool isMapMemoryAsk
)
552 cout
<< "\nDevice(" << deviceID
<< ") : Load Driver ";
556 HANDLE_ERROR(cudaSetDevice(deviceID
));
558 if (isHostMapMemoryEnable() && isMapMemoryAsk
)
560 HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost
));
561 cout
<< "(HostMapMemory activate) : "<<flush
;
563 else if (!isHostMapMemoryEnable() && isMapMemoryAsk
)
565 cerr
<< "(HostMapMemory not enable) : "<<flush
;
568 HANDLE_ERROR(cudaMalloc((void** ) &ptrBidon
, sizeof(int)));
569 HANDLE_ERROR(cudaFree(ptrBidon
));
572 cout
<< chrono
.getDeltaTime() << " (s)" << endl
<<endl
;
575 void Device::loadCudaDriver(bool isMapMemoryEnable
)
577 loadCudaDriver(getDeviceId(), isMapMemoryEnable
);
580 void Device::loadCudaDriverAll(bool isMapMemoryEnable
)
582 cout
<< "\nLoad Cuda Driver : start ..." << endl
;
585 int k
= Device::getDeviceCount();
586 //omp_set_num_threads(k);
587 //#pragma omp parallel for
588 for (int i
= 0; i
< k
; i
++)
590 loadCudaDriver(i
, isMapMemoryEnable
);
594 cout
<< "Load Cuda Driver : end : " << chrono
.getDeltaTime() << " (s)\n" << endl
;
601 void Device::printP2PmatrixCompatibility()
603 int* matrixP2PCompatibility
= p2pMatrixCompatibility();
604 int* ptrMatrixP2PCompatibility
= matrixP2PCompatibility
;
606 int n
= Device::getDeviceCount();
607 cout
<< "P2P compatibility : symetric matrix (" << n
<< "x" << n
<< "):" << endl
;
608 for (int i
= 0; i
< n
; i
++)
610 for (int j
= 0; j
< n
; j
++)
614 cout
<< *ptrMatrixP2PCompatibility
<< " ";
621 ptrMatrixP2PCompatibility
++;
627 delete[] matrixP2PCompatibility
;
630 void Device::p2pEnableALL()
632 int n
= Device::getDeviceCount();
634 int* matrixP2PCompatibility
= p2pMatrixCompatibility();
635 int* ptrMatrixP2PCompatibility
= matrixP2PCompatibility
;
639 cout
<< "P2P enable : symetric matrix (" << n
<< "x" << n
<< "):" << endl
;
642 for (int i
= 0; i
< n
; i
++)
644 for (int j
= 0; j
< n
; j
++)
648 cout
<< *ptrMatrixP2PCompatibility
<< " ";
655 if (*ptrMatrixP2PCompatibility
)
658 HANDLE_ERROR(cudaSetDevice(i
));
659 HANDLE_ERROR(cudaDeviceEnablePeerAccess(j
, flaginutile
));
661 ptrMatrixP2PCompatibility
++;
665 delete[] matrixP2PCompatibility
;
668 int* Device::p2pMatrixCompatibility()
670 int n
= Device::getDeviceCount();
672 int* matrixP2PCompatibility
= new int[n
* n
];
673 int* ptrMatrixP2PCompatibility
= matrixP2PCompatibility
;
675 for (int i
= 0; i
< n
; i
++)
677 for (int j
= 0; j
< n
; j
++)
679 int isP2PAutorized01
;
680 HANDLE_ERROR(cudaDeviceCanAccessPeer(&isP2PAutorized01
, i
, j
));
682 *ptrMatrixP2PCompatibility
++ = isP2PAutorized01
;
686 return matrixP2PCompatibility
;
689 /*--------------------------------------*\
691 \*-------------------------------------*/
693 void cout01(int isTrue
)
696 cout
<< "True" << endl
;
698 cout
<< "False" << endl
;
701 void coutBool(bool isFlag
)
704 cout
<< "True" << endl
;
706 cout
<< "False" << endl
;
709 /*----------------------------------------------------------------------*\
711 \*---------------------------------------------------------------------*/