10 #include "StringTools.h"
18 /*----------------------------------------------------------------------*\
20 \*---------------------------------------------------------------------*/
22 /*--------------------------------------*\
24 \*-------------------------------------*/
26 /*--------------------------------------*\
28 \*-------------------------------------*/
30 static void cout01(int isTrue
);
31 static void coutBool(bool isFlag
);
32 static int dim(const dim3
& dim
);
34 /*----------------------------------------------------------------------*\
36 \*---------------------------------------------------------------------*/
38 /*--------------------------------------*\
40 \*-------------------------------------*/
46 void Device::synchronize(void)
48 cudaDeviceSynchronize();
55 void Device::checkKernelError(const char *message
)
57 cudaError_t error
= cudaGetLastError();
58 if (error
!= cudaSuccess
)
62 fprintf(stderr
, "\n[CUDA ERROR] : Kernel Execution Failed : %s: %s\n\n", message
, cudaGetErrorString(error
));
66 fprintf(stderr
, "\n[CUDA ERROR] : Kernel Execution Failed : %s\n\n", cudaGetErrorString(error
));
72 void Device::checkDimError(const dim3
& dg
, const dim3
& db
)
75 checkDimOptimiser(dg
, db
);
78 void Device::checkDimOptimiser(const dim3
& dg
, const dim3
& db
)
80 int mpCount
= Device::getMPCount();
81 int warpSize
= Device::getWarpSize();
83 int nbBlock
= dim(dg
);
84 int nbThreadBlock
= dim(db
);
87 if (nbBlock
< mpCount
|| nbBlock
% mpCount
!= 0)
89 string messageGrid
= "nbBlock = " + StringTools::toString(nbBlock
) + " : Heuristic : For best performance, for nbBlock, try a multiple of #MP="
90 + StringTools::toString(mpCount
);
92 cout
<< "dg" << "(" << dg
.x
<< "," << dg
.y
<< "," << dg
.z
<< ") " << messageGrid
<< endl
;
96 if (nbThreadBlock
< warpSize
|| nbThreadBlock
% warpSize
!= 0) // TODO kepler
98 string messageBlock
= "nbThreadBlock = " + StringTools::toString(nbThreadBlock
)
99 + " : Heuristic : For best performance, for nbThreadBlock, try a multiple of |warp|=" + StringTools::toString(warpSize
);
101 cout
<< "db" << "(" << db
.x
<< "," << db
.y
<< "," << db
.z
<< ") " << messageBlock
<< endl
;
105 void Device::assertDim(const dim3
& dg
, const dim3
& db
)
109 dim3 dimGridMax
= Device::getMaxGridDim();
111 assert(dg
.x
<= dimGridMax
.x
);
112 assert(dg
.y
<= dimGridMax
.y
);
113 assert(dg
.z
<= dimGridMax
.z
);
118 dim3 dimBlockMax
= Device::getMaxBlockDim();
120 assert(db
.x
<= dimBlockMax
.x
);
121 assert(db
.y
<= dimBlockMax
.y
);
122 assert(db
.z
<= dimBlockMax
.z
);
126 assert(dim(db
) <= getMaxThreadPerBlock());
129 void Device::print(const dim3
& dg
, const dim3
& db
)
131 cout
<< "dg" << "(" << dg
.x
<< "," << dg
.y
<< "," << dg
.z
<< ") " << endl
;
132 cout
<< "db" << "(" << db
.x
<< "," << db
.y
<< "," << db
.z
<< ") " << endl
;
135 int Device::dim(const dim3
& dim
)
137 return dim
.x
* dim
.y
* dim
.z
;
140 int Device::nbThread(const dim3
& dg
, const dim3
& db
)
142 return dim(dg
) * dim(db
);
149 int Device::getDeviceId(void)
152 HANDLE_ERROR(cudaGetDevice(&deviceId
));
157 int Device::getDeviceCount(void)
160 HANDLE_ERROR(cudaGetDeviceCount(&nbDevice
));
165 cudaDeviceProp
Device::getDeviceProp(int idDevice
)
168 HANDLE_ERROR(cudaGetDeviceProperties(&prop
, idDevice
));
172 cudaDeviceProp
Device::getDeviceProp(void)
174 return getDeviceProp(getDeviceId());
177 /*--------------------------------------*\
179 \*-------------------------------------*/
185 string
Device::getNameSimple(int idDevice
)
187 return getDeviceProp(idDevice
).name
;
190 string
Device::getNameSimple()
192 return getNameSimple(getDeviceId());
195 string
Device::getName(int idDevice
)
197 string id
= StringTools::toString(idDevice
);
198 string a
= StringTools::toString(getCapacityMajor(idDevice
));
199 string b
= StringTools::toString(getCapacityMinor(idDevice
));
201 return "[" + getNameSimple(idDevice
) + "] id = " + id
+ " sm=" + a
+ "" + b
;
204 string
Device::getName()
206 return getName(getDeviceId());
209 dim3
Device::getMaxGridDim(int idDevice
)
211 cudaDeviceProp prop
= getDeviceProp(idDevice
);
213 return dim3(prop
.maxGridSize
[0], prop
.maxGridSize
[1], prop
.maxGridSize
[2]);
216 dim3
Device::getMaxGridDim()
218 return getMaxGridDim(getDeviceId());
221 dim3
Device::getMaxBlockDim(int idDevice
)
223 cudaDeviceProp prop
= getDeviceProp(idDevice
);
225 return dim3(prop
.maxThreadsDim
[0], prop
.maxThreadsDim
[1], prop
.maxThreadsDim
[2]);
228 dim3
Device::getMaxBlockDim()
230 return getMaxBlockDim(getDeviceId());
233 int Device::getMaxThreadPerBlock(int idDevice
)
235 return getDeviceProp(idDevice
).maxThreadsPerBlock
;
238 int Device::getMaxThreadPerBlock()
240 return getMaxThreadPerBlock(getDeviceId());
243 int Device::getWarpSize(int idDevice
)
245 return getDeviceProp(idDevice
).warpSize
;
248 int Device::getWarpSize(void)
250 return getWarpSize(getDeviceId());
253 int Device::getMPCount(int idDevice
)
255 return getDeviceProp(idDevice
).multiProcessorCount
;
258 int Device::getMPCount(void)
260 return getMPCount(getDeviceId());
263 int Device::getCapacityMajor(int idDevice
)
265 return getDeviceProp(idDevice
).major
;
268 int Device::getCapacityMajor()
270 return getCapacityMajor(getDeviceId());
273 int Device::getCapacityMinor(int idDevice
)
275 return getDeviceProp(idDevice
).minor
;
278 int Device::getCapacityMinor()
280 return getCapacityMinor(getDeviceId());
283 int Device::getRuntimeVersion()
286 cudaRuntimeGetVersion(&version
);
289 // return CUDART_VERSION;
292 int Device::getDriverVersion(void)
295 cudaDriverGetVersion(&version
);
299 int Device::getAsyncEngineCount(int idDevice
)
301 return getDeviceProp(idDevice
).asyncEngineCount
;
304 int Device::getAsyncEngineCount()
306 return getAsyncEngineCount(getDeviceId());
313 bool Device::isCuda(void)
315 return getDeviceCount() >= 1;
318 bool Device::isFermi(int idDevice
)
320 int c
= getCapacityMajor(idDevice
);
322 return c
>= 2 && c
< 3;
325 bool Device::isFermi()
327 return isFermi(getDeviceId());
330 bool Device::isKepler(int idDevice
)
332 int c
= getCapacityMajor(idDevice
);
334 return c
>= 3 && c
< 4;
337 bool Device::isKepler()
339 return isKepler(getDeviceId());
345 bool Device::isUVAEnable(int idDevice
)
347 return getCapacityMajor() >= 2.0 && getRuntimeVersion() >= 4000;
353 bool Device::isUVAEnable()
355 return isUVAEnable(getDeviceId());
358 bool Device::isAtomicShareMemoryEnable(int idDevice
)
360 return (getCapacityMajor(idDevice
) == 1 && Device::getCapacityMinor(idDevice
) >= 2) || getCapacityMajor(idDevice
) >= 2;
363 bool Device::isAtomicShareMemoryEnable()
365 return isAtomicShareMemoryEnable(getDeviceId());
368 bool Device::isHostMapMemoryEnable(int idDevice
)
370 return getDeviceProp(idDevice
).canMapHostMemory
;
373 bool Device::isHostMapMemoryEnable()
375 return isHostMapMemoryEnable(getDeviceId());
378 bool Device::isECCEnable(int idDevice
)
380 return getDeviceProp(idDevice
).ECCEnabled
;
383 bool Device::isECCEnable(void)
385 return isECCEnable(getDeviceId());
388 bool Device::isAsyncEngine(int idDevice
)
390 return getDeviceProp(idDevice
).deviceOverlap
;
393 bool Device::isAsyncEngine(void)
395 return isAsyncEngine(getDeviceId());
398 /*--------------------------------------*\
400 \*-------------------------------------*/
402 void Device::print(int idDevice
)
404 cudaDeviceProp prop
= getDeviceProp(idDevice
);
407 cout
<< "===========================================" << endl
;
408 cout
<< " " << prop
.name
<< " : id=" << idDevice
<< " : sm=" << prop
.major
<< "" << prop
.minor
<< " :" << endl
;
409 cout
<< "===========================================" << endl
;
412 cout
<< "Device id : " << idDevice
<< endl
;
413 cout
<< "Name : " << prop
.name
<< endl
;
414 cout
<< "GPU capability : " << prop
.major
<< "." << prop
.minor
<< "" << endl
;
415 cout
<< "is Fermi : ";
416 coutBool(isFermi(idDevice
));
417 cout
<< "is Kepler : ";
418 coutBool(isKepler(idDevice
));
419 cout
<< "Clock rate : " << prop
.clockRate
/ 1000 << " MHZ" << endl
;
420 cout
<< "GPU integrated on MB : ";
421 cout01(prop
.integrated
);
422 // cout << "ComputeMode : " << prop.computeMode << endl;
425 cout
<< "Kernel : " << endl
;
426 cout
<< "Limit execution (timeout): ";
427 cout01(prop
.kernelExecTimeoutEnabled
);
430 cout
<< "Memory : " << endl
;
431 cout
<< "Global Memory : " << prop
.totalGlobalMem
/ 1024 / 1024 << " MB" << endl
;
432 cout
<< "Constant Memory : " << prop
.totalConstMem
/ 1024 << " KB" << endl
;
433 cout
<< "Texture1D max size : (" << prop
.maxTexture1D
<< ")" << endl
;
434 cout
<< "Texture2D max size : (" << prop
.maxTexture2D
[0] << "," << prop
.maxTexture2D
[1] << ")" << endl
;
435 cout
<< "Texture3D max size : (" << prop
.maxTexture3D
[0] << "," << prop
.maxTexture3D
[1] << "," << prop
.maxTexture3D
[2] << ")" << endl
;
436 //cout << "Texture2D Array max Size : (" << ptrProp.maxTexture2DArray[0] << "," << ptrProp.maxTexture2DArray[1] << "," << ptrProp.maxTexture2DArray[2] << ")"<< endl;
437 cout
<< "Texture Alignment : " << prop
.textureAlignment
<< " B" << endl
;
438 cout
<< "Max mem pitch : " << prop
.memPitch
<< endl
;
441 cout
<< "Multiprocesseur(MP) : " << endl
;
442 cout
<< "MP count : " << prop
.multiProcessorCount
<< endl
;
443 cout
<< "Shared memory per block : " << prop
.sharedMemPerBlock
/ 1024 << " KB " << endl
;
444 cout
<< "Register memory per block: " << prop
.regsPerBlock
/ 1024 << " KB " << endl
;
445 cout
<< "Max threads per block : " << prop
.maxThreadsPerBlock
<< endl
;
446 cout
<< "Max block dim : (" << prop
.maxThreadsDim
[0] << "," << prop
.maxThreadsDim
[1] << "," << prop
.maxThreadsDim
[2] << ")" << endl
;
447 cout
<< "Max grid dim : (" << prop
.maxGridSize
[0] << "," << prop
.maxGridSize
[1] << "," << prop
.maxGridSize
[2] << ")" << endl
;
448 cout
<< "Threads in warp : " << prop
.warpSize
<< endl
;
451 cout
<< "GPU Capacity : " << endl
;
452 cout
<< "MapHostMemory : ";
453 cout01(isHostMapMemoryEnable());
454 cout
<< "AtomicOperation sharedMemory : ";
455 cout01(isAtomicShareMemoryEnable());
457 cout
<< "UVA (Unified Virtual Addressing) : ";
458 cout01(isUVAEnable());
460 cout
<< "ECCEnabled : ";
461 cout01(prop
.ECCEnabled
);
463 cout
<< "Concurrent bidirectional copy (DMA only!) : ";
464 cout01(prop
.deviceOverlap
);
465 if (prop
.deviceOverlap
)
467 cout
<< "Concurrent bidirectional copy : host->device // device->host : " << getAsyncEngineCount(idDevice
) << endl
;
470 cout
<< "Concurrent Kernels (fermi 16x) : ";
471 cout01(prop
.concurrentKernels
);
474 if (Device::getDeviceCount() >= 2)
476 printP2PmatrixCompatibility();
479 cout
<< "Cuda Runtime version : " << getRuntimeVersion() << endl
;
480 cout
<< "Cuda Driver version : " << getDriverVersion() << endl
;
482 // cout << "=================== end ========================" << endl;
485 // cout << "===========================================" << endl;
486 // cout << "Cuda Runtime version : " << getRuntimeVersion() << endl;
487 // cout << "===========================================" << endl;
493 print(getDeviceId());
496 void Device::printAll()
498 cout
<< "\nList of all GPU available :" << endl
;
501 cout
<< endl
<< "Details :" << endl
;
502 int deviceCount
= getDeviceCount();
504 for (int id
= 0; id
< deviceCount
; id
++)
512 void Device::printAllSimple()
515 cout
<< "==============================================================" << endl
;
516 cout
<< "[CUDA] : List GPU Available : cuda version = " << getRuntimeVersion() <<endl
;
517 cout
<< "==============================================================" << endl
;
520 int current
=getDeviceId();
521 for (int id
= 0; id
< getDeviceCount(); id
++)
523 cout
<< getName(id
) ;
526 cout
<< " : [CURRENT]";
534 void Device::printCurrent()
536 cout
<< "==============================================================" << endl
;
537 cout
<< "[Cuda] : current device : " << getName() << endl
;
538 cout
<< "==============================================================" << endl
;
 * Linux : `nvidia-smi -pm 1` (persistence mode) — useful here? TODO
 * Note: does not work with OpenGL
549 void Device::loadCudaDriver(int deviceID
, bool isMapMemoryAsk
)
552 cout
<< "\nDevice(" << deviceID
<< ") : Load Driver ";
556 HANDLE_ERROR(cudaSetDevice(deviceID
));
558 if (isHostMapMemoryEnable() && isMapMemoryAsk
)
560 HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost
));
561 cout
<< "(HostMapMemory activate) : "<<flush
;
563 else if (!isHostMapMemoryEnable() && isMapMemoryAsk
)
565 cerr
<< "(HostMapMemory not enable) : "<<flush
;
568 HANDLE_ERROR(cudaMalloc((void** ) &ptrBidon
, sizeof(int)));
569 HANDLE_ERROR(cudaFree(ptrBidon
));
572 cout
<< chrono
.getDeltaTime() << " (s)" << endl
<<endl
;
575 void Device::loadCudaDriver(bool isMapMemoryEnable
)
577 loadCudaDriver(getDeviceId(), isMapMemoryEnable
);
580 void Device::loadCudaDriverAll(bool isMapMemoryEnable
)
582 cout
<< "\nLoad Cuda Driver : start ..." << endl
;
585 int k
= Device::getDeviceCount();
586 //omp_set_num_threads(k);
587 //#pragma omp parallel for
588 for (int i
= 0; i
< k
; i
++)
590 loadCudaDriver(i
, isMapMemoryEnable
);
594 cout
<< "Load Cuda Driver : end : " << chrono
.getDeltaTime() << " (s)\n" << endl
;
601 void Device::printP2PmatrixCompatibility()
603 int* matrixP2PCompatibility
= p2pMatrixCompatibility();
604 int* ptrMatrixP2PCompatibility
= matrixP2PCompatibility
;
606 int n
= Device::getDeviceCount();
607 cout
<< "P2P compatibility : symetric matrix (" << n
<< "x" << n
<< "):" << endl
;
608 for (int i
= 0; i
< n
; i
++)
610 for (int j
= 0; j
< n
; j
++)
614 cout
<< *ptrMatrixP2PCompatibility
<< " ";
621 ptrMatrixP2PCompatibility
++;
627 delete[] matrixP2PCompatibility
;
630 void Device::p2pEnableALL()
632 int n
= Device::getDeviceCount();
634 int* matrixP2PCompatibility
= p2pMatrixCompatibility();
635 int* ptrMatrixP2PCompatibility
= matrixP2PCompatibility
;
639 cout
<< "P2P enable : symetric matrix (" << n
<< "x" << n
<< "):" << endl
;
642 for (int i
= 0; i
< n
; i
++)
644 for (int j
= 0; j
< n
; j
++)
648 cout
<< *ptrMatrixP2PCompatibility
<< " ";
655 if (*ptrMatrixP2PCompatibility
)
658 HANDLE_ERROR(cudaSetDevice(i
));
659 HANDLE_ERROR(cudaDeviceEnablePeerAccess(j
, flaginutile
));
661 ptrMatrixP2PCompatibility
++;
665 delete[] matrixP2PCompatibility
;
668 int* Device::p2pMatrixCompatibility()
670 int n
= Device::getDeviceCount();
672 int* matrixP2PCompatibility
= new int[n
* n
];
673 int* ptrMatrixP2PCompatibility
= matrixP2PCompatibility
;
675 for (int i
= 0; i
< n
; i
++)
677 for (int j
= 0; j
< n
; j
++)
679 int isP2PAutorized01
;
680 HANDLE_ERROR(cudaDeviceCanAccessPeer(&isP2PAutorized01
, i
, j
));
682 *ptrMatrixP2PCompatibility
++ = isP2PAutorized01
;
686 return matrixP2PCompatibility
;
689 /*--------------------------------------*\
691 \*-------------------------------------*/
693 void cout01(int isTrue
)
696 cout
<< "True" << endl
;
698 cout
<< "False" << endl
;
701 void coutBool(bool isFlag
)
704 cout
<< "True" << endl
;
706 cout
<< "False" << endl
;
709 /*----------------------------------------------------------------------*\
711 \*---------------------------------------------------------------------*/