Ajout de l'ensemble du workspace.
[GPU.git] / WCudaMSE / BilatTools_Cuda / src / core / cudatools / cpp / Device.cpp
1 #include <iostream>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <assert.h>
5
6 #include "Device.h"
7 #include "cudaTools.h"
8 #include "Chronos.h"
9 #include "omp.h"
10 #include "StringTools.h"
11
12 using std::string;
13 using std::cout;
14 using std::cerr;
15 using std::endl;
16 using std::flush;
17
18 /*----------------------------------------------------------------------*\
19 |* Declaration *|
20 \*---------------------------------------------------------------------*/
21
22 /*--------------------------------------*\
23 |* Public *|
24 \*-------------------------------------*/
25
26 /*--------------------------------------*\
27 |* Private *|
28 \*-------------------------------------*/
29
// File-local helpers (defined at the bottom of this file).
static void cout01(int isTrue);    // prints "True"/"False" (plus endl) from an int flag
static void coutBool(bool isFlag); // prints "True"/"False" (plus endl) from a bool flag
static int dim(const dim3& dim);   // NOTE(review): no file-local definition exists; Device::dim is what is defined/used below — confirm this prototype is still needed
33
34 /*----------------------------------------------------------------------*\
35 |* Implementation *|
36 \*---------------------------------------------------------------------*/
37
38 /*--------------------------------------*\
39 |* Public *|
40 \*-------------------------------------*/
41
42 /*--------------*\
43 |* wrapper *|
44 \*-------------*/
45
/**
 * Blocks the host thread until the device has finished all previously
 * submitted work. Thin wrapper around cudaDeviceSynchronize(); the returned
 * error code is not checked here (use checkKernelError for diagnostics).
 */
void Device::synchronize(void)
{
    cudaDeviceSynchronize();
}
50
51 /*--------------*\
52 |* Tools *|
53 \*-------------*/
54
55 void Device::checkKernelError(const char *message)
56 {
57 cudaError_t error = cudaGetLastError();
58 if (error != cudaSuccess)
59 {
60 if (message != NULL)
61 {
62 fprintf(stderr, "\n[CUDA ERROR] : Kernel Execution Failed : %s: %s\n\n", message, cudaGetErrorString(error));
63 }
64 else
65 {
66 fprintf(stderr, "\n[CUDA ERROR] : Kernel Execution Failed : %s\n\n", cudaGetErrorString(error));
67 }
68 exit (EXIT_FAILURE);
69 }
70 }
71
/**
 * Validates a kernel launch configuration: first hard-asserts the hardware
 * limits (assertDim), then prints non-fatal performance hints
 * (checkDimOptimiser).
 *
 * @param dg grid dimensions
 * @param db block dimensions
 */
void Device::checkDimError(const dim3& dg, const dim3& db)
{
    assertDim(dg, db);
    checkDimOptimiser(dg, db);
}
77
78 void Device::checkDimOptimiser(const dim3& dg, const dim3& db)
79 {
80 int mpCount = Device::getMPCount();
81 int warpSize = Device::getWarpSize();
82
83 int nbBlock = dim(dg);
84 int nbThreadBlock = dim(db);
85
86 // grid
87 if (nbBlock < mpCount || nbBlock % mpCount != 0)
88 {
89 string messageGrid = "nbBlock = " + StringTools::toString(nbBlock) + " : Heuristic : For best performance, for nbBlock, try a multiple of #MP="
90 + StringTools::toString(mpCount);
91
92 cout << "dg" << "(" << dg.x << "," << dg.y << "," << dg.z << ") " << messageGrid << endl;
93 }
94
95 // block
96 if (nbThreadBlock < warpSize || nbThreadBlock % warpSize != 0) // TODO kepler
97 {
98 string messageBlock = "nbThreadBlock = " + StringTools::toString(nbThreadBlock)
99 + " : Heuristic : For best performance, for nbThreadBlock, try a multiple of |warp|=" + StringTools::toString(warpSize);
100
101 cout << "db" << "(" << db.x << "," << db.y << "," << db.z << ") " << messageBlock << endl;
102 }
103 }
104
105 void Device::assertDim(const dim3& dg, const dim3& db)
106 {
107 // grid
108 {
109 dim3 dimGridMax = Device::getMaxGridDim();
110
111 assert(dg.x <= dimGridMax.x);
112 assert(dg.y <= dimGridMax.y);
113 assert(dg.z <= dimGridMax.z);
114 }
115
116 // block
117 {
118 dim3 dimBlockMax = Device::getMaxBlockDim();
119
120 assert(db.x <= dimBlockMax.x);
121 assert(db.y <= dimBlockMax.y);
122 assert(db.z <= dimBlockMax.z);
123 }
124
125 // Thread per block
126 assert(dim(db) <= getMaxThreadPerBlock());
127 }
128
129 void Device::print(const dim3& dg, const dim3& db)
130 {
131 cout << "dg" << "(" << dg.x << "," << dg.y << "," << dg.z << ") " << endl;
132 cout << "db" << "(" << db.x << "," << db.y << "," << db.z << ") " << endl;
133 }
134
135 int Device::dim(const dim3& dim)
136 {
137 return dim.x * dim.y * dim.z;
138 }
139
140 int Device::nbThread(const dim3& dg, const dim3& db)
141 {
142 return dim(dg) * dim(db);
143 }
144
145 /*--------------*\
146 |* get *|
147 \*-------------*/
148
149 int Device::getDeviceId(void)
150 {
151 int deviceId;
152 HANDLE_ERROR(cudaGetDevice(&deviceId));
153
154 return deviceId;
155 }
156
157 int Device::getDeviceCount(void)
158 {
159 int nbDevice;
160 HANDLE_ERROR(cudaGetDeviceCount(&nbDevice));
161
162 return nbDevice;
163 }
164
165 cudaDeviceProp Device::getDeviceProp(int idDevice)
166 {
167 cudaDeviceProp prop;
168 HANDLE_ERROR(cudaGetDeviceProperties(&prop, idDevice));
169 return prop;
170 }
171
172 cudaDeviceProp Device::getDeviceProp(void)
173 {
174 return getDeviceProp(getDeviceId());
175 }
176
177 /*--------------------------------------*\
178 |* Secondaire *|
179 \*-------------------------------------*/
180
181 /*--------------*\
182 |* get *|
183 \*-------------*/
184
185 string Device::getNameSimple(int idDevice)
186 {
187 return getDeviceProp(idDevice).name;
188 }
189
190 string Device::getNameSimple()
191 {
192 return getNameSimple(getDeviceId());
193 }
194
195 string Device::getName(int idDevice)
196 {
197 string id = StringTools::toString(idDevice);
198 string a = StringTools::toString(getCapacityMajor(idDevice));
199 string b = StringTools::toString(getCapacityMinor(idDevice));
200
201 return "[" + getNameSimple(idDevice) + "] id = " + id + " sm=" + a + "" + b;
202 }
203
204 string Device::getName()
205 {
206 return getName(getDeviceId());
207 }
208
209 dim3 Device::getMaxGridDim(int idDevice)
210 {
211 cudaDeviceProp prop = getDeviceProp(idDevice);
212
213 return dim3(prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
214 }
215
216 dim3 Device::getMaxGridDim()
217 {
218 return getMaxGridDim(getDeviceId());
219 }
220
221 dim3 Device::getMaxBlockDim(int idDevice)
222 {
223 cudaDeviceProp prop = getDeviceProp(idDevice);
224
225 return dim3(prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
226 }
227
228 dim3 Device::getMaxBlockDim()
229 {
230 return getMaxBlockDim(getDeviceId());
231 }
232
233 int Device::getMaxThreadPerBlock(int idDevice)
234 {
235 return getDeviceProp(idDevice).maxThreadsPerBlock;
236 }
237
238 int Device::getMaxThreadPerBlock()
239 {
240 return getMaxThreadPerBlock(getDeviceId());
241 }
242
243 int Device::getWarpSize(int idDevice)
244 {
245 return getDeviceProp(idDevice).warpSize;
246 }
247
248 int Device::getWarpSize(void)
249 {
250 return getWarpSize(getDeviceId());
251 }
252
253 int Device::getMPCount(int idDevice)
254 {
255 return getDeviceProp(idDevice).multiProcessorCount;
256 }
257
258 int Device::getMPCount(void)
259 {
260 return getMPCount(getDeviceId());
261 }
262
263 int Device::getCapacityMajor(int idDevice)
264 {
265 return getDeviceProp(idDevice).major;
266 }
267
268 int Device::getCapacityMajor()
269 {
270 return getCapacityMajor(getDeviceId());
271 }
272
273 int Device::getCapacityMinor(int idDevice)
274 {
275 return getDeviceProp(idDevice).minor;
276 }
277
278 int Device::getCapacityMinor()
279 {
280 return getCapacityMinor(getDeviceId());
281 }
282
/**
 * Version of the CUDA runtime library (e.g. 4000 for CUDA 4.0).
 * The call's return code is not checked; the -1 sentinel is returned only
 * if cudaRuntimeGetVersion does not overwrite it.
 */
int Device::getRuntimeVersion()
{
    int version = -1;
    cudaRuntimeGetVersion(&version);
    return version;

    // return CUDART_VERSION;
}

/**
 * Version of the installed CUDA driver. Same -1 sentinel convention as
 * getRuntimeVersion (return code unchecked).
 */
int Device::getDriverVersion(void)
{
    int version = -1;
    cudaDriverGetVersion(&version);
    return version;
}
298
299 int Device::getAsyncEngineCount(int idDevice)
300 {
301 return getDeviceProp(idDevice).asyncEngineCount;
302 }
303
304 int Device::getAsyncEngineCount()
305 {
306 return getAsyncEngineCount(getDeviceId());
307 }
308
309 /*--------------*\
310 |* is *|
311 \*-------------*/
312
313 bool Device::isCuda(void)
314 {
315 return getDeviceCount() >= 1;
316 }
317
318 bool Device::isFermi(int idDevice)
319 {
320 int c = getCapacityMajor(idDevice);
321
322 return c >= 2 && c < 3;
323 }
324
325 bool Device::isFermi()
326 {
327 return isFermi(getDeviceId());
328 }
329
330 bool Device::isKepler(int idDevice)
331 {
332 int c = getCapacityMajor(idDevice);
333
334 return c >= 3 && c < 4;
335 }
336
337 bool Device::isKepler()
338 {
339 return isKepler(getDeviceId());
340 }
341
342 /**
343 * 64 bits only
344 */
345 bool Device::isUVAEnable(int idDevice)
346 {
347 return getCapacityMajor() >= 2.0 && getRuntimeVersion() >= 4000;
348 }
349
350 /**
351 * 64 bits only
352 */
353 bool Device::isUVAEnable()
354 {
355 return isUVAEnable(getDeviceId());
356 }
357
358 bool Device::isAtomicShareMemoryEnable(int idDevice)
359 {
360 return (getCapacityMajor(idDevice) == 1 && Device::getCapacityMinor(idDevice) >= 2) || getCapacityMajor(idDevice) >= 2;
361 }
362
363 bool Device::isAtomicShareMemoryEnable()
364 {
365 return isAtomicShareMemoryEnable(getDeviceId());
366 }
367
368 bool Device::isHostMapMemoryEnable(int idDevice)
369 {
370 return getDeviceProp(idDevice).canMapHostMemory;
371 }
372
373 bool Device::isHostMapMemoryEnable()
374 {
375 return isHostMapMemoryEnable(getDeviceId());
376 }
377
378 bool Device::isECCEnable(int idDevice)
379 {
380 return getDeviceProp(idDevice).ECCEnabled;
381 }
382
383 bool Device::isECCEnable(void)
384 {
385 return isECCEnable(getDeviceId());
386 }
387
388 bool Device::isAsyncEngine(int idDevice)
389 {
390 return getDeviceProp(idDevice).deviceOverlap;
391 }
392
393 bool Device::isAsyncEngine(void)
394 {
395 return isAsyncEngine(getDeviceId());
396 }
397
398 /*--------------------------------------*\
399 |* Print *|
400 \*-------------------------------------*/
401
/**
 * Prints a detailed human-readable report of the given device: identity,
 * kernel limits, memory sizes, multiprocessor characteristics, capability
 * flags, P2P compatibility (multi-GPU only) and CUDA versions.
 *
 * @param idDevice id of the device to describe
 */
void Device::print(int idDevice)
{
    cudaDeviceProp prop = getDeviceProp(idDevice);

    // banner
    cout << endl;
    cout << "===========================================" << endl;
    cout << " " << prop.name << " : id=" << idDevice << " : sm=" << prop.major << "" << prop.minor << " :" << endl;
    cout << "===========================================" << endl;

    // identity
    cout << endl;
    cout << "Device id : " << idDevice << endl;
    cout << "Name : " << prop.name << endl;
    cout << "GPU capability : " << prop.major << "." << prop.minor << "" << endl;
    cout << "is Fermi : ";
    coutBool(isFermi(idDevice));
    cout << "is Kepler : ";
    coutBool(isKepler(idDevice));
    cout << "Clock rate : " << prop.clockRate / 1000 << " MHZ" << endl; // clockRate is reported in kHz
    cout << "GPU integrated on MB : ";
    cout01(prop.integrated);
    // cout << "ComputeMode : " << prop.computeMode << endl;

    // kernel limits
    cout << endl;
    cout << "Kernel : " << endl;
    cout << "Limit execution (timeout): ";
    cout01(prop.kernelExecTimeoutEnabled);

    // memory sizes
    cout << endl;
    cout << "Memory : " << endl;
    cout << "Global Memory : " << prop.totalGlobalMem / 1024 / 1024 << " MB" << endl;
    cout << "Constant Memory : " << prop.totalConstMem / 1024 << " KB" << endl;
    cout << "Texture1D max size : (" << prop.maxTexture1D << ")" << endl;
    cout << "Texture2D max size : (" << prop.maxTexture2D[0] << "," << prop.maxTexture2D[1] << ")" << endl;
    cout << "Texture3D max size : (" << prop.maxTexture3D[0] << "," << prop.maxTexture3D[1] << "," << prop.maxTexture3D[2] << ")" << endl;
    //cout << "Texture2D Array max Size : (" << ptrProp.maxTexture2DArray[0] << "," << ptrProp.maxTexture2DArray[1] << "," << ptrProp.maxTexture2DArray[2] << ")"<< endl;
    cout << "Texture Alignment : " << prop.textureAlignment << " B" << endl;
    cout << "Max mem pitch : " << prop.memPitch << endl;

    // multiprocessor characteristics
    cout << endl;
    cout << "Multiprocesseur(MP) : " << endl;
    cout << "MP count : " << prop.multiProcessorCount << endl;
    cout << "Shared memory per block : " << prop.sharedMemPerBlock / 1024 << " KB " << endl;
    // NOTE(review): regsPerBlock counts 32-bit registers, not bytes — the "KB" label looks wrong; confirm
    cout << "Register memory per block: " << prop.regsPerBlock / 1024 << " KB " << endl;
    cout << "Max threads per block : " << prop.maxThreadsPerBlock << endl;
    cout << "Max block dim : (" << prop.maxThreadsDim[0] << "," << prop.maxThreadsDim[1] << "," << prop.maxThreadsDim[2] << ")" << endl;
    cout << "Max grid dim : (" << prop.maxGridSize[0] << "," << prop.maxGridSize[1] << "," << prop.maxGridSize[2] << ")" << endl;
    cout << "Threads in warp : " << prop.warpSize << endl;

    // capability flags
    // NOTE(review): the four helpers below query the *current* device, not
    // idDevice — results are wrong when idDevice != getDeviceId(); confirm intent
    cout << endl;
    cout << "GPU Capacity : " << endl;
    cout << "MapHostMemory : ";
    cout01(isHostMapMemoryEnable());
    cout << "AtomicOperation sharedMemory : ";
    cout01(isAtomicShareMemoryEnable());

    cout << "UVA (Unified Virtual Addressing) : ";
    cout01(isUVAEnable());

    cout << "ECCEnabled : ";
    cout01(prop.ECCEnabled);

    cout << "Concurrent bidirectional copy (DMA only!) : ";
    cout01(prop.deviceOverlap);
    if (prop.deviceOverlap)
    {
        cout << "Concurrent bidirectional copy : host->device // device->host : " << getAsyncEngineCount(idDevice) << endl;
    }

    cout << "Concurrent Kernels (fermi 16x) : ";
    cout01(prop.concurrentKernels);

    // P2P matrix only makes sense with at least two GPUs
    cout << endl;
    if (Device::getDeviceCount() >= 2)
    {
        printP2PmatrixCompatibility();
    }

    cout << "Cuda Runtime version : " << getRuntimeVersion() << endl;
    cout << "Cuda Driver version : " << getDriverVersion() << endl;
    cout << endl;
    // cout << "=================== end ========================" << endl;

    // cout << endl;
    // cout << "===========================================" << endl;
    // cout << "Cuda Runtime version : " << getRuntimeVersion() << endl;
    // cout << "===========================================" << endl;
    // cout << endl;
}
490
/**
 * Prints the full detailed report for the current device.
 */
void Device::print()
{
    print(getDeviceId());
}
495
496 void Device::printAll()
497 {
498 cout << "\nList of all GPU available :" << endl;
499 printAllSimple();
500
501 cout << endl << "Details :" << endl;
502 int deviceCount = getDeviceCount();
503
504 for (int id = 0; id < deviceCount; id++)
505 {
506 print(id);
507 }
508
509 printCurrent();
510 }
511
512 void Device::printAllSimple()
513 {
514 cout << endl;
515 cout << "==============================================================" << endl;
516 cout << "[CUDA] : List GPU Available : cuda version = " << getRuntimeVersion() <<endl;
517 cout << "==============================================================" << endl;
518 cout << endl;
519
520 int current=getDeviceId();
521 for (int id = 0; id < getDeviceCount(); id++)
522 {
523 cout << getName(id) ;
524 if(id == current)
525 {
526 cout<< " : [CURRENT]";
527 }
528 cout << endl;
529 }
530
531 cout << endl;
532 }
533
/**
 * Prints a banner with the decorated name of the current device.
 */
void Device::printCurrent()
{
    cout << "==============================================================" << endl;
    cout << "[Cuda] : current device : " << getName() << endl;
    cout << "==============================================================" << endl;
}
540
541 /*--------------*\
542 |* load *|
543 \*-------------*/
544
/**
 * Selects the device and forces its driver/context initialisation with a
 * dummy allocation, so the slow first-use cost is paid here rather than at
 * the first real kernel call. Prints the time taken.
 *
 * Linux : nvidia-smi -pm 1 useful? TODO
 * does not work for opengl
 *
 * @param deviceID       id of the device to initialise
 * @param isMapMemoryAsk if true, try to enable host-mapped (zero-copy) memory;
 *                       this must happen before the context is created
 */
void Device::loadCudaDriver(int deviceID, bool isMapMemoryAsk)
{
    Chronos chrono; // presumably starts timing at construction — confirm against Chronos
    cout << "\nDevice(" << deviceID << ") : Load Driver ";

    int* ptrBidon;

    HANDLE_ERROR(cudaSetDevice(deviceID));

    // cudaSetDeviceFlags must be issued before the context exists,
    // i.e. before the cudaMalloc below; the helpers query the device
    // just selected above
    if (isHostMapMemoryEnable() && isMapMemoryAsk)
    {
        HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
        cout << "(HostMapMemory activate) : " << flush;
    }
    else if (!isHostMapMemoryEnable() && isMapMemoryAsk)
    {
        cerr << "(HostMapMemory not enable) : " << flush;
    }

    // dummy allocate/free round-trip: forces the context creation now
    HANDLE_ERROR(cudaMalloc((void** ) &ptrBidon, sizeof(int)));
    HANDLE_ERROR(cudaFree(ptrBidon));

    chrono.stop();
    cout << chrono.getDeltaTime() << " (s)" << endl << endl;
}
574
/**
 * Same as loadCudaDriver(int, bool), applied to the current device.
 */
void Device::loadCudaDriver(bool isMapMemoryEnable)
{
    loadCudaDriver(getDeviceId(), isMapMemoryEnable);
}
579
580 void Device::loadCudaDriverAll(bool isMapMemoryEnable)
581 {
582 cout << "\nLoad Cuda Driver : start ..." << endl;
583 Chronos chrono;
584
585 int k = Device::getDeviceCount();
586 //omp_set_num_threads(k);
587 //#pragma omp parallel for
588 for (int i = 0; i < k; i++)
589 {
590 loadCudaDriver(i, isMapMemoryEnable);
591 }
592
593 chrono.stop();
594 cout << "Load Cuda Driver : end : " << chrono.getDeltaTime() << " (s)\n" << endl;
595 }
596
597 /*--------------*\
598 |* p2p *|
599 \*-------------*/
600
601 void Device::printP2PmatrixCompatibility()
602 {
603 int* matrixP2PCompatibility = p2pMatrixCompatibility();
604 int* ptrMatrixP2PCompatibility = matrixP2PCompatibility;
605
606 int n = Device::getDeviceCount();
607 cout << "P2P compatibility : symetric matrix (" << n << "x" << n << "):" << endl;
608 for (int i = 0; i < n; i++)
609 {
610 for (int j = 0; j < n; j++)
611 {
612 if (i != j)
613 {
614 cout << *ptrMatrixP2PCompatibility << " ";
615 }
616 else
617 {
618 cout << " ";
619 }
620
621 ptrMatrixP2PCompatibility++;
622 }
623 cout << endl;
624 }
625 cout << endl;
626
627 delete[] matrixP2PCompatibility;
628 }
629
630 void Device::p2pEnableALL()
631 {
632 int n = Device::getDeviceCount();
633
634 int* matrixP2PCompatibility = p2pMatrixCompatibility();
635 int* ptrMatrixP2PCompatibility = matrixP2PCompatibility;
636
637 if (n >= 2)
638 {
639 cout << "P2P enable : symetric matrix (" << n << "x" << n << "):" << endl;
640 }
641
642 for (int i = 0; i < n; i++)
643 {
644 for (int j = 0; j < n; j++)
645 {
646 if (i != j)
647 {
648 cout << *ptrMatrixP2PCompatibility << " ";
649 }
650 else
651 {
652 cout << " ";
653 }
654
655 if (*ptrMatrixP2PCompatibility)
656 {
657 int flaginutile = 0;
658 HANDLE_ERROR(cudaSetDevice(i));
659 HANDLE_ERROR(cudaDeviceEnablePeerAccess(j, flaginutile));
660 }
661 ptrMatrixP2PCompatibility++;
662 }
663 }
664
665 delete[] matrixP2PCompatibility;
666 }
667
668 int* Device::p2pMatrixCompatibility()
669 {
670 int n = Device::getDeviceCount();
671
672 int* matrixP2PCompatibility = new int[n * n];
673 int* ptrMatrixP2PCompatibility = matrixP2PCompatibility;
674
675 for (int i = 0; i < n; i++)
676 {
677 for (int j = 0; j < n; j++)
678 {
679 int isP2PAutorized01;
680 HANDLE_ERROR(cudaDeviceCanAccessPeer(&isP2PAutorized01, i, j));
681
682 *ptrMatrixP2PCompatibility++ = isP2PAutorized01;
683 }
684 }
685
686 return matrixP2PCompatibility;
687 }
688
689 /*--------------------------------------*\
690 |* Private *|
691 \*-------------------------------------*/
692
693 void cout01(int isTrue)
694 {
695 if (isTrue)
696 cout << "True" << endl;
697 else
698 cout << "False" << endl;
699 }
700
701 void coutBool(bool isFlag)
702 {
703 if (isFlag)
704 cout << "True" << endl;
705 else
706 cout << "False" << endl;
707 }
708
709 /*----------------------------------------------------------------------*\
710 |* End *|
711 \*---------------------------------------------------------------------*/
712