WCudaMSE/Student_Cuda/src/cpp/core/01c_Saucisson/Saucisson.cu

   1 #include "Saucisson.h"
   2
   3 #include <iostream>
   4 #include <cmath>
   5 #include <stdio.h>
   6 using namespace std;
   7
   8 #include "Indice1D.h"
   9 #include "cudaTools.h"
  10 #include "Device.h"
  11 #include "Lock.h"
  12
  13 /*
  14  * 1) Chaque thread calcule un résultat intermediaire qu'il va ensuite placer en shared memory.
  15  * n: Nombre d'échantillon.
  16  */
  17 __device__
  18 void reductionIntraThread(int n, float deltaX, float* tabSM)
  19     {
  20     const int NB_THREAD = Indice1D::nbThread();
  21     const int TID = Indice1D::tid();
  22     const int TID_LOCAL = Indice1D::tidLocal();
  23
  24     float threadResult = 0.0;
  25     int s = TID;
  26     while (s < n)
  27         {
  28         const float i = s + 1;
  29         const float xi = -1 + i * deltaX;
  30         threadResult += sqrtf(1-xi*xi);
  31
  32         s += NB_THREAD;
  33         }
  34
  35     tabSM[TID_LOCAL] = threadResult;
  36     }
  37
  38 /*
  39  * Combine les résultats de 'tabSM' dans 'tabSM[0]'
  40  */
  41 __device__
  42 void combine(float* tabSM, int middle)
  43     {
  44     const int TID_LOCAL = Indice1D::tidLocal();
  45     const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
  46
  47     int s = TID_LOCAL;
  48     while (s < middle)
  49         {
  50         tabSM[s] += tabSM[s + middle];
  51         s += NB_THREAD_LOCAL;
  52         }
  53     }
  54
  55 /*
  56  * 2) La shared memory est réduite, le résultat est placé dans 'tabSM[0]'.
  57  */
  58 __device__
  59 void reductionIntraBlock(float* tabSM)
  60     {
  61     const int TAB_SIZE = blockDim.x;
  62     int middle = TAB_SIZE / 2;
  63
  64     while (middle > 0)
  65         {
  66         combine(tabSM, middle);
  67         middle /= 2;
  68         __syncthreads(); // Synchronisation des threads au niveau du bloc.
  69         }
  70     }
  71
  72 /*
  73  * 3) Le 'tabSM[0]' de chaque bloc est reduit dans 'ptrResult' qui se trouve en global memory.
  74  */
  75 __device__
  76 void reductionInterBlock(float* tabSM, float* ptrResult)
  77     {
  78     const int TID_LOCAL = Indice1D::tidLocal();
  79     if (TID_LOCAL == 0)
  80         {
  81         atomicAdd(ptrResult, float(tabSM[0]));
  82         }
  83     }
  84
  85 /**
  86  * La taille de la shared memory (en terme de # de sizeof(float)) doit
  87  * être égal à la taille des blocs.
  88  * n: le nombre d'échantillon
  89  * ptrResult: Le resultat du calcul de pi.
  90  */
  91 __global__
  92 void saucisson(int n, float deltaX, float* ptrResult)
  93     {
  94     extern __shared__ float tabSM[]; // Dynamic shared memory.
  95
  96     // 1) Réduction intra-thread.
  97     reductionIntraThread(n, deltaX, tabSM);
  98
  99     __syncthreads();
 100
 101     // 2) Réduction intra-block.
 102     reductionIntraBlock(tabSM);
 103
 104     // 3) Réduction inter-block.
 105     reductionInterBlock(tabSM, ptrResult);
 106     }
 107
 108 bool saucisson()
 109     {
 110     cout << "saucisson() ..." << endl;
 111
 112     // Nombre d'échantillon. Au-delà, la qualité du résultat n'est pas meilleure. Il faudrait employé des doubles à la place de floats.
 113     const int N = 100000;
 114
 115     // Allocation coté GPU en global memory (GM).
 116     float* ptrDevResult = 0;
 117     HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(float)));
 118     HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(float)));
 119
 120     // Paramètre de l'appel de la fonction sur le device.
 121     const dim3 dg(256, 1, 1);
 122     const dim3 db(256, 1, 1);
 123     Device::assertDim(dg, db);
 124     const size_t SMSize = db.x * sizeof(float); // 256 floats;
 125
 126     const float deltaX = 2.0f / N;
 127     saucisson<<<dg, db, SMSize>>>(N, deltaX, ptrDevResult);
 128
 129     // cudaDeviceSynchronize(); // Utilisé pour flusher les prints sur le stdout à partir du device (debug).
 130
 131     float pi;
 132     // Barrière implicite de synchronisation ('cudaMemCpy').
 133     HANDLE_ERROR(cudaMemcpy(&pi, ptrDevResult, sizeof(float), cudaMemcpyDeviceToHost));
 134     pi *= 2 * deltaX;
 135
 136     cout.precision(20);
 137     cout << "Approximation de PI : " << pi << endl;
 138
 139     return true;
 140     }