WCudaMSE/Student_Cuda/src/cpp/core/04_MonteCarlo/MonteCarlo.cu

   1 #include "MonteCarlo.h"
   2
   3 #include <iostream>
   4 #include <cmath>
   5 #include <stdio.h>
   6 using namespace std;
   7
   8 #include <curand_kernel.h>
   9
  10 #include "Indice1D.h"
  11 #include "cudaTools.h"
  12 #include "Device.h"
  13 #include "Lock.h"
  14
  15 // Paramètres pour le calcul de pi.
  16 const float X_MIN = 0;
  17 const float X_MAX = 1;
  18 const float M = 1;
  19
  20 /*
  21  * 1) Chaque thread calcule un résultat intermediaire qu'il va ensuite placer en shared memory.
  22  * n: Nombre d'échantillon.
  23  */
  24 __device__
  25 void mc_reductionIntraThread(int n, uint* tabSM, curandState* tabGeneratorThread)
  26     {
  27     const int NB_THREAD = Indice1D::nbThread();
  28     const int TID = Indice1D::tid();
  29     const int TID_LOCAL = Indice1D::tidLocal();
  30
  31     curandState& localState = tabGeneratorThread[TID];
  32
  33     uint threadResult = 0;
  34     int s = TID;
  35     while (s < n)
  36         {
  37         // Tire un point aléatoire entre (0,0) et (1,1), voir X_MIN, X_MAX et M.
  38         const float x = curand_uniform(&localState);
  39         const float y = curand_uniform(&localState);
  40         const float fx = sqrtf(1 - x * x);
  41
  42         // Si y est sous la courbe alors on le comptabilise.
  43         if (y < fx)
  44             threadResult += 1;
  45
  46         s += NB_THREAD;
  47         }
  48
  49     tabSM[TID_LOCAL] = threadResult;
  50     }
  51
  52 /*
  53  * Combine les résultats de 'tabSM' dans 'tabSM[0]'
  54  */
  55 __device__
  56 void mc_combine(uint* tabSM, int middle)
  57     {
  58     const int TID_LOCAL = Indice1D::tidLocal();
  59     const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
  60
  61     int s = TID_LOCAL;
  62     while (s < middle)
  63         {
  64         tabSM[s] += tabSM[s + middle];
  65         s += NB_THREAD_LOCAL;
  66         }
  67     }
  68
  69 /*
  70  * 2) La shared memory est réduite, le résultat est placé dans 'tabSM[0]'.
  71  */
  72 __device__
  73 void mc_reductionIntraBlock(uint* tabSM)
  74     {
  75     const int TAB_SIZE = blockDim.x;
  76     int middle = TAB_SIZE / 2;
  77
  78     while (middle > 0)
  79         {
  80         mc_combine(tabSM, middle);
  81         middle /= 2;
  82         __syncthreads(); // Synchronisation des threads au niveau du bloc.
  83         }
  84     }
  85
  86 /*
  87  * 3) Le 'tabSM[0]' de chaque bloc est reduit dans 'ptrResult' qui se trouve en global memory.
  88  */
  89 __device__
  90 void mc_reductionInterBlock(uint* tabSM, uint* ptrResult)
  91     {
  92     const int TID_LOCAL = Indice1D::tidLocal();
  93     if (TID_LOCAL == 0)
  94         {
  95         atomicAdd(ptrResult, tabSM[0]);
  96         }
  97     }
  98
  99 /**
 100  * La taille de la shared memory (en terme de # de sizeof(uint)) doit
 101  * être égal à la taille des blocs.
 102  * n: le nombre de point à tiré aléatoirement.
 103  * ptrResult: Le resultat du calcul.
 104  * tabGeneratorThread: les generateurs aléatoires
 105  */
 106 __global__
 107 void monteCarlo(int n,  uint* ptrResult, curandState* tabGeneratorThread)
 108     {
 109     extern __shared__ uint tabSM[]; // Dynamic shared memory.
 110
 111     // 1) Réduction intra-thread.
 112     mc_reductionIntraThread(n, tabSM, tabGeneratorThread);
 113
 114     __syncthreads();
 115
 116     // 2) Réduction intra-block.
 117     mc_reductionIntraBlock(tabSM);
 118
 119     // 3) Réduction inter-block.
 120     mc_reductionInterBlock(tabSM, ptrResult);
 121     }
 122
 123 __global__
 124 void setupKernelRand(curandState* tabGeneratorThread, int deviceId)
 125     {
 126     int tid = Indice1D::tid();
 127     int deltaSeed = deviceId * INT_MAX;
 128     int deltaSequence = deviceId * 100;
 129     int deltaOffset = deviceId * 100;
 130
 131     int seed = 1234 + deltaSeed;
 132     int sequenceNumber = tid + deltaSequence;
 133     int offset = deltaOffset;
 134     curand_init(seed, sequenceNumber, offset, &tabGeneratorThread[tid]);
 135     }
 136
 137 bool monteCarlo()
 138     {
 139     cout << endl << "monteCarlo() ..." << endl;
 140
 141     // Nombre de point total tiré aléatoirement.
 142     const int N = 1000000;
 143
 144     // Allocation coté GPU en global memory (GM).
 145     uint* ptrDevResult;
 146     HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(uint)));
 147     HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(uint)));
 148
 149     // Paramètre de l'appel de la fonction sur le device.
 150     const dim3 dg(256, 1, 1);
 151     const dim3 db(256, 1, 1);
 152     Device::assertDim(dg, db);
 153     const size_t SMSize = db.x * sizeof(uint); // 256 uint;
 154
 155     // Gestion des générateurs aléatoires (un par thread).
 156     curandState* tabGeneratorThread;
 157     HANDLE_ERROR(cudaMalloc(&tabGeneratorThread, db.x * dg.x * sizeof(curandState)));
 158     setupKernelRand<<<dg, db>>>(tabGeneratorThread, 0); // Device 0 (à utilise si utilisation en MP).
 159     cudaDeviceSynchronize();
 160
 161     monteCarlo<<<dg, db, SMSize>>>(N, ptrDevResult, tabGeneratorThread);
 162
 163     // cudaDeviceSynchronize(); // Utilisé pour flusher les prints sur le stdout à partir du device (debug).
 164
 165     uint nb;
 166     // Barrière implicite de synchronisation ('cudaMemCpy').
 167     HANDLE_ERROR(cudaMemcpy(&nb, ptrDevResult, sizeof(uint), cudaMemcpyDeviceToHost));
 168
 169     double integrale = (double)nb / (double)N * (X_MAX - X_MIN) * M;
 170     double pi = 4 * integrale;
 171
 172     cout.precision(20);
 173     cout << "Approximation de PI : " << pi << endl;
 174
 175     return true;
 176     }