WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.cu

   1 #include "Histogramme.h"
   2
   3 #include <iostream>
   4 #include <cmath>
   5 #include <cstdlib>
   6 #include <stdio.h>
   7 using namespace std;
   8
   9 #include "Indice1D.h"
  10 #include "cudaTools.h"
  11 #include "Device.h"
  12 #include "Lock.h"
  13 #include "Chronos.h"
  14
  15 /**
  16  * 0) Initialisation de la mémoire partagée.
  17  */
  18 __device__
  19 void initSM(uint* tabSM)
  20     {
  21     const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
  22     const int TID_LOCAL = Indice1D::tidLocal();
  23
  24     int s = TID_LOCAL;
  25     while (s < 256)
  26         {
  27         tabSM[s] = 0;
  28         s += NB_THREAD_LOCAL;
  29         }
  30     }
  31
  32 /*
  33  * 1) Chaque thread calcule un résultat intermediaire qu'il va ensuite placer en shared memory.
  34  */
  35 __device__
  36 void reductionIntraThread(int n, uchar* ptrInput, uint* tabSM)
  37     {
  38     const int NB_THREAD = Indice1D::nbThread();
  39     const int TID = Indice1D::tid();
  40     const int TID_LOCAL = Indice1D::tidLocal();
  41
  42     int s = TID;
  43     while (s < n)
  44         {
  45         atomicAdd(&tabSM[ptrInput[s]], 1);
  46         s += NB_THREAD;
  47         }
  48     }
  49
  50 /*
  51  * 2) Copie les résultats de block vers la global memory.
  52  */
  53 __device__
  54 void reductionInterBlock(uint* tabSM, uint* ptrResult)
  55     {
  56     const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
  57     const int TID_LOCAL = Indice1D::tidLocal();
  58
  59     // Un maximum de 256 threads par block s'occupe de la copie SM -> GM.
  60     int s = TID_LOCAL;
  61     while (s < 256)
  62         {
  63         atomicAdd(&ptrResult[s], tabSM[s]);
  64         s += NB_THREAD_LOCAL;
  65         }
  66
  67     // Un seul thread par block s'occupe de la copie SM -> GM.
  68     /* Cette version ne présente pas d'amélioration en terme de temps.
  69     if (TID_LOCAL == 0)
  70         {
  71         for (int i = 0; i < 256; i++)
  72             atomicAdd(&ptrResult[i], tabSM[i]);
  73         }*/
  74     }
  75
  76 /**
  77  * La taille de la shared memory (en terme de # de sizeof(float)) doit
  78  * être égal à la taille des blocs.
  79  * n: le nombre d'échantillon (taille du tableau d'entrée)
  80  * ptrInput: les échantillons.
  81  * ptrResult: Le resultat à peupler.
  82  */
  83 __global__
  84 void histogramme(int n, uchar* ptrInput, uint* ptrResult)
  85     {
  86     extern __shared__ uint tabSM[]; // Dynamic shared memory.
  87
  88     // 0) Initialisation de la shared memory de manière parallèle.
  89     initSM(tabSM);
  90
  91     __syncthreads();
  92
  93     // 1) Réduction intra-thread.
  94     reductionIntraThread(n, ptrInput, tabSM);
  95
  96     __syncthreads();
  97
  98     // 2) Réduction inter-block.
  99     reductionInterBlock(tabSM, ptrResult);
 100     }
 101
 102 uchar* createTabInput(int size)
 103     {
 104     uchar* tab = new uchar[size];
 105
 106     // Génération de la suite.
 107     for (int i = 1; i <= size; ++i)
 108         tab[i-1] = i % 256;
 109
 110     // Permutations aléatoires.
 111     srand(time(NULL));
 112     for (int i = 0; i < size; ++i)
 113         {
 114         int i1 = rand() % size; // Il y a un biais mais ça n'a pas d'importance ici.
 115         int i2 = rand() % size;
 116         if (i1 != i2)
 117             swap(tab[i1], tab[i2]);
 118         }
 119
 120     return tab;
 121     }
 122
 123 bool histogramme()
 124     {
 125     cout << "hisogramme() ..." << endl;
 126
 127     Chronos chronos;
 128     chronos.start();
 129
 130     // Taille du tableau en input, doit être un multiple de 256.
 131     int N = 100000 * 256;
 132     uchar* tabInput = createTabInput(N);
 133
 134     // Allocation coté GPU en global memory (GM) et copie du tableau.
 135     uchar* ptrDevInput;
 136     HANDLE_ERROR(cudaMalloc(&ptrDevInput, N * sizeof(uchar)));
 137     HANDLE_ERROR(cudaMemcpy(ptrDevInput, tabInput, N * sizeof(uchar), cudaMemcpyHostToDevice));
 138
 139     uint* ptrDevResult;
 140     HANDLE_ERROR(cudaMalloc(&ptrDevResult, 256 * sizeof(uint)));
 141     HANDLE_ERROR(cudaMemset(ptrDevResult, 0, 256 * sizeof(uint)));
 142
 143     // Paramètre de l'appel de la fonction sur le device.
 144     const dim3 dg(256, 1, 1);
 145     const dim3 db(256, 1, 1);
 146     Device::assertDim(dg, db);
 147     const size_t SMSize = db.x * sizeof(uint); // 256 uint;
 148
 149     histogramme<<<dg, db, SMSize>>>(N, ptrDevInput, ptrDevResult);
 150
 151     cudaDeviceSynchronize(); // Utilisé pour flusher les prints sur le stdout à partir du device (debug).
 152
 153     // Barrière implicite de synchronisation ('cudaMemCpy').
 154     uint* ptrResult = new uint[256];
 155     HANDLE_ERROR(cudaMemcpy(ptrResult, ptrDevResult, 256 * sizeof(uint), cudaMemcpyDeviceToHost));
 156
 157     uint expected = N / 256;
 158     for (int i = 0; i < 256; ++i)
 159         {
 160         if (ptrResult[i] != expected)
 161             {
 162             cout << "Wrong result: " << ptrResult[i] << ", expected: " << expected << endl;
 163             return false;
 164             }
 165         }
 166     cout << "OK, all values equal the exptected value: " << expected << endl;
 167
 168     chronos.stop();
 169     cout << "Time: " << chronos << endl;
 170
 171     return true;
 172     }