From ee885ed84f2ff3d5fb1e7ac41fa3c8879314ee36 Mon Sep 17 00:00:00 2001
From: gburri <gregory.burri@master.hes-so.ch>
Date: Mon, 12 Jan 2015 20:39:28 +0100
Subject: [PATCH] TP Histogramme.

---
 .../src/cpp/core/01b_AddVector/AddVector.cu   |   2 +
 .../src/cpp/core/01c_Saucisson/Saucisson.cu   |   2 +
 .../02_ProduitScalaire/ProduitScalaire.cu     |   2 +
 .../cpp/core/03_Histogramme/Histogramme.cu    | 172 ++++++++++++++++++
 .../src/cpp/core/03_Histogramme/Histogramme.h |   6 +
 .../Student_Cuda/src/cpp/core/mainCore.cpp    |  10 +-
 6 files changed, 190 insertions(+), 4 deletions(-)
 create mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.cu
 create mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.h
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu b/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu
index 1a5b6eb..02f0cc8 100644
--- a/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu
@@ -31,6 +31,8 @@ __device__ float work(float v1, float v2)
 
 bool addVectors()
     {
+    cout << "addVectors() ..." << endl;
+
     // Inputs (passÃ© en paramÃ¨tre de la fonction dans un cas gÃ©nÃ©ral).
     float v1[] = { 1, 2, 3 };
     float v2[] = { 10, 20, 30 };
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/01c_Saucisson/Saucisson.cu b/WCudaMSE/Student_Cuda/src/cpp/core/01c_Saucisson/Saucisson.cu
index c4175be..3ec1cd3 100644
--- a/WCudaMSE/Student_Cuda/src/cpp/core/01c_Saucisson/Saucisson.cu
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/01c_Saucisson/Saucisson.cu
@@ -107,6 +107,8 @@ void saucisson(int n, float deltaX, float* ptrResult)
 
 bool saucisson()
     {
+    cout << "saucisson() ..." << endl;
+
     // Nombre d'Ã©chantillon. Au-delÃ , la qualitÃ© du rÃ©sultat n'est pas meilleure. Il faudrait employÃ© des doubles Ã  la place de floats.
     const int N = 100000;
 
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
index 27c3a55..429659b 100644
--- a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
@@ -163,6 +163,8 @@ double resultatTheorique(long n)
 
 bool produitScalaire()
     {
+    cout << "produitScalaire() ..." << endl;
+
     const int N = 10000000; // Taille des deux vecteurs : 10 * 10^6.
 
     // Allocation cotÃ© GPU en global memory (GM).
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.cu b/WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.cu
new file mode 100644
index 0000000..07bc577
--- /dev/null
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.cu
@@ -0,0 +1,172 @@
+#include "Histogramme.h"
+
+#include <iostream>
+#include <cmath>
+#include <cstdlib>
+#include <stdio.h>
+using namespace std;
+
+#include "Indice1D.h"
+#include "cudaTools.h"
+#include "Device.h"
+#include "Lock.h"
+#include "Chronos.h"
+
+/**
+ * 0) Initialisation de la mÃ©moire partagÃ©e.
+ */
+__device__
+void initSM(uint* tabSM)
+    {
+    const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
+    const int TID_LOCAL = Indice1D::tidLocal();
+
+    int s = TID_LOCAL;
+    while (s < 256)
+        {
+        tabSM[s] = 0;
+        s += NB_THREAD_LOCAL;
+        }
+    }
+
+/*
+ * 1) Chaque thread calcule un rÃ©sultat intermediaire qu'il va ensuite placer en shared memory.
+ */
+__device__
+void reductionIntraThread(int n, uchar* ptrInput, uint* tabSM)
+    {
+    const int NB_THREAD = Indice1D::nbThread();
+    const int TID = Indice1D::tid();
+    const int TID_LOCAL = Indice1D::tidLocal();
+
+    int s = TID;
+    while (s < n)
+        {
+        atomicAdd(&tabSM[ptrInput[s]], 1);
+        s += NB_THREAD;
+        }
+    }
+
+/*
+ * 2) Copie les rÃ©sultats de block vers la global memory.
+ */
+__device__
+void reductionInterBlock(uint* tabSM, uint* ptrResult)
+    {
+    const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
+    const int TID_LOCAL = Indice1D::tidLocal();
+
+    // Un maximum de 256 threads par block s'occupe de la copie SM -> GM.
+    int s = TID_LOCAL;
+    while (s < 256)
+        {
+        atomicAdd(&ptrResult[s], tabSM[s]);
+        s += NB_THREAD_LOCAL;
+        }
+
+    // Un seul thread par block s'occupe de la copie SM -> GM.
+    /* Cette version ne prÃ©sente pas d'amÃ©lioration en terme de temps.
+    if (TID_LOCAL == 0)
+        {
+        for (int i = 0; i < 256; i++)
+            atomicAdd(&ptrResult[i], tabSM[i]);
+        }*/
+    }
+
+/**
+ * La taille de la shared memory (en terme de # de sizeof(float)) doit
+ * Ãªtre Ã©gal Ã  la taille des blocs.
+ * n: le nombre d'Ã©chantillon (taille du tableau d'entrÃ©e)
+ * ptrInput: les Ã©chantillons.
+ * ptrResult: Le resultat Ã  peupler.
+ */
+__global__
+void histogramme(int n, uchar* ptrInput, uint* ptrResult)
+    {
+    extern __shared__ uint tabSM[]; // Dynamic shared memory.
+
+    // 0) Initialisation de la shared memory de maniÃ¨re parallÃ¨le.
+    initSM(tabSM);
+
+    __syncthreads();
+
+    // 1) RÃ©duction intra-thread.
+    reductionIntraThread(n, ptrInput, tabSM);
+
+    __syncthreads();
+
+    // 2) RÃ©duction inter-block.
+    reductionInterBlock(tabSM, ptrResult);
+    }
+
+uchar* createTabInput(int size)
+    {
+    uchar* tab = new uchar[size];
+
+    // GÃ©nÃ©ration de la suite.
+    for (int i = 1; i <= size; ++i)
+        tab[i-1] = i % 256;
+
+    // Permutations alÃ©atoires.
+    srand(time(NULL));
+    for (int i = 0; i < size; ++i)
+        {
+        int i1 = rand() % size; // Il y a un biais mais Ã§a n'a pas d'importance ici.
+        int i2 = rand() % size;
+        if (i1 != i2)
+            swap(tab[i1], tab[i2]);
+        }
+
+    return tab;
+    }
+
+bool histogramme()
+    {
+    cout << "hisogramme() ..." << endl;
+
+    Chronos chronos;
+    chronos.start();
+
+    // Taille du tableau en input, doit Ãªtre un multiple de 256.
+    int N = 100000 * 256;
+    uchar* tabInput = createTabInput(N);
+
+    // Allocation cotÃ© GPU en global memory (GM) et copie du tableau.
+    uchar* ptrDevInput;
+    HANDLE_ERROR(cudaMalloc(&ptrDevInput, N * sizeof(uchar)));
+    HANDLE_ERROR(cudaMemcpy(ptrDevInput, tabInput, N * sizeof(uchar), cudaMemcpyHostToDevice));
+
+    uint* ptrDevResult;
+    HANDLE_ERROR(cudaMalloc(&ptrDevResult, 256 * sizeof(uint)));
+    HANDLE_ERROR(cudaMemset(ptrDevResult, 0, 256 * sizeof(uint)));
+
+    // ParamÃ¨tre de l'appel de la fonction sur le device.
+    const dim3 dg(256, 1, 1);
+    const dim3 db(256, 1, 1);
+    Device::assertDim(dg, db);
+    const size_t SMSize = db.x * sizeof(uint); // 256 uint;
+
+    histogramme<<<dg, db, SMSize>>>(N, ptrDevInput, ptrDevResult);
+
+    cudaDeviceSynchronize(); // UtilisÃ© pour flusher les prints sur le stdout Ã  partir du device (debug).
+
+    // BarriÃ¨re implicite de synchronisation ('cudaMemCpy').
+    uint* ptrResult = new uint[256];
+    HANDLE_ERROR(cudaMemcpy(ptrResult, ptrDevResult, 256 * sizeof(uint), cudaMemcpyDeviceToHost));
+
+    uint expected = N / 256;
+    for (int i = 0; i < 256; ++i)
+        {
+        if (ptrResult[i] != expected)
+            {
+            cout << "Wrong result: " << ptrResult[i] << ", expected: " << expected << endl;
+            return false;
+            }
+        }
+    cout << "OK, all values equal the exptected value: " << expected << endl;
+
+    chronos.stop();
+    cout << "Time: " << chronos << endl;
+
+    return true;
+    }
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.h b/WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.h
new file mode 100644
index 0000000..2d6566f
--- /dev/null
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/03_Histogramme/Histogramme.h
@@ -0,0 +1,6 @@
+#ifndef HISTOGRAMME_H
+#define HISTOGRAMME_H
+
+bool histogramme();
+
+#endif
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
index be769f8..2b24911 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
@@ -4,6 +4,7 @@ using namespace std;
 
 #include "01c_Saucisson/Saucisson.h"
 #include "02_ProduitScalaire/ProduitScalaire.h"
+#include "03_Histogramme/Histogramme.h"
 
 extern bool useHello();
 extern bool addVectors();
@@ -11,10 +12,11 @@ extern bool addVectors();
 int mainCore()
     {
     bool isOk = true;
-    /*isOk &= useHello();
-    isOk &= addVectors();
-    isOk &= produitScalaire();*/
-    isOk &= saucisson();
+    //isOk &= useHello();
+    //isOk &= addVectors();
+    //isOk &= produitScalaire();
+    //isOk &= saucisson();
+    //isOk &= histogramme();
 
     cout << "\nisOK = " << isOk << endl;
     cout << "\nEnd : mainCore" << endl;
-- 
2.50.0