From 19015d26dfb874d075516772ef531ee5e42fa213 Mon Sep 17 00:00:00 2001
From: gburri <gregory.burri@master.hes-so.ch>
Date: Sat, 6 Dec 2014 10:38:19 +0100
Subject: [PATCH] Labo "Produit Scalaire".

---
 .../header/device/synchronisation/Lock.h      |  18 +--
 .../src/cpp/core/01b_AddVector/AddVector.cu   |  65 +++++++++++
 .../02_ProduitScalaire/ProduitScalaire.cu     | 108 ++++++++++++------
 .../core/02_ProduitScalaire/ProduitScalaire.h |   6 +
 .../Student_Cuda/src/cpp/core/mainCore.cpp    |   5 +-
 5 files changed, 155 insertions(+), 47 deletions(-)
 create mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu
 create mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.h
diff --git a/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/synchronisation/Lock.h b/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/synchronisation/Lock.h
index 4c6787d..97534a0 100755
--- a/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/synchronisation/Lock.h
+++ b/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/synchronisation/Lock.h
@@ -20,19 +20,19 @@
  * 	The competing threads must then wait until the owner has written a 0 to the mutex beforfe they can attempt to modify the locked memory.
  *
  * Interet
- * 	On utilise un lock lorsque l'opération qui doit être synchroniser ne possède pas d'opérateur atomic (comme atomicADD, ...),
- * 	ou lorsqui'il s'agit de plusieurs opérations à synchroniser (joue alors le role de section critique)
+ * 	On utilise un lock lorsque l'opï¿½ration qui doit ï¿½tre synchroniser ne possï¿½de pas d'opï¿½rateur atomic (comme atomicADD, ...),
+ * 	ou lorsqui'il s'agit de plusieurs opï¿½rations ï¿½ synchroniser (joue alors le role de section critique)
  *
  * Note :
  *
- * 	Lock ne laisse aucune trace coté host, il s'instancie only coté device: Code moins invasif
- * 	LockMixte laisse une trace coté host. Code plus invasif
+ * 	Lock ne laisse aucune trace cotï¿½ host, il s'instancie only cotï¿½ device: Code moins invasif
+ * 	LockMixte laisse une trace cotï¿½ host. Code plus invasif
  *
  * Use (Device side only)
  *
  *
  *	// Global variable of .cu
- * 	__device__ int mutex=0;	// Attention à l'initialisation
+ * 	__device__ int mutex=0;	// Attention ï¿½ l'initialisation
  *
  *	// variable local inside a kernel (same .cu as variable mutex)
  * 	Lock lock=Lock(&mutex);
@@ -76,10 +76,10 @@ class Lock
 	    // Solution:
 	    // 		atomicCAS  = atomic Compare And Swap
 	    // 		Prototype : 	c atomicCAS(ptr,a,b)
-	    // 		Action :	compare ptr avec a, si egale affecte b à ptr, renvoie ptr
+	    // 		Action :	compare ptr avec a, si egale affecte b ï¿½ ptr, renvoie ptr
 
 	    // Tant que ptrDev_mutex!=0 le thread cuda boucle sur while
-	    // Des qu'il vaut 0, il met le mutex à 1 et lock se termine
+	    // Des qu'il vaut 0, il met le mutex ï¿½ 1 et lock se termine
 	    while (atomicCAS(ptrDevMutexGM, 0, 1) != 0);
 	    }
 
@@ -92,7 +92,7 @@ class Lock
 	    // Solution 1:
 	    //
 	    //		*ptrDev_mutex=0;
-	    //		Aucun thread en competition ici. L'affectation n'a pas besoin d'être atomique.
+	    //		Aucun thread en competition ici. L'affectation n'a pas besoin d'ï¿½tre atomique.
 	    //		Solution satisfaisante.
 	    //
 	    // Solution 2 (prefered for symetric approach)
@@ -112,7 +112,7 @@ class Lock
 	int* ptrDevMutexGM;
     };
 
-#endif 
+#endif
 
 /*----------------------------------------------------------------------*\
  |*			End	 					*|
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu b/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu
new file mode 100644
index 0000000..1a5b6eb
--- /dev/null
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/01b_AddVector/AddVector.cu
@@ -0,0 +1,65 @@
+#include <iostream>
+
+#include "Indice2D.h"
+#include "cudaTools.h"
+#include "Device.h"
+
+using std::cout;
+using std::endl;
+
+static __global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult);
+static __device__ float work(float v1, float v2);
+
+__global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult)
+    {
+    const int NB_THREAD = Indice2D::nbThread();
+    const int TID = Indice2D::tid();
+
+    int s = TID;
+
+    while (s < n)
+        {
+        ptrDevResult[s] = work(ptrDevV1[s], ptrDevV2[s]);
+        s += NB_THREAD;
+        }
+    }
+
+__device__ float work(float v1, float v2)
+    {
+    return v1 + v2;
+    }
+
+bool addVectors()
+    {
+    // Inputs (passÃ© en paramÃ¨tre de la fonction dans un cas gÃ©nÃ©ral).
+    float v1[] = { 1, 2, 3 };
+    float v2[] = { 10, 20, 30 };
+
+    // Outputs (renvoyer de la fonction dans un cas gÃ©nÃ©ral).
+    float vRes[3];
+
+    // Allocation cotÃ© GPU.
+    float* ptrDevV1, *ptrDevV2, *ptrDevVResult = 0;
+    const size_t vecSize = 3 * sizeof(float);
+    HANDLE_ERROR(cudaMalloc(&ptrDevV1, vecSize));
+    HANDLE_ERROR(cudaMalloc(&ptrDevV2, vecSize));
+    HANDLE_ERROR(cudaMalloc(&ptrDevVResult, vecSize));
+
+    HANDLE_ERROR(cudaMemset(ptrDevV1, 0, vecSize));
+    HANDLE_ERROR(cudaMemset(ptrDevV2, 0, vecSize));
+    HANDLE_ERROR(cudaMemset(ptrDevVResult, 0, vecSize));
+
+    HANDLE_ERROR(cudaMemcpy(ptrDevV1, v1, vecSize, cudaMemcpyHostToDevice));
+    HANDLE_ERROR(cudaMemcpy(ptrDevV2, v2, vecSize, cudaMemcpyHostToDevice));
+
+    const dim3 dg(2, 2, 1);
+    const dim3 db(2, 2, 1);
+    Device::assertDim(dg, db);
+
+    add<<<dg, db>>>(ptrDevV1, ptrDevV2, 3, ptrDevVResult);
+
+    // BarriÃ¨re implicite de synchronisation ('cudaMemCpy').
+    HANDLE_ERROR(cudaMemcpy(vRes, ptrDevVResult, vecSize, cudaMemcpyDeviceToHost));
+
+    return vRes[0] == 11 && vRes[1] == 22 && vRes[2] == 33;
+    }
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
index c96562a..27c3a55 100644
--- a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
@@ -1,16 +1,20 @@
+#include "ProduitScalaire.h"
+
 #include <iostream>
 #include <cmath>
+#include <stdio.h>
 using namespace std;
 
 #include "Indice1D.h"
 #include "cudaTools.h"
 #include "Device.h"
+#include "Lock.h"
 
 #define M_V 200
 #define M_W 200
 
 #define VI 1.4422495703074083
-#define WI 0.7390850782394409
+#define WI 0.739085133215160672293109200837
 
 /*
  * Renvoie la valeur du iÃ¨me Ã©lÃ©ment du vecteur v.
@@ -24,7 +28,13 @@ double v(long i)
         const double xCarre = x * x;
         x = x - (xCarre * x - 3) / (3 * xCarre);
         }
-    return (x / VI) * sqrt(double(i));
+
+    /* Debug afin d'ajuster VI
+    if (Indice1D::tid() == 0)
+        printf("x: %.30f, VI: %.30f, x / VI: %.30f\n", x, VI, x / VI);
+        */
+
+    return (x / VI) * sqrt(double(i)); // x / VI doit Ãªtre Ã©gal Ã  1.
     }
 
 /*
@@ -36,13 +46,20 @@ double w(long i)
     double x = abs(cos(double(i)));
     for (long j = 1; j <= M_W; j++)
         x = x - (cos(x) - x) / (-sin(x) - 1);
+
+    /* Debug afin d'ajuster WI
+    if (Indice1D::tid() == 0)
+        printf("x: %.30f, WI: %.30f, x / WI: %.30f\n", x, WI, x / WI); */
+
     return (x / WI) * sqrt(double(i));
     }
 
 /*
+ * 1) Chaque thread calcule un rÃ©sultat intermediaire qu'il va ensuite placer en shared memory.
  * n: La taille des deux vecteurs.
  */
-__device__ void reductionIntraThread(int n, double* tabResultSM)
+__device__
+void reductionIntraThread(int n, double* tabSM)
     {
     const int NB_THREAD = Indice1D::nbThread();
     const int TID = Indice1D::tid();
@@ -55,44 +72,63 @@ __device__ void reductionIntraThread(int n, double* tabResultSM)
         threadResult += v(s) * w(s);
         s += NB_THREAD;
         }
-    tabResultSM[TID_LOCAL] = threadResult;
+
+    tabSM[TID_LOCAL] = threadResult;
     }
 
 /*
- * Combine les rÃ©sultats de 'tabResultSM' dans 'tabResulSM[0]'
+ * Combine les rÃ©sultats de 'tabSM' dans 'tabSM[0]'
  */
-__device__ void combine(double* tabResultSM, int middle)
+__device__
+void combine(double* tabSM, int middle)
     {
-        const int TID_LOCAL = Indice1D::tidLocal();
-        const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
-
-        int s = TID_LOCAL;
-        while (s < middle)
-            {
-            tabResultSM[s] = tabResultSM[s] + tabResultSM[s + middle];
-            s += NB_THREAD_LOCAL;
-            }
+    const int TID_LOCAL = Indice1D::tidLocal();
+    const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
+
+    int s = TID_LOCAL;
+    while (s < middle)
+        {
+        tabSM[s] = tabSM[s] + tabSM[s + middle];
+        s += NB_THREAD_LOCAL;
+        }
     }
 
-__device__ void reductionIntraBlock(double* tabResultSM)
+/*
+ * 2) La shared memory est rÃ©duite, le rÃ©sultat est placÃ© dans 'tabSM[0]'.
+ */
+__device__
+void reductionIntraBlock(double* tabSM)
     {
     const int TAB_SIZE = blockDim.x;
     int middle = TAB_SIZE / 2;
 
     while (middle > 0)
         {
-        combine(tabResultSM, middle);
+        combine(tabSM, middle);
         middle /= 2;
         __syncthreads();
         }
     }
 
-__device__ void reductionInterBlock(double* tabResultSM, float* ptrResult)
+__device__
+int mutexReductionInterBlock = 0;
+
+/*
+ * 3) Le 'tabSM[0]' de chaque bloc est reduit dans 'ptrResult' qui se trouve en global memory.
+ */
+__device__
+void reductionInterBlock(double* tabSM, double* ptrResult)
     {
     const int TID_LOCAL = Indice1D::tidLocal();
     if (TID_LOCAL == 0)
         {
-        atomicAdd(ptrResult, float(tabResultSM[0]));
+        Lock lock(&mutexReductionInterBlock);
+        lock.lock();
+        (*ptrResult) += tabSM[0];
+        lock.unlock();
+
+        // Si on travail en float (pas besoin de mutex) :
+        // atomicAdd(ptrResult, float(tabSM[0]));
         }
     }
 
@@ -103,36 +139,36 @@ __device__ void reductionInterBlock(double* tabResultSM, float* ptrResult)
  * ptrResult: Le resultat du produit scalaire.
  */
 __global__
-void produitScalaire(int n, float* ptrResult)
+void produitScalaire(int n, double* ptrResult)
     {
-    extern __shared__ double tabResultSM[]; // Shared memory.
+    extern __shared__ double tabSM[]; // Dynamic shared memory.;
 
     // 1) RÃ©duction intra-thread.
-    reductionIntraThread(n, tabResultSM);
+    reductionIntraThread(n, tabSM);
 
     __syncthreads();
 
     // 2) RÃ©duction intra-block.
-    reductionIntraBlock(tabResultSM);
+    reductionIntraBlock(tabSM);
 
     // 3) RÃ©duction inter-block.
-    reductionInterBlock(tabResultSM, ptrResult);
+    reductionInterBlock(tabSM, ptrResult);
     }
 
 double resultatTheorique(long n)
-{
+    {
     n -= 1;
-    return (n / 2.0) * (n+1);
-}
+    return (n / 2.0) * (n + 1);
+    }
 
 bool produitScalaire()
     {
-    const int N = 100000000; // Taille des deux vecteurs.
+    const int N = 10000000; // Taille des deux vecteurs : 10 * 10^6.
 
     // Allocation cotÃ© GPU en global memory (GM).
-    float* ptrDevResult = 0;
-    HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(float)));
-    HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(float)));
+    double* ptrDevResult = 0;
+    HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(double)));
+    HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(double)));
 
     // ParamÃ¨tre de l'appel de la fonction sur le device.
     const dim3 dg(256, 1, 1);
@@ -142,13 +178,15 @@ bool produitScalaire()
 
     produitScalaire<<<dg, db, SMSize>>>(N, ptrDevResult);
 
-    float res;
+    cudaDeviceSynchronize(); // UtilisÃ© pour flusher les prints sur le stdout Ã  partir du device.
+
+    double res;
     // BarriÃ¨re implicite de synchronisation ('cudaMemCpy').
-    HANDLE_ERROR(cudaMemcpy(&res, ptrDevResult, sizeof(float), cudaMemcpyDeviceToHost));
+    HANDLE_ERROR(cudaMemcpy(&res, ptrDevResult, sizeof(double), cudaMemcpyDeviceToHost));
 
-    double resTheo = resultatTheorique(N);
+    const double resTheo = resultatTheorique(N);
 
-    cout.precision(10);
+    cout.precision(30);
     cout << "RÃ©sultat : " << res << endl;
     cout << "RÃ©sultat thÃ©orique : " << resTheo << endl;
     cout << "DiffÃ©rence absolue : " << resTheo - res << endl;
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.h b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.h
new file mode 100644
index 0000000..aa9423d
--- /dev/null
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.h
@@ -0,0 +1,6 @@
+#ifndef PRODUIT_SCALAIRE_H
+#define PRODUIT_SCALAIRE_H
+
+bool produitScalaire();
+
+#endif
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
index 05a7e99..d16b105 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
@@ -2,11 +2,10 @@
 #include <stdlib.h>
 using namespace std;
 
+#include "02_ProduitScalaire/ProduitScalaire.h"
+
 extern bool useHello();
 extern bool addVectors();
-extern bool produitScalaire();
-
-int mainCore();
 
 int mainCore()
     {
-- 
2.43.0