From 8cb42977ea2d8904795381957474b65f15f46526 Mon Sep 17 00:00:00 2001
From: gburri
Date: Sun, 30 Nov 2014 17:12:23 +0100
Subject: [PATCH] Add the "Produit scalaire" (dot product) lab.

---
 .../src/cpp/core/02_AddVector/AddVector.cu      |  65 --------
 .../02_ProduitScalaire/ProduitScalaire.cu       | 158 ++++++++++++++++++
 .../Student_Cuda/src/cpp/core/mainCore.cpp      |  49 +-----
 WCudaMSE/Student_Cuda/src/cpp/main.cpp          |   4 +-
 .../Student_Cuda/src/cpp/test/mainTest.cpp      |   4 +-
 .../moo/device/FractalDevice.cu                 |  36 +---
 6 files changed, 169 insertions(+), 147 deletions(-)
 delete mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu
 create mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu

diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu b/WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu
deleted file mode 100644
index 766480e..0000000
--- a/WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <iostream>
-
-#include "Indice2D.h"
-#include "cudaTools.h"
-#include "Device.h"
-
-using std::cout;
-using std::endl;
-
-static __global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult);
-static __device__ float work(float v1, float v2);
-
-__global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult)
-    {
-    const int NB_THREAD = Indice2D::nbThread();
-    const int TID = Indice2D::tid();
-
-    int s = TID;
-
-    while (s < n)
-        {
-        ptrDevResult[s] = work(ptrDevV1[s], ptrDevV2[s]);
-        s += NB_THREAD;
-        }
-    }
-
-__device__ float work(float v1, float v2)
-    {
-    return v1 + v2;
-    }
-
-bool addVectors()
-    {
-    // Inputs (passed as function parameters in the general case).
-    float v1[] = { 1, 2, 3 };
-    float v2[] = { 10, 20, 30 };
-
-    // Outputs (returned by the function in the general case).
-    float vRes[3];
-
-    // GPU-side allocation.
-    float* ptrDevV1, *ptrDevV2, *ptrDevVResult = 0;
-    const size_t vecSize = 3 * sizeof(float);
-    HANDLE_ERROR(cudaMalloc(&ptrDevV1, vecSize));
-    HANDLE_ERROR(cudaMalloc(&ptrDevV2, vecSize));
-    HANDLE_ERROR(cudaMalloc(&ptrDevVResult, vecSize));
-
-    HANDLE_ERROR(cudaMemset(ptrDevV1, 0, vecSize));
-    HANDLE_ERROR(cudaMemset(ptrDevV2, 0, vecSize));
-    HANDLE_ERROR(cudaMemset(ptrDevVResult, 0, vecSize));
-
-    HANDLE_ERROR(cudaMemcpy(ptrDevV1, v1, vecSize, cudaMemcpyHostToDevice));
-    HANDLE_ERROR(cudaMemcpy(ptrDevV2, v2, vecSize, cudaMemcpyHostToDevice));
-
-    const dim3 dg(2, 2, 1);
-    const dim3 db(2, 2, 1);
-    Device::assertDim(dg, db);
-
-    add<<<dg, db>>>(ptrDevV1, ptrDevV2, 3, ptrDevVResult);
-
-    // Implicit synchronization barrier ('cudaMemcpy').
-    HANDLE_ERROR(cudaMemcpy(vRes, ptrDevVResult, vecSize, cudaMemcpyDeviceToHost));
-
-    return vRes[0] == 11 && vRes[1] == 22 && vRes[2] == 33;
-    }
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
new file mode 100644
index 0000000..c96562a
--- /dev/null
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
@@ -0,0 +1,158 @@
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+#include "Indice1D.h"
+#include "cudaTools.h"
+#include "Device.h"
+
+#define M_V 200
+#define M_W 200
+
+#define VI 1.4422495703074083
+#define WI 0.7390850782394409
+
+/*
+ * Returns the value of the i-th element of vector v.
+ */
+__device__
+double v(long i)
+    {
+    double x = 1.5 + abs(cos(double(i)));
+    for (long j = 1; j <= M_V; j++)
+        {
+        const double xCarre = x * x;
+        x = x - (xCarre * x - 3) / (3 * xCarre);
+        }
+    return (x / VI) * sqrt(double(i));
+    }
+
+/*
+ * Returns the value of the i-th element of vector w.
+ */
+__device__
+double w(long i)
+    {
+    double x = abs(cos(double(i)));
+    for (long j = 1; j <= M_W; j++)
+        x = x - (cos(x) - x) / (-sin(x) - 1);
+    return (x / WI) * sqrt(double(i));
+    }
+
+/*
+ * n: the size of the two vectors.
+ */
+__device__ void reductionIntraThread(int n, double* tabResultSM)
+    {
+    const int NB_THREAD = Indice1D::nbThread();
+    const int TID = Indice1D::tid();
+    const int TID_LOCAL = Indice1D::tidLocal();
+
+    double threadResult = 0.0;
+    int s = TID;
+    while (s < n)
+        {
+        threadResult += v(s) * w(s);
+        s += NB_THREAD;
+        }
+    tabResultSM[TID_LOCAL] = threadResult;
+    }
+
+/*
+ * Combines the results of 'tabResultSM' into 'tabResultSM[0]'.
+ */
+__device__ void combine(double* tabResultSM, int middle)
+    {
+    const int TID_LOCAL = Indice1D::tidLocal();
+    const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
+
+    int s = TID_LOCAL;
+    while (s < middle)
+        {
+        tabResultSM[s] = tabResultSM[s] + tabResultSM[s + middle];
+        s += NB_THREAD_LOCAL;
+        }
+    }
+
+__device__ void reductionIntraBlock(double* tabResultSM)
+    {
+    const int TAB_SIZE = blockDim.x;
+    int middle = TAB_SIZE / 2;
+
+    while (middle > 0)
+        {
+        combine(tabResultSM, middle);
+        middle /= 2;
+        __syncthreads();
+        }
+    }
+
+__device__ void reductionInterBlock(double* tabResultSM, float* ptrResult)
+    {
+    const int TID_LOCAL = Indice1D::tidLocal();
+    if (TID_LOCAL == 0)
+        {
+        atomicAdd(ptrResult, float(tabResultSM[0]));
+        }
+    }
+
+/**
+ * The shared memory size (in number of sizeof(double) elements) must
+ * be equal to the block size.
+ * n: the size of the two vectors.
+ * ptrResult: the result of the dot product.
+ */
+__global__
+void produitScalaire(int n, float* ptrResult)
+    {
+    extern __shared__ double tabResultSM[]; // Shared memory.
+
+    // 1) Intra-thread reduction.
+    reductionIntraThread(n, tabResultSM);
+
+    __syncthreads();
+
+    // 2) Intra-block reduction.
+    reductionIntraBlock(tabResultSM);
+
+    // 3) Inter-block reduction.
+    reductionInterBlock(tabResultSM, ptrResult);
+    }
+
+double resultatTheorique(long n)
+{
+    n -= 1;
+    return (n / 2.0) * (n+1);
+}
+
+bool produitScalaire()
+    {
+    const int N = 100000000; // Size of the two vectors.
+
+    // GPU-side allocation in global memory (GM).
+    float* ptrDevResult = 0;
+    HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(float)));
+    HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(float)));
+
+    // Launch configuration for the kernel call on the device.
+    const dim3 dg(256, 1, 1);
+    const dim3 db(256, 1, 1);
+    Device::assertDim(dg, db);
+    const size_t SMSize = db.x * sizeof(double); // 256 doubles.
+
+    produitScalaire<<<dg, db, SMSize>>>(N, ptrDevResult);
+
+    float res;
+    // Implicit synchronization barrier ('cudaMemcpy').
+    HANDLE_ERROR(cudaMemcpy(&res, ptrDevResult, sizeof(float), cudaMemcpyDeviceToHost));
+
+    double resTheo = resultatTheorique(N);
+
+    cout.precision(10);
+    cout << "Result              : " << res << endl;
+    cout << "Theoretical result  : " << resTheo << endl;
+    cout << "Absolute difference : " << resTheo - res << endl;
+    cout << "Relative difference : " << 100 * (resTheo - res) / (resTheo + res) << " %" << endl;
+
+    return true;
+    }
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
index 31af0c5..05a7e99 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
@@ -1,46 +1,19 @@
 #include <iostream>
 #include <stdlib.h>
+using namespace std;
 
-
-using std::cout;
-using std::endl;
-
-/*----------------------------------------------------------------------*\
- |* Declaration *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Imported *|
- \*-------------------------------------*/
-
-extern bool useHello(void);
+extern bool useHello();
 extern bool addVectors();
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
+extern bool produitScalaire();
 
 int mainCore();
 
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-
-
-/*----------------------------------------------------------------------*\
- |* Implementation *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
-
 int mainCore()
     {
     bool isOk = true;
-    isOk &= useHello();
-    isOk &= addVectors();
+    /*isOk &= useHello();
+    isOk &= addVectors();*/
+    isOk &= produitScalaire();
 
     cout << "\nisOK = " << isOk << endl;
     cout << "\nEnd : mainCore" << endl;
@@ -48,13 +21,3 @@ int mainCore()
 
     return isOk ? EXIT_SUCCESS : EXIT_FAILURE;
     }
-
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-
-
-/*----------------------------------------------------------------------*\
- |* End *|
- \*---------------------------------------------------------------------*/
-
diff --git a/WCudaMSE/Student_Cuda/src/cpp/main.cpp b/WCudaMSE/Student_Cuda/src/cpp/main.cpp
index ad4d735..495aa65 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/main.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/main.cpp
@@ -4,7 +4,7 @@
 
 #include "cudaTools.h"
 #include "Device.h"
-#include "LimitsTools.h"
+#include "LimitsTools.h"
 
 using std::cout;
 using std::endl;
@@ -49,7 +49,7 @@ int main(void)
 
     if (Device::isCuda())
         {
-        Device::printAll();
+        Device::printAll();
         Device::printAllSimple();
 
         // Server Cuda1: in [0,5]
diff --git a/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp b/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp
index 259f355..d6a5144 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp
@@ -59,11 +59,11 @@ int mainTest()
 
 bool testALL()
     {
-    int deviceId=Device::getDeviceId();
+    int deviceId = Device::getDeviceId();
 
     Suite testSuite;
 
-    testSuite.add(std::auto_ptr < Suite > (new TestHello(deviceId)));
+    testSuite.add(std::auto_ptr<Suite>(new TestHello(deviceId)));
 
     string titre = "deviceId_" + StringTools::toString(deviceId);
 
diff --git a/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu b/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu
index f9a8926..997b32b 100755
--- a/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu
+++ b/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu
@@ -11,39 +11,10 @@
 using std::cout;
 using std::endl;
 
-/*----------------------------------------------------------------------*\
- |* Declaration *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Imported *|
- \*-------------------------------------*/
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
-
 __global__ void fractalMandelbrot(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n);
 __global__ void fractalJulia(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n, float c_r, float c_i);
 __device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& domaineMath, int n, const FractalMath& fractalMath);
 
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-/*----------------------------------------------------------------------*\
- |* Implementation *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
-
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-
 __global__ void fractalMandelbrot(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n)
     {
     FractalMandelbrotMath fractalMath(n);
@@ -73,7 +44,7 @@ __device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& d
         {
         IndiceTools::toIJ(s, w, &pixelI, &pixelJ); // update (pixelI, pixelJ)
 
-        // (i,j) domaine ecran
+        // (i,j) domaine écran
         // (x,y) domaine math
         domaineMath.toXY(pixelI, pixelJ, &x, &y); // (i,j) -> (x,y)
 
@@ -84,8 +55,3 @@ __device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& d
         s += NB_THREAD;
         }
     }
-
-/*----------------------------------------------------------------------*\
- |* End *|
- \*---------------------------------------------------------------------*/
-
-- 
2.43.0
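
Note (appended to this write-up; not part of the patch): in ProduitScalaire.cu,
v(i) runs Newton's method on x^3 = 3 (x converges to VI, the cube root of 3) and
w(i) runs Newton's method on cos(x) = x (x converges to WI, the fixed point of
cosine), so after rescaling v(i) ~ w(i) ~ sqrt(i). Their product is roughly i, and
the dot product of the two vectors is roughly 0 + 1 + ... + (N-1) = N(N-1)/2,
which is exactly what resultatTheorique(N) returns. The kernel reduces in three
stages: intra-thread (strided accumulation into a register), intra-block (pairwise
halving in shared memory), inter-block (one atomicAdd per block).

The sketch below restates that reduction pattern in plain CUDA, without the course
helpers (Indice1D, Device, HANDLE_ERROR). It is a minimal illustration, not the
course's API: the name dotKernel, the unified-memory allocation and the chosen
sizes are made up for the example, and it assumes a power-of-two block size, a GPU
with atomicAdd on float (compute capability >= 2.0), and managed memory (CUDA 6 or
later).

#include <cstdio>
#include <cuda_runtime.h>

// Three-stage dot-product reduction, mirroring produitScalaire():
// 1) intra-thread, 2) intra-block in shared memory, 3) inter-block atomicAdd.
__global__ void dotKernel(const double* a, const double* b, int n, float* result)
    {
    extern __shared__ double sm[];
    const int tidLocal = threadIdx.x;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int nbThread = gridDim.x * blockDim.x;

    // 1) Each thread accumulates a strided slice of the two vectors.
    double acc = 0.0;
    for (int s = tid; s < n; s += nbThread)
        acc += a[s] * b[s];
    sm[tidLocal] = acc;
    __syncthreads();

    // 2) Pairwise reduction in shared memory (blockDim.x must be a power of two).
    for (int middle = blockDim.x / 2; middle > 0; middle /= 2)
        {
        if (tidLocal < middle)
            sm[tidLocal] += sm[tidLocal + middle];
        __syncthreads();
        }

    // 3) One atomic contribution per block into the global result.
    if (tidLocal == 0)
        atomicAdd(result, (float)sm[0]);
    }

int main()
    {
    const int n = 1 << 20;
    double* a;
    double* b;
    float* result;
    cudaMallocManaged(&a, n * sizeof(double));
    cudaMallocManaged(&b, n * sizeof(double));
    cudaMallocManaged(&result, sizeof(float));

    for (int i = 0; i < n; i++)
        {
        a[i] = 1.0;
        b[i] = 2.0;
        }
    *result = 0.0f;

    const int db = 256; // threads per block
    const int dg = 256; // blocks
    dotKernel<<<dg, db, db * sizeof(double)>>>(a, b, n, result);
    cudaDeviceSynchronize();

    printf("dot = %.1f (expected %.1f)\n", *result, 2.0 * n);

    cudaFree(a);
    cudaFree(b);
    cudaFree(result);
    return 0;
    }

Launched this way, the dynamic shared-memory size is db * sizeof(double), exactly
how SMSize is computed in produitScalaire(); zeroing *result before the launch
plays the same role as the cudaMemset on ptrDevResult in the patch.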