From 8cb42977ea2d8904795381957474b65f15f46526 Mon Sep 17 00:00:00 2001
From: gburri
Date: Sun, 30 Nov 2014 17:12:23 +0100
Subject: [PATCH] Add the "Produit scalaire" (dot product) TP.

---
 .../src/cpp/core/02_AddVector/AddVector.cu    |  65 -------
 .../02_ProduitScalaire/ProduitScalaire.cu     | 158 ++++++++++++++++++
 .../Student_Cuda/src/cpp/core/mainCore.cpp    |  49 +-----
 WCudaMSE/Student_Cuda/src/cpp/main.cpp        |   4 +-
 .../Student_Cuda/src/cpp/test/mainTest.cpp    |   4 +-
 .../moo/device/FractalDevice.cu               |  36 +---
 6 files changed, 169 insertions(+), 147 deletions(-)
 delete mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu
 create mode 100644 WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu

diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu b/WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu
deleted file mode 100644
index 766480e..0000000
--- a/WCudaMSE/Student_Cuda/src/cpp/core/02_AddVector/AddVector.cu
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <iostream>
-
-#include "Indice2D.h"
-#include "cudaTools.h"
-#include "Device.h"
-
-using std::cout;
-using std::endl;
-
-static __global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult);
-static __device__ float work(float v1, float v2);
-
-__global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult)
-    {
-    const int NB_THREAD = Indice2D::nbThread();
-    const int TID = Indice2D::tid();
-
-    int s = TID;
-
-    while (s < n)
-        {
-        ptrDevResult[s] = work(ptrDevV1[s], ptrDevV2[s]);
-        s += NB_THREAD;
-        }
-    }
-
-__device__ float work(float v1, float v2)
-    {
-    return v1 + v2;
-    }
-
-bool addVectors()
-    {
-    // Inputs (passé en paramètre de la fonction dans un cas général).
-    float v1[] = { 1, 2, 3 };
-    float v2[] = { 10, 20, 30 };
-
-    // Outputs (renvoyer de la fonction dans un cas général).
-    float vRes[3];
-
-    // Allocation coté GPU.
-    float* ptrDevV1, *ptrDevV2, *ptrDevVResult = 0;
-    const size_t vecSize = 3 * sizeof(float);
-    HANDLE_ERROR(cudaMalloc(&ptrDevV1, vecSize));
-    HANDLE_ERROR(cudaMalloc(&ptrDevV2, vecSize));
-    HANDLE_ERROR(cudaMalloc(&ptrDevVResult, vecSize));
-
-    HANDLE_ERROR(cudaMemset(ptrDevV1, 0, vecSize));
-    HANDLE_ERROR(cudaMemset(ptrDevV2, 0, vecSize));
-    HANDLE_ERROR(cudaMemset(ptrDevVResult, 0, vecSize));
-
-    HANDLE_ERROR(cudaMemcpy(ptrDevV1, v1, vecSize, cudaMemcpyHostToDevice));
-    HANDLE_ERROR(cudaMemcpy(ptrDevV2, v2, vecSize, cudaMemcpyHostToDevice));
-
-    const dim3 dg(2, 2, 1);
-    const dim3 db(2, 2, 1);
-    Device::assertDim(dg, db);
-
-    add<<<dg, db>>>(ptrDevV1, ptrDevV2, 3, ptrDevVResult);
-
-    // Barrière implicite de synchronisation ('cudaMemCpy').
-    HANDLE_ERROR(cudaMemcpy(vRes, ptrDevVResult, vecSize, cudaMemcpyDeviceToHost));
-
-    return vRes[0] == 11 && vRes[1] == 22 && vRes[2] == 33;
-    }
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
new file mode 100644
index 0000000..c96562a
--- /dev/null
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/02_ProduitScalaire/ProduitScalaire.cu
@@ -0,0 +1,158 @@
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+#include "Indice1D.h"
+#include "cudaTools.h"
+#include "Device.h"
+
+#define M_V 200
+#define M_W 200
+
+#define VI 1.4422495703074083
+#define WI 0.7390850782394409
+
+/*
+ * Returns the value of the i-th element of vector v.
+ */
+__device__
+double v(long i)
+    {
+    double x = 1.5 + abs(cos(double(i)));
+    for (long j = 1; j <= M_V; j++)
+        {
+        const double xCarre = x * x;
+        x = x - (xCarre * x - 3) / (3 * xCarre);
+        }
+    return (x / VI) * sqrt(double(i));
+    }
+
+/*
+ * Returns the value of the i-th element of vector w.
+ */
+__device__
+double w(long i)
+    {
+    double x = abs(cos(double(i)));
+    for (long j = 1; j <= M_W; j++)
+        x = x - (cos(x) - x) / (-sin(x) - 1);
+    return (x / WI) * sqrt(double(i));
+    }
+
+/*
+ * n: the size of the two vectors.
+ */
+__device__ void reductionIntraThread(int n, double* tabResultSM)
+    {
+    const int NB_THREAD = Indice1D::nbThread();
+    const int TID = Indice1D::tid();
+    const int TID_LOCAL = Indice1D::tidLocal();
+
+    double threadResult = 0.0;
+    int s = TID;
+    while (s < n)
+        {
+        threadResult += v(s) * w(s);
+        s += NB_THREAD;
+        }
+    tabResultSM[TID_LOCAL] = threadResult;
+    }
+
+/*
+ * Combines the results in 'tabResultSM' into 'tabResultSM[0]'.
+ */
+__device__ void combine(double* tabResultSM, int middle)
+    {
+    const int TID_LOCAL = Indice1D::tidLocal();
+    const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
+
+    int s = TID_LOCAL;
+    while (s < middle)
+        {
+        tabResultSM[s] = tabResultSM[s] + tabResultSM[s + middle];
+        s += NB_THREAD_LOCAL;
+        }
+    }
+
+__device__ void reductionIntraBlock(double* tabResultSM)
+    {
+    const int TAB_SIZE = blockDim.x;
+    int middle = TAB_SIZE / 2;
+
+    while (middle > 0)
+        {
+        combine(tabResultSM, middle);
+        middle /= 2;
+        __syncthreads();
+        }
+    }
+
+__device__ void reductionInterBlock(double* tabResultSM, float* ptrResult)
+    {
+    const int TID_LOCAL = Indice1D::tidLocal();
+    if (TID_LOCAL == 0)
+        {
+        atomicAdd(ptrResult, float(tabResultSM[0]));
+        }
+    }
+
+/**
+ * The size of the shared memory (in number of sizeof(double)) must be
+ * equal to the block size.
+ * n: the size of the two vectors.
+ * ptrResult: the result of the dot product.
+ */
+__global__
+void produitScalaire(int n, float* ptrResult)
+    {
+    extern __shared__ double tabResultSM[]; // Shared memory.
+
+    // 1) Intra-thread reduction.
+    reductionIntraThread(n, tabResultSM);
+
+    __syncthreads();
+
+    // 2) Intra-block reduction.
+    reductionIntraBlock(tabResultSM);
+
+    // 3) Inter-block reduction.
+    reductionInterBlock(tabResultSM, ptrResult);
+    }
+
+double resultatTheorique(long n)
+{
+    n -= 1;
+    return (n / 2.0) * (n + 1);
+}
+
+bool produitScalaire()
+    {
+    const int N = 100000000; // Size of the two vectors.
+
+    // Allocation on the GPU side, in global memory (GM).
+    float* ptrDevResult = 0;
+    HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(float)));
+    HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(float)));
+
+    // Launch parameters of the kernel on the device.
+    const dim3 dg(256, 1, 1);
+    const dim3 db(256, 1, 1);
+    Device::assertDim(dg, db);
+    const size_t SMSize = db.x * sizeof(double); // 256 doubles.
+
+    produitScalaire<<<dg, db, SMSize>>>(N, ptrDevResult);
+
+    float res;
+    // Implicit synchronization barrier ('cudaMemcpy').
+    HANDLE_ERROR(cudaMemcpy(&res, ptrDevResult, sizeof(float), cudaMemcpyDeviceToHost));
+
+    double resTheo = resultatTheorique(N);
+
+    cout.precision(10);
+    cout << "Result              : " << res << endl;
+    cout << "Theoretical result  : " << resTheo << endl;
+    cout << "Absolute difference : " << resTheo - res << endl;
+    cout << "Relative difference : " << 100 * (resTheo - res) / (resTheo + res) << " %" << endl;
+
+    return true;
+    }
diff --git a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
index 31af0c5..05a7e99 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/core/mainCore.cpp
@@ -1,46 +1,19 @@
 #include <iostream>
 #include <stdlib.h>
+using namespace std;
-
-using std::cout;
-using std::endl;
-
-/*----------------------------------------------------------------------*\
- |*                     Declaration                                      *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |*             Imported                 *|
- \*-------------------------------------*/
-
-extern bool useHello(void);
+extern bool useHello();
 extern bool addVectors();
-
-/*--------------------------------------*\
- |*             Public                   *|
- \*-------------------------------------*/
+extern bool produitScalaire();
 
 int mainCore();
 
-/*--------------------------------------*\
- |*             Private                  *|
- \*-------------------------------------*/
-
-
-
-/*----------------------------------------------------------------------*\
- |*                     Implementation                                   *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |*             Public                   *|
- \*-------------------------------------*/
-
 int mainCore()
     {
     bool isOk = true;
-    isOk &= useHello();
-    isOk &= addVectors();
+    /*isOk &= useHello();
+    isOk &= addVectors();*/
+    isOk &= produitScalaire();
 
     cout << "\nisOK = " << isOk << endl;
     cout << "\nEnd : mainCore" << endl;
 
@@ -48,13 +21,3 @@ int mainCore()
     return isOk ?
            EXIT_SUCCESS : EXIT_FAILURE;
     }
-
-/*--------------------------------------*\
- |*             Private                  *|
- \*-------------------------------------*/
-
-
-
-/*----------------------------------------------------------------------*\
- |*                     End                                              *|
- \*---------------------------------------------------------------------*/
-
diff --git a/WCudaMSE/Student_Cuda/src/cpp/main.cpp b/WCudaMSE/Student_Cuda/src/cpp/main.cpp
index ad4d735..495aa65 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/main.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/main.cpp
@@ -4,7 +4,7 @@
 #include "cudaTools.h"
 #include "Device.h"
-#include "LimitsTools.h" 
+#include "LimitsTools.h"
 
 using std::cout;
 using std::endl;
 
@@ -49,7 +49,7 @@ int main(void)
 
     if (Device::isCuda())
         {
-        Device::printAll(); 
+        Device::printAll();
         Device::printAllSimple();
 
         // Server Cuda1: in [0,5]
diff --git a/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp b/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp
index 259f355..d6a5144 100755
--- a/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp
+++ b/WCudaMSE/Student_Cuda/src/cpp/test/mainTest.cpp
@@ -59,11 +59,11 @@ int mainTest()
 
 bool testALL()
     {
-    int deviceId=Device::getDeviceId();
+    int deviceId = Device::getDeviceId();
 
     Suite testSuite;
 
-    testSuite.add(std::auto_ptr < Suite > (new TestHello(deviceId)));
+    testSuite.add(std::auto_ptr<Suite>(new TestHello(deviceId)));
 
     string titre = "deviceId_" + StringTools::toString(deviceId);
 
diff --git a/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu b/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu
index f9a8926..997b32b 100755
--- a/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu
+++ b/WCudaMSE/Student_Cuda_Image/src/cpp/core/02_Mandelbrot_Julia/moo/device/FractalDevice.cu
@@ -11,39 +11,10 @@
 using std::cout;
 using std::endl;
 
-/*----------------------------------------------------------------------*\
- |*                     Declaration                                      *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |*             Imported                 *|
- \*-------------------------------------*/
-
-/*--------------------------------------*\
- |*             Public                   *|
- \*-------------------------------------*/
-
 __global__ void fractalMandelbrot(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n);
 __global__ void fractalJulia(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n, float c_r, float c_i);
 __device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& domaineMath, int n, const FractalMath& fractalMath);
 
-/*--------------------------------------*\
- |*             Private                  *|
- \*-------------------------------------*/
-
-/*----------------------------------------------------------------------*\
- |*                     Implementation                                   *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |*             Public                   *|
- \*-------------------------------------*/
-
-/*--------------------------------------*\
- |*             Private                  *|
- \*-------------------------------------*/
-
-
 __global__ void fractalMandelbrot(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n)
     {
     FractalMandelbrotMath fractalMath(n);
@@ -73,7 +44,7 @@ __device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& d
         {
         IndiceTools::toIJ(s, w, &pixelI, &pixelJ); // update (pixelI, pixelJ)
 
-        // (i,j) domaine ecran
+        // (i,j) domaine écran
         // (x,y) domaine math
         domaineMath.toXY(pixelI, pixelJ, &x, &y); // (i,j) -> (x,y)
 
@@ -84,8 +55,3 @@ __device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& d
         {
         s += NB_THREAD;
         }
     }
-
-/*----------------------------------------------------------------------*\
- |*                     End                                              *|
- \*---------------------------------------------------------------------*/
-
-- 
2.45.2
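
Note on the precision of the final reduction (a minimal sketch, not part of the patch above): with N = 100000000 the theoretical value is about 5.0e15, which is far beyond the roughly 7 significant digits a float can represent, so accumulating the per-block partial sums with atomicAdd(float*, float) is what dominates the absolute difference printed by produitScalaire(). One way to keep the whole chain in double precision on devices without a native double atomicAdd (compute capability below 6.0) is the atomicCAS emulation described in the CUDA C Programming Guide. The sketch assumes the same Indice1D helper used in the patch; atomicAddDouble and reductionInterBlockDouble are hypothetical names, not functions defined elsewhere in this code base.

// Hypothetical sketch: double-precision accumulation for the inter-block step.
// atomicAdd(double*, double) exists natively on compute capability >= 6.0;
// on older devices it can be emulated with atomicCAS as below.
__device__ double atomicAddDouble(double* address, double val)
    {
    unsigned long long int* address_as_ull = (unsigned long long int*) address;
    unsigned long long int old = *address_as_ull;
    unsigned long long int assumed;
    do
        {
        assumed = old;
        // Reinterpret the double bits as 64-bit integers so atomicCAS can be used.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        }
    while (assumed != old);
    return __longlong_as_double(old);
    }

__device__ void reductionInterBlockDouble(double* tabResultSM, double* ptrResult)
    {
    if (Indice1D::tidLocal() == 0)
        {
        atomicAddDouble(ptrResult, tabResultSM[0]); // One atomic add per block.
        }
    }

On the host side this would only require allocating ptrDevResult as a double, clearing it with cudaMemset, and copying back a double instead of a float; the intra-thread and intra-block reductions already work in double and would stay unchanged.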