+++ /dev/null
-#include <iostream>
-
-#include "Indice2D.h"
-#include "cudaTools.h"
-#include "Device.h"
-
-using std::cout;
-using std::endl;
-
-static __global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult);
-static __device__ float work(float v1, float v2);
-
-__global__ void add(float* ptrDevV1, float* ptrDevV2, int n, float* ptrDevResult)
- {
- const int NB_THREAD = Indice2D::nbThread();
- const int TID = Indice2D::tid();
-
- int s = TID;
-
- while (s < n)
- {
- ptrDevResult[s] = work(ptrDevV1[s], ptrDevV2[s]);
- s += NB_THREAD;
- }
- }
-
-__device__ float work(float v1, float v2)
- {
- return v1 + v2;
- }
-
-bool addVectors()
- {
- // Inputs (passed as function parameters in the general case).
- float v1[] = { 1, 2, 3 };
- float v2[] = { 10, 20, 30 };
-
- // Outputs (returned from the function in the general case).
- float vRes[3];
-
- // GPU-side allocation.
- float* ptrDevV1, *ptrDevV2, *ptrDevVResult = 0;
- const size_t vecSize = 3 * sizeof(float);
- HANDLE_ERROR(cudaMalloc(&ptrDevV1, vecSize));
- HANDLE_ERROR(cudaMalloc(&ptrDevV2, vecSize));
- HANDLE_ERROR(cudaMalloc(&ptrDevVResult, vecSize));
-
- HANDLE_ERROR(cudaMemset(ptrDevV1, 0, vecSize));
- HANDLE_ERROR(cudaMemset(ptrDevV2, 0, vecSize));
- HANDLE_ERROR(cudaMemset(ptrDevVResult, 0, vecSize));
-
- HANDLE_ERROR(cudaMemcpy(ptrDevV1, v1, vecSize, cudaMemcpyHostToDevice));
- HANDLE_ERROR(cudaMemcpy(ptrDevV2, v2, vecSize, cudaMemcpyHostToDevice));
-
- const dim3 dg(2, 2, 1);
- const dim3 db(2, 2, 1);
- Device::assertDim(dg, db);
-
- add<<<dg, db>>>(ptrDevV1, ptrDevV2, 3, ptrDevVResult);
-
- // Implicit synchronization barrier ('cudaMemcpy').
- HANDLE_ERROR(cudaMemcpy(vRes, ptrDevVResult, vecSize, cudaMemcpyDeviceToHost));
-
- return vRes[0] == 11 && vRes[1] == 22 && vRes[2] == 33;
- }
--- /dev/null
+#include <iostream>
+#include <cmath>
+using namespace std;
+
+#include "Indice1D.h"
+#include "cudaTools.h"
+#include "Device.h"
+
+#define M_V 200 // Newton iterations per element of v.
+#define M_W 200 // Newton iterations per element of w.
+
+#define VI 1.4422495703074083 // 3^(1/3): the limit of the Newton iteration in v().
+#define WI 0.7390850782394409 // Dottie number (fixed point of cos), to float precision.
+
+/*
+ * Returns the value of the i-th element of vector v.
+ */
+__device__
+double v(long i)
+ {
+ double x = 1.5 + fabs(cos(double(i)));
+ for (long j = 1; j <= M_V; j++)
+ {
+ const double xCarre = x * x;
+ x = x - (xCarre * x - 3) / (3 * xCarre);
+ }
+ return (x / VI) * sqrt(double(i));
+ }
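+
+/*
+ * Note: the loop in v() is Newton's method on f(x) = x^3 - 3,
+ * x <- x - (x^3 - 3) / (3 * x^2), which converges to 3^(1/3) = VI.
+ * After dividing by VI, v(i) ~= sqrt(i); the M_V iterations only give
+ * each element a fixed amount of artificial work.
+ */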
+
+/*
+ * Returns the value of the i-th element of vector w.
+ */
+__device__
+double w(long i)
+ {
+ double x = fabs(cos(double(i)));
+ for (long j = 1; j <= M_W; j++)
+ x = x - (cos(x) - x) / (-sin(x) - 1);
+ return (x / WI) * sqrt(double(i));
+ }
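+
+/*
+ * Note: the loop in w() is Newton's method on f(x) = cos(x) - x, which
+ * converges to the fixed point of cos (the Dottie number) = WI, so
+ * w(i) ~= sqrt(i) as well. Hence v(i) * w(i) ~= i.
+ */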
+
+/*
+ * n: the size of the two vectors.
+ */
+__device__ void reductionIntraThread(int n, double* tabResultSM)
+ {
+ const int NB_THREAD = Indice1D::nbThread();
+ const int TID = Indice1D::tid();
+ const int TID_LOCAL = Indice1D::tidLocal();
+
+ double threadResult = 0.0;
+ int s = TID;
+ while (s < n)
+ {
+ threadResult += v(s) * w(s);
+ s += NB_THREAD;
+ }
+ tabResultSM[TID_LOCAL] = threadResult;
+ }
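+
+/*
+ * Note: the strided loop above is the classic grid-stride pattern: with
+ * NB_THREAD threads in flight, any vector size n is covered by a fixed
+ * launch configuration, and each thread leaves its partial sum in its
+ * own shared-memory slot (no synchronization needed yet).
+ */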
+
+/*
+ * Combines the results in 'tabResultSM' into 'tabResultSM[0]'.
+ */
+__device__ void combine(double* tabResultSM, int middle)
+ {
+ const int TID_LOCAL = Indice1D::tidLocal();
+ const int NB_THREAD_LOCAL = Indice1D::nbThreadBlock();
+
+ int s = TID_LOCAL;
+ while (s < middle)
+ {
+ tabResultSM[s] = tabResultSM[s] + tabResultSM[s + middle];
+ s += NB_THREAD_LOCAL;
+ }
+ }
+
+__device__ void reductionIntraBlock(double* tabResultSM)
+ {
+ const int TAB_SIZE = blockDim.x;
+ int middle = TAB_SIZE / 2;
+
+ while (middle > 0)
+ {
+ combine(tabResultSM, middle);
+ middle /= 2;
+ __syncthreads();
+ }
+ }
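+
+/*
+ * Note: this tree reduction assumes blockDim.x is a power of two (256
+ * here); with an odd intermediate size, the element at index 2 * middle
+ * would be dropped. The __syncthreads() after each pass makes the writes
+ * of one pass visible to the reads of the next.
+ */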
+
+__device__ void reductionInterBlock(double* tabResultSM, float* ptrResult)
+ {
+ const int TID_LOCAL = Indice1D::tidLocal();
+ if (TID_LOCAL == 0)
+ {
+ atomicAdd(ptrResult, float(tabResultSM[0]));
+ }
+ }
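+
+/*
+ * The result is accumulated in float because atomicAdd on double needs
+ * compute capability 6.0. On older hardware, a double accumulator can be
+ * emulated with the CAS loop from the CUDA C Programming Guide; a minimal
+ * sketch (not part of the original file, unused here):
+ */
+__device__ double atomicAddDouble(double* address, double val)
+ {
+ unsigned long long int* addressAsULL = (unsigned long long int*) address;
+ unsigned long long int old = *addressAsULL;
+ unsigned long long int assumed;
+ do
+ {
+ assumed = old;
+ // Reinterpret the bits, add in double, and swap only if unchanged.
+ old = atomicCAS(addressAsULL, assumed,
+ __double_as_longlong(val + __longlong_as_double(assumed)));
+ } while (assumed != old);
+ return __longlong_as_double(old);
+ }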
+
+/**
+ * The shared memory size (in number of doubles) must equal the
+ * block size.
+ * n: the size of the two vectors.
+ * ptrResult: the result of the dot product.
+ */
+__global__
+void produitScalaire(int n, float* ptrResult)
+ {
+ extern __shared__ double tabResultSM[]; // Shared memory.
+
+ // 1) Intra-thread reduction.
+ reductionIntraThread(n, tabResultSM);
+
+ __syncthreads(); // All slots of tabResultSM must be written before step 2 reads them.
+
+ // 2) Intra-block reduction.
+ reductionIntraBlock(tabResultSM);
+
+ // 3) Inter-block reduction.
+ reductionInterBlock(tabResultSM, ptrResult);
+ }
+
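+/*
+ * Since v(i) * w(i) ~= i, the exact value of the dot product is
+ * sum_{i=0}^{n-1} i = n * (n - 1) / 2.
+ */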
+double resultatTheorique(long n)
+ {
+ n -= 1;
+ return (n / 2.0) * (n + 1);
+ }
+
+bool produitScalaire()
+ {
+ const int N = 100000000; // Size of the two vectors.
+
+ // GPU-side allocation in global memory (GM).
+ float* ptrDevResult = 0;
+ HANDLE_ERROR(cudaMalloc(&ptrDevResult, sizeof(float)));
+ HANDLE_ERROR(cudaMemset(ptrDevResult, 0, sizeof(float)));
+
+ // Launch configuration for the kernel call on the device.
+ const dim3 dg(256, 1, 1);
+ const dim3 db(256, 1, 1);
+ Device::assertDim(dg, db);
+ const size_t SMSize = db.x * sizeof(double); // One double per thread in the block (256).
+
+ produitScalaire<<<dg, db, SMSize>>>(N, ptrDevResult);
+
+ float res;
+ // Implicit synchronization barrier ('cudaMemcpy').
+ HANDLE_ERROR(cudaMemcpy(&res, ptrDevResult, sizeof(float), cudaMemcpyDeviceToHost));
+
+ double resTheo = resultatTheorique(N);
+
+ cout.precision(10);
+ cout << "Résultat : " << res << endl;
+ cout << "Résultat théorique : " << resTheo << endl;
+ cout << "Différence absolue : " << resTheo - res << endl;
+ cout << "Différence relatif : " << 100 * (resTheo - res) / (resTheo + res) << " %" << endl;
+
+ return true; // Correctness is assessed from the printed differences, not computed here.
+ }
#include <iostream>
#include <stdlib.h>
+using namespace std;
-
-using std::cout;
-using std::endl;
-
-/*----------------------------------------------------------------------*\
- |* Declaration *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Imported *|
- \*-------------------------------------*/
-
-extern bool useHello(void);
+extern bool useHello();
extern bool addVectors();
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
+extern bool produitScalaire();
int mainCore();
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-
-
-/*----------------------------------------------------------------------*\
- |* Implementation *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
-
int mainCore()
{
bool isOk = true;
- isOk &= useHello();
- isOk &= addVectors();
+ /*isOk &= useHello();
+ isOk &= addVectors();*/
+ isOk &= produitScalaire();
cout << "\nisOK = " << isOk << endl;
cout << "\nEnd : mainCore" << endl;
return isOk ? EXIT_SUCCESS : EXIT_FAILURE;
}
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-
-
-/*----------------------------------------------------------------------*\
- |* End *|
- \*---------------------------------------------------------------------*/
-
#include "cudaTools.h"
#include "Device.h"
-#include "LimitsTools.h"
+#include "LimitsTools.h"
using std::cout;
using std::endl;
if (Device::isCuda())
{
- Device::printAll();
+ Device::printAll();
Device::printAllSimple();
// Server Cuda1: in [0,5]
bool testALL()
{
- int deviceId=Device::getDeviceId();
+ int deviceId = Device::getDeviceId();
Suite testSuite;
- testSuite.add(std::auto_ptr < Suite > (new TestHello(deviceId)));
+ testSuite.add(std::auto_ptr<Suite>(new TestHello(deviceId)));
string titre = "deviceId_" + StringTools::toString(deviceId);
using std::cout;
using std::endl;

-/*----------------------------------------------------------------------*\
- |* Declaration *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Imported *|
- \*-------------------------------------*/
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
-
__global__ void fractalMandelbrot(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n);
__global__ void fractalJulia(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n, float c_r, float c_i);
__device__ void fractal(uchar4* ptrDevPixels, int w, int h, const DomaineMath& domaineMath, int n, const FractalMath& fractalMath);

-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-/*----------------------------------------------------------------------*\
- |* Implementation *|
- \*---------------------------------------------------------------------*/
-
-/*--------------------------------------*\
- |* Public *|
- \*-------------------------------------*/
-
-/*--------------------------------------*\
- |* Private *|
- \*-------------------------------------*/
-
-
__global__ void fractalMandelbrot(uchar4* ptrDevPixels, int w, int h, DomaineMath domaineMath, int n)
{
FractalMandelbrotMath fractalMath(n);
{
IndiceTools::toIJ(s, w, &pixelI, &pixelJ); // update (pixelI, pixelJ)

- // (i,j) domaine ecran
+ // (i,j) screen domain
// (x,y) math domain
domaineMath.toXY(pixelI, pixelJ, &x, &y); // (i,j) -> (x,y)

s += NB_THREAD;
}
}
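+// Note: this is the same grid-stride pattern as in produitScalaire: the
+// linear index s is mapped to the pixel (i,j) by IndiceTools::toIJ, then
+// to the mathematical plane by DomaineMath::toXY.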
-
-/*----------------------------------------------------------------------*\
- |* End *|
- \*---------------------------------------------------------------------*/
-