Ajout de l'ensemble du workspace.

[GPU.git] / WCudaMSE / BilatTools_Cuda / src / core / cudatools / header / device / reduction / reductionADD.h
diff --git a/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionADD.h b/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionADD.h

new file mode 100755 (executable)

index 0000000..fb9eba7
--- /dev/null
+++ b/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionADD.h
@@ -0,0 +1,142 @@
+#ifndef REDUCTION_ADD_H_\r
+#define REDUCTION_ADD_H_\r
+\r
+/*----------------------------------------------------------------------*\\r
+ |*                    Declaration                                     *|\r
+ \*---------------------------------------------------------------------*/\r
+\r
+/*--------------------------------------*\\r
+ |*            Imported                *|\r
+ \*-------------------------------------*/\r
+\r
+#include "Indice1D.h"\r
+\r
+/*--------------------------------------*\\r
+ |*            Public                  *|\r
+ \*-------------------------------------*/\r
+\r
+/**\r
+ * Hyp\r
+ *     (H1) atomicAdd must exist for T; otherwise use the version in reductionADDLock.h\r
+ *     (H2) tabBlock size is a power of 2\r
+ *     (H3) tabBlock is already filled with data\r
+ */\r
+template <typename T>\r
+__device__ void reductionADD(T* tabBlock, T* ptrDevResultat);\r
+\r
+/*--------------------------------------*\\r
+ |*            Private                 *|\r
+ \*-------------------------------------*/\r
+\r
+template <typename T>\r
+static __device__ void reductionIntraBlock(T* tabBlock);\r
+\r
+template <typename T>\r
+static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat);\r
+\r
+/*----------------------------------------------------------------------*\\r
+ |*                    Implementation                                  *|\r
+ \*---------------------------------------------------------------------*/\r
+\r
+/*--------------------------------------*\\r
+ |*            Public                  *|\r
+ \*-------------------------------------*/\r
+\r
+/**\r
+ * Two-stage sum: folds tabBlock inside the block, then adds the block's partial sum\r
+ * to *ptrDevResultat.\r
+ */\r
+template <typename T>\r
+__device__ void reductionADD(T* tabBlock, T* ptrDevResultat)\r
+    {\r
+    // Stage 1 : intra-block fold, the block's partial sum ends up in tabBlock[0]\r
+    reductionIntraBlock<T>(tabBlock);\r
+\r
+    // Block-wide barrier between the two stages\r
+    __syncthreads();\r
+\r
+    // Stage 2 : the block's partial sum is accumulated into the result\r
+    reductionInterblock<T>(tabBlock, ptrDevResultat);\r
+    }\r
+\r
+/*--------------------------------------*\\r
+ |*            Private                 *|\r
+ \*-------------------------------------*/\r
+\r
+/**\r
+ * Sums tabBlock[0 .. blockDim.x-1] in place. On return the total is in tabBlock[0] and is\r
+ * visible to every thread of the block (the last step ends with a barrier). The other\r
+ * entries of tabBlock are left holding intermediate partial sums.\r
+ *\r
+ * Hyp :\r
+ *     (H1)    tabBlock holds at least blockDim.x elements, and blockDim.x is a power of 2 (any power of 2)\r
+ *     (H2)    the block is 1-dimensional : only threadIdx.x and blockDim.x are read\r
+ *     (H3)    T supports operator+=\r
+ *     (H4)    every thread of the block calls this function, because it executes __syncthreads()\r
+ *     (H5)    no barrier is issued before the first read : the caller must already have synchronized\r
+ *             the block after filling tabBlock\r
+ *\r
+ * Note :\r
+ *     A former variant stopped the barrier loop at 64 elements and finished the reduction inside a\r
+ *     single warp, without any barrier, through a volatile pointer. That variant assumed that the 32\r
+ *     threads of a warp run in lock-step, which is not guaranteed on GPUs with independent thread\r
+ *     scheduling, and it read tabBlock[tidLocal + 32] even when blockDim.x < 64 (out of range).\r
+ *     The barrier-synchronized loop below is valid in both situations and yields the same tabBlock[0].\r
+ */\r
+template <typename T>\r
+__device__ void reductionIntraBlock(T* tabBlock)\r
+    {\r
+    const int tidLocal = threadIdx.x;\r
+\r
+    // Tree reduction : at each step the lower half of the active range absorbs the upper half,\r
+    // then the active range is halved. For blockDim.x == 1 the loop body is never executed.\r
+    for (int midle = blockDim.x >> 1; midle >= 1; midle >>= 1)\r
+       {\r
+       if (tidLocal < midle)\r
+           {\r
+           tabBlock[tidLocal] += tabBlock[tidLocal + midle];\r
+           }\r
+\r
+       // Kept outside the if : all the threads of the block must reach the barrier, and the\r
+       // writes of this step must be visible before the next step reads them.\r
+       __syncthreads();\r
+       }\r
+    }\r
+\r
+/**\r
+ * Adds the value read from tabBlock[0] to the accumulator *ptrDevResultat.\r
+ *\r
+ * Hyp : ptrDevResultat must have been initialized to 0 beforehand !!\r
+ */\r
+template <typename T>\r
+__device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat)\r
+    {\r
+    // Only the thread whose threadIdx.x is 0 contributes; every other thread leaves right away.\r
+    if (threadIdx.x != 0)\r
+       {\r
+       return;\r
+       }\r
+\r
+    // One atomicAdd per calling block (with 1-dimensional blocks) : as many accesses as blocks\r
+    atomicAdd(ptrDevResultat, tabBlock[0]);\r
+    }\r
+\r
+#endif \r
+\r
+/*----------------------------------------------------------------------*\\r
+ |*                    End                                             *|\r
+ \*---------------------------------------------------------------------*/\r