+#ifndef REDUCTION_ADD_H_\r
+#define REDUCTION_ADD_H_\r
+\r
+/*----------------------------------------------------------------------*\\r
+ |* Declaration *|\r
+ \*---------------------------------------------------------------------*/\r
+\r
+/*--------------------------------------*\\r
+ |* Imported *|\r
+ \*-------------------------------------*/\r
+\r
+#include "Indice1D.h"\r
+\r
+/*--------------------------------------*\\r
+ |* Public *|\r
+ \*-------------------------------------*/\r
+\r
+/**
+ * Sums the content of tabBlock inside the block, then adds the block total to *ptrDevResultat.
+ *
+ * Hyp
+ *  (H1) atomicAdd must exist for T, otherwise use the version in reductionADDLock.h
+ *  (H2) tabBlock size is a power of 2
+ *  (H3) tabBlock is already filled with data
+ */
+template <typename T>
+__device__ void reductionADD(T* tabBlock, T* ptrDevResultat);
+\r
+/*--------------------------------------*\\r
+ |* Private *|\r
+ \*-------------------------------------*/\r
+\r
+/**
+ * In-place pairwise summation of tabBlock by the threads of one block.
+ * The block total is read afterwards from tabBlock[0] by reductionInterblock.
+ */
+template <typename T>
+static __device__ void reductionIntraBlock(T* tabBlock);
+\r
+/**
+ * The thread with threadIdx.x == 0 adds tabBlock[0] to *ptrDevResultat with atomicAdd.
+ */
+template <typename T>
+static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat);
+\r
+/*----------------------------------------------------------------------*\\r
+ |* Implementation *|\r
+ \*---------------------------------------------------------------------*/\r
+\r
+/*--------------------------------------*\\r
+ |* Public *|\r
+ \*-------------------------------------*/\r
+\r
+/**
+ * Entry point of the additive reduction, to be called by every thread of the block.
+ *
+ *  1) reductionIntraBlock : the threads of the block sum tabBlock in place
+ *  2) __syncthreads       : block-wide barrier between the two steps
+ *  3) reductionInterblock : the thread with threadIdx.x == 0 adds tabBlock[0]
+ *                           to *ptrDevResultat with atomicAdd
+ *
+ * tabBlock       : per-block array holding the values to sum, overwritten with partial sums
+ * ptrDevResultat : accumulator receiving the contribution of each block
+ */
+template <typename T>
+__device__ void reductionADD(T* tabBlock, T* ptrDevResultat)
+    {
+    // Step 1 : partial sums inside the block
+    reductionIntraBlock<T>(tabBlock);
+
+    // Step 2 : every thread of the block waits here before the block total is read
+    __syncthreads();
+
+    // Step 3 : contribution of this block to the final result
+    reductionInterblock<T>(tabBlock, ptrDevResultat);
+    }
+\r
+/*--------------------------------------*\\r
+ |* Private *|\r
+ \*-------------------------------------*/\r
+\r
+/**
+ * In-place sum reduction of tabBlock by the threads of one block.
+ * On return, tabBlock[0] holds the sum of the blockDim.x entries of tabBlock; the other
+ * entries hold partial sums. Threads outside the first warp must go through a
+ * __syncthreads() before reading tabBlock[0].
+ *
+ * Hyp :
+ *  (H1) length(tabBlock) == blockDim.x, and blockDim.x is a power of 2
+ *       (any power of 2 : blocks smaller than 64 threads are supported too)
+ *  (H2) grid and block are one-dimensional
+ *  (H3) T is a simple type that can be copied and supports += (atomicAdd is only needed
+ *       later, by the inter-block step)
+ *  (H4) every entry of tabBlock has been written and synchronized (__syncthreads) before
+ *       the call : no barrier is issued here before the first read
+ *
+ * Must be called by all the threads of the block (it contains __syncthreads).
+ * Requires CUDA 9 or later (__syncwarp).
+ */
+template <typename T>
+__device__ void reductionIntraBlock(T* tabBlock)
+    {
+    const int tid = threadIdx.x;
+    int stride = blockDim.x >> 1;
+
+    // Stage 1 : while more than one warp is needed (stride >= 64), each step folds the upper
+    // half [stride, 2*stride) onto the lower half [0, stride), with a block-wide barrier
+    // between steps. stride is the same for every thread, so the barrier is never divergent.
+    while (stride >= 64)
+        {
+        if (tid < stride)
+            {
+            tabBlock[tid] += tabBlock[tid + stride];
+            }
+
+        __syncthreads();
+
+        stride >>= 1;
+        }
+
+    // Stage 2 : at most 64 entries are left (stride <= 32), the first warp finishes alone.
+    // Lock-step execution of the lanes of a warp is not guaranteed, so the read phase and
+    // the write phase of every step are separated explicitly with __syncwarp(), which also
+    // orders the memory accesses of the participating lanes.
+    if (tid < 32)
+        {
+        for (; stride >= 1; stride >>= 1)
+            {
+            // Only the lanes below stride own a pair to add : this keeps every access
+            // inside [0, 2*stride), hence inside tabBlock, whatever the block size.
+            const bool isActive = tid < stride;
+
+            T sum = tabBlock[tid];
+            if (isActive)
+                {
+                sum += tabBlock[tid + stride];
+                }
+
+            // All the reads of this step are done before any lane overwrites its entry.
+            __syncwarp();
+
+            if (isActive)
+                {
+                tabBlock[tid] = sum;
+                }
+
+            // The writes of this step are visible to the warp before the next step reads them.
+            __syncwarp();
+            }
+        }
+    }
+\r
+/**
+ * Adds the total of the block, read from tabBlock[0], to *ptrDevResultat.
+ *
+ * Hyp : *ptrDevResultat is initialized with 0 !!
+ */
+template <typename T>
+__device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat)
+    {
+    // Only the thread with threadIdx.x == 0 contributes, the others have nothing to do
+    if (threadIdx.x != 0)
+        {
+        return;
+        }
+
+    // As many atomic accesses as blocks (one-dimensional blocks)
+    const T sumBlock = tabBlock[0];
+    atomicAdd(ptrDevResultat, sumBlock);
+    }
+\r
+#endif \r
+\r
+/*----------------------------------------------------------------------*\\r
+ |* End *|\r
+ \*---------------------------------------------------------------------*/\r