#ifndef REDUCTION_ADD_H_
#define REDUCTION_ADD_H_
/*----------------------------------------------------------------------*\
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
\*-------------------------------------*/
/**
 * Sums the blockDim.x values of tabBlock and atomically accumulates each
 * block's partial sum into *ptrDevResultat (one atomicAdd per block).
 *
 * Hypotheses:
 *  (H1) atomicAdd must exist for T; otherwise use the version in reductionADDLock.h
 *  (H2) tabBlock size is a power of 2
 *  (H3) tabBlock is already filled with data
 */
template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat);
/*--------------------------------------*\
\*-------------------------------------*/
/**
 * Intra-block tree reduction of tabBlock into tabBlock[0].
 */
template <typename T>
static __device__ void reductionIntraBlock(T* tabBlock);
/**
 * Inter-block accumulation: thread 0 adds tabBlock[0] into *ptrDevResultat.
 */
template <typename T>
static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat);
/*----------------------------------------------------------------------*\
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
\*-------------------------------------*/
/**
 * Sums the blockDim.x values of tabBlock and atomically accumulates each
 * block's partial sum into *ptrDevResultat (one atomicAdd per block).
 *
 * Two phases:
 *  1) intra-block: tree reduction of tabBlock into tabBlock[0];
 *  2) inter-block: thread 0 of each block atomicAdds tabBlock[0]
 *     into *ptrDevResultat.
 *
 * @param tabBlock       per-block workspace, already filled with data
 *                       (presumably shared memory — TODO confirm with callers)
 * @param ptrDevResultat device accumulator, must be initialised to 0
 *                       before the kernel launch
 */
template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat)
    {
    reductionIntraBlock(tabBlock);
    reductionInterblock(tabBlock, ptrDevResultat);
    }
/*--------------------------------------*\
\*-------------------------------------*/
/**
 * Intra-block tree reduction: sums the blockDim.x values of tabBlock
 * into tabBlock[0].
 *
 * Hypotheses:
 *  (H1) length(tabBlock) is a power of 2, i.e. blockDim.x is a power of 2
 *       (NOTE(review): the warp phase below reads tabBlock[tid + 32], so
 *       blockDim.x must also be >= 64 — confirm with callers)
 *  (H2) grid and block are 1D
 *  (H3) T is a simple type supported by atomicAdd
 */
template <typename T>
__device__ void reductionIntraBlock(T* tabBlock)
    {
    int midle = blockDim.x / 2;
    int tidLocal = threadIdx.x;

    // Caller fills tabBlock (H3); make those writes visible to every thread
    // before the reduction starts (also covers blockDim.x == 64, where the
    // loop below never runs).
    __syncthreads();

    // Tree reduction: halve the active range each pass. At 64 remaining
    // cells we stop dividing — the 32 threads of one warp finish reducing
    // the first 64 cells in the warp phase below.
    while (midle >= 64)
        {
        if (tidLocal < midle)
            {
            tabBlock[tidLocal] += tabBlock[tidLocal + midle];
            }
        // Barrier deliberately outside the divergent branch: every thread
        // of the block must reach it.
        __syncthreads();
        midle /= 2;
        }

    // Warp phase: the 32 threads of warp 0 finish the reduction.
    // volatile forces each partial sum to be re-read from memory instead of
    // being cached in registers. The historical idiom relied on the 32
    // threads of a warp executing in lockstep; since SM70+ (independent
    // thread scheduling) that is no longer guaranteed, so an explicit
    // __syncwarp() is required between dependent steps.
    if (tidLocal < 32)
        {
        volatile T* ptrData = tabBlock;
        ptrData[tidLocal] += ptrData[tidLocal + 32];
        __syncwarp();
        ptrData[tidLocal] += ptrData[tidLocal + 16];
        __syncwarp();
        ptrData[tidLocal] += ptrData[tidLocal + 8];
        __syncwarp();
        ptrData[tidLocal] += ptrData[tidLocal + 4];
        __syncwarp();
        ptrData[tidLocal] += ptrData[tidLocal + 2];
        __syncwarp();
        ptrData[tidLocal] += ptrData[tidLocal + 1];
        }
    }
/**
 * Inter-block accumulation: thread 0 of each block adds the block's partial
 * sum tabBlock[0] into the global accumulator *ptrDevResultat.
 *
 * Hyp: *ptrDevResultat initialised to 0 before the kernel launch!
 */
template <typename T>
__device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat)
    {
    if (threadIdx.x == 0)
        {
        atomicAdd(ptrDevResultat, tabBlock[0]); // as many accesses as there are blocks
        }
    }
/*----------------------------------------------------------------------*\
\*---------------------------------------------------------------------*/