1 #ifndef REDUCTION_ADD_LOCK_H_
2 #define REDUCTION_ADD_LOCK_H_
/*----------------------------------------------------------------------*\
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
\*-------------------------------------*/

/*--------------------------------------*\
\*-------------------------------------*/

/**
 * Full GPU sum reduction: tree reduction inside each block followed by a
 * lock-protected accumulation of every block's partial sum into
 * *ptrDevResultat.
 *
 * Hypotheses:
 *  (H1) atomicAdd does not exist for T; otherwise use the version in reductionADD.h
 *  (H2) tabBlock size is a power of 2
 *  (H3) tabBlock is already filled with data
 *
 * Usage:
 *
 *  // global variable in the .cu
 *  __device__ int mutex = 0; // beware: must be initialised to 0
 *
 *  // local variable inside the kernel (same .cu as the variable mutex)
 *  LockGPU lock = LockGPU(&mutex);
 */
template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat, Lock* ptrlock);
/*--------------------------------------*\
\*-------------------------------------*/

// Tree reduction of tabBlock inside one block; the block's sum ends up in
// tabBlock[0].
template <typename T>
static __device__ void reductionIntraBlock(T* tabBlock);

// Thread 0 of each block accumulates tabBlock[0] into *ptrDevResultat,
// serialised by ptrlock (atomicAdd is unavailable for T, see H1).
template <typename T>
static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat, Lock* ptrlock);
/*----------------------------------------------------------------------*\
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
\*-------------------------------------*/

/**
 * Full reduction entry point: intra-block tree reduction, then inter-block
 * lock-protected accumulation into *ptrDevResultat.
 *
 * No __syncthreads() is needed between the two phases: only thread 0 reads
 * tabBlock[0] in reductionInterblock, and thread 0 itself wrote that cell
 * last in reductionIntraBlock.
 */
template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat, Lock* ptrlock)
    {
    reductionIntraBlock(tabBlock);
    reductionInterblock(tabBlock, ptrDevResultat, ptrlock);
    }
/*--------------------------------------*\
\*-------------------------------------*/

/**
 * Intra-block tree reduction: sums the blockDim.x cells of tabBlock into
 * tabBlock[0].
 *
 * Hypotheses:
 *  (H1) length(tabBlock) is a power of 2, i.e. blockDim.x is a power of 2
 *  (H2) grid and block of dim 1
 *  (H3) T is a simple type supported by atomicAdd
 *       NOTE(review): (H3) contradicts (H1) of reductionADD; atomicAdd is
 *       not used here anyway -- confirm which hypothesis is intended.
 *
 * The warp-unrolled tail reads tabBlock[tid + 32], so blockDim.x must be
 * at least 64 -- TODO confirm this precondition with the callers.
 */
template <typename T>
static __device__ void reductionIntraBlock(T* tabBlock)
    {
    int midle = blockDim.x / 2;
    int tidLocal = threadIdx.x;

    // Classic halving loop. At 64 remaining cells we stop dividing: the
    // last steps are handled by the 32 threads of a single warp below,
    // without any further __syncthreads().
    while (midle > 32)
        {
        if (tidLocal < midle)
            {
            tabBlock[tidLocal] += tabBlock[tidLocal + midle];
            }
        __syncthreads(); // all writes of this level visible before the next
        midle /= 2;
        }

    // Use the 32 threads of one warp to finish the reduction.
    // No __syncthreads() is issued between the following lines: the data is
    // accessed through a pointer declared volatile, relying on the 32
    // threads of a warp executing in lock-step.
    // NOTE(review): implicit warp-synchronous execution is no longer
    // guaranteed on Volta+ (SM70+); __syncwarp() between steps would be
    // required there -- confirm the target compute capability.
    if (tidLocal < 32)
        {
        volatile T* ptrData = tabBlock;
        ptrData[tidLocal] += ptrData[tidLocal + 32];
        ptrData[tidLocal] += ptrData[tidLocal + 16];
        ptrData[tidLocal] += ptrData[tidLocal + 8];
        ptrData[tidLocal] += ptrData[tidLocal + 4];
        ptrData[tidLocal] += ptrData[tidLocal + 2];
        ptrData[tidLocal] += ptrData[tidLocal + 1];
        }
    }
/**
 * Inter-block accumulation: thread 0 of each block adds its block's partial
 * sum tabBlock[0] into *ptrDevResultat.
 *
 * Hyp: *ptrDevResultat initialised to 0!
 */
template <typename T>
static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat, Lock* ptrlock)
    {
    if (threadIdx.x == 0)
        {
        // atomicAdd is unavailable for T (H1): the read-modify-write on
        // *ptrDevResultat is raced by thread 0 of every block, so it must
        // be serialised with the mutex.
        // NOTE(review): assumes Lock exposes lock()/unlock() -- confirm
        // against the Lock class declaration.
        ptrlock->lock();
        *ptrDevResultat += tabBlock[0];
        ptrlock->unlock();
        }
    }
153 /*----------------------------------------------------------------------*\
155 \*---------------------------------------------------------------------*/