#ifndef REDUCTION_ADD_H_
#define REDUCTION_ADD_H_

/*----------------------------------------------------------------------*\
|*                          Declaration                                  *|
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
|*             Imported                 *|
\*-------------------------------------*/

#include "Indice1D.h"

/*--------------------------------------*\
|*              Public                  *|
\*-------------------------------------*/

/**
 * Hyp:
 * (H1) atomicAdd must exist for T; otherwise use the version in reductionADDLock.h
 * (H2) tabBlock size is a power of 2
 * (H3) tabBlock is already filled with data
 */
template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat);

/*--------------------------------------*\
|*              Private                 *|
\*-------------------------------------*/

template <typename T>
static __device__ void reductionIntraBlock(T* tabBlock);

template <typename T>
static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat);

/*----------------------------------------------------------------------*\
|*                          Implementation                               *|
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
|*              Public                  *|
\*-------------------------------------*/

template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat)
{
    // Reduce tabBlock[0..blockDim.x-1] into tabBlock[0]
    reductionIntraBlock(tabBlock);

    __syncthreads();

    // Thread 0 adds the block result into *ptrDevResultat
    reductionInterblock(tabBlock, ptrDevResultat);
}
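
// Usage sketch (illustrative addition, not part of the original file): a
// hypothetical kernel that sums n values into *ptrDevResultat. The names
// kernelSumExample and ptrDevTab are assumptions for the example. Shared
// memory must be sized to blockDim.x elements at launch, blockDim.x must be
// a power of 2 (H2), and *ptrDevResultat must be zeroed beforehand.
//
//     __global__ void kernelSumExample(int* ptrDevTab, int n, int* ptrDevResultat)
//     {
//         extern __shared__ int tabBlock[];
//
//         // Each thread accumulates its slice of the input (grid-stride loop).
//         const int NB_THREAD = gridDim.x * blockDim.x;
//         const int TID = blockIdx.x * blockDim.x + threadIdx.x;
//         int sum = 0;
//         for (int i = TID; i < n; i += NB_THREAD)
//         {
//             sum += ptrDevTab[i];
//         }
//
//         tabBlock[threadIdx.x] = sum;        // (H3) tabBlock filled with data
//         __syncthreads();
//
//         reductionADD<int>(tabBlock, ptrDevResultat);
//     }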

/*--------------------------------------*\
|*              Private                 *|
\*-------------------------------------*/

/**
 * Hyp:
 * (H1) length(tabBlock) is a power of 2, i.e. blockDim.x is a power of 2
 * (H2) grid and block are 1D
 * (H3) T is a simple type supported by atomicAdd
 */
template <typename T>
__device__ void reductionIntraBlock(T* tabBlock)
{
    // v1: fully synchronized tree reduction
    //    int midle = blockDim.x / 2;
    //    int tidLocal = threadIdx.x;
    //
    //    // int tidLocal = Indice1D::tidLocal()
    //
    //    while (midle >= 1)
    //    {
    //        if (tidLocal < midle)
    //        {
    //            tabBlock[tidLocal] += tabBlock[tidLocal + midle];
    //        }
    //
    //        __syncthreads();
    //
    //        // midle /= 2;
    //        midle >>= 1;
    //    }

    // v2: tree reduction with the last steps unrolled inside a single warp
    int midle = blockDim.x / 2;
    int tidLocal = threadIdx.x;

    // Stop halving at midle == 64: the 32 threads of one warp are enough to
    // finish reducing the first 64 cells below (this assumes blockDim.x >= 64).
    while (midle >= 64)
    {
        if (tidLocal < midle)
        {
            tabBlock[tidLocal] += tabBlock[tidLocal + midle];
        }

        __syncthreads();

        // midle /= 2;
        midle >>= 1;
    }

    // Use the 32 threads of a single warp to finish the reduction
    if (tidLocal < 32)
    {
        // No __syncthreads() is needed after each of the following lines as long
        // as the data is accessed through a pointer declared volatile, because
        // the 32 threads of a warp execute in lock-step with each other
        // (this relies on lock-step warp execution, i.e. pre-independent-thread-scheduling behaviour).
        volatile T* ptrData = tabBlock;

        ptrData[tidLocal] += ptrData[tidLocal + 32];
        ptrData[tidLocal] += ptrData[tidLocal + 16];
        ptrData[tidLocal] += ptrData[tidLocal + 8];
        ptrData[tidLocal] += ptrData[tidLocal + 4];
        ptrData[tidLocal] += ptrData[tidLocal + 2];
        ptrData[tidLocal] += ptrData[tidLocal + 1];
    }
}
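
// Worked trace (illustrative addition): with blockDim.x == 256 (an assumption
// for the example), reductionIntraBlock proceeds as follows:
//   midle = 128 : threads 0..127 fold tabBlock[128..255] into tabBlock[0..127]
//   midle =  64 : threads 0..63  fold tabBlock[64..127]  into tabBlock[0..63]
//   loop exits (midle == 32 < 64); the 32 threads of one warp then fold
//   64 -> 32 -> 16 -> 8 -> 4 -> 2 -> 1 cells, leaving the block sum in tabBlock[0].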

/**
 * Hyp: ptrDevResultat initialized to 0!
 */
template <typename T>
__device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat)
{
    if (threadIdx.x == 0)
    {
        atomicAdd(ptrDevResultat, tabBlock[0]); // as many atomic accesses as there are blocks
    }
}
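
// Host-side sketch (illustrative addition, not part of the original file):
// *ptrDevResultat must be zeroed before the launch, and shared memory must be
// sized to blockDim.x elements. The names resultDev, tabDev, n, dg, db and
// kernelSumExample are assumptions for the example.
//
//     int* resultDev;
//     cudaMalloc(&resultDev, sizeof(int));
//     cudaMemset(resultDev, 0, sizeof(int));      // Hyp of reductionInterblock
//
//     dim3 dg(256, 1, 1);                         // 1D grid (H2)
//     dim3 db(1024, 1, 1);                        // blockDim.x is a power of 2 (H1)
//     size_t sharedSize = db.x * sizeof(int);     // one shared cell per thread
//     kernelSumExample<<<dg, db, sharedSize>>>(tabDev, n, resultDev);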

#endif

/*----------------------------------------------------------------------*\
|*                              End                                      *|
\*---------------------------------------------------------------------*/