X-Git-Url: http://git.euphorik.ch/index.cgi?a=blobdiff_plain;f=WCudaMSE%2FBilatTools_Cuda%2Fsrc%2Fcore%2Fcudatools%2Fheader%2Fdevice%2Freduction%2FreductionADD.h;fp=WCudaMSE%2FBilatTools_Cuda%2Fsrc%2Fcore%2Fcudatools%2Fheader%2Fdevice%2Freduction%2FreductionADD.h;h=fb9eba76ca989befc727d1f3c68ba74f6df0332c;hb=8d08c12b29c2a14684f35c023ee39e694bb80d25;hp=0000000000000000000000000000000000000000;hpb=226de81f7e1f1fbf4ac79d0d089e8a05ec7159a0;p=GPU.git diff --git a/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionADD.h b/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionADD.h new file mode 100755 index 0000000..fb9eba7 --- /dev/null +++ b/WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionADD.h @@ -0,0 +1,142 @@ +#ifndef REDUCTION_ADD_H_ +#define REDUCTION_ADD_H_ + +/*----------------------------------------------------------------------*\ + |* Declaration *| + \*---------------------------------------------------------------------*/ + +/*--------------------------------------*\ + |* Imported *| + \*-------------------------------------*/ + +#include "Indice1D.h" + +/*--------------------------------------*\ + |* Public *| + \*-------------------------------------*/ + +/** + * Hyp + * (H1) atomicAdd doit exister pour T , sinon utiliser la version in reductionADDLock.h + * (H2) tabBlock size is a power of 2 + * (H3) tabBlock is already fill with data + */ +template +__device__ void reductionADD(T* tabBlock, T* ptrDevResultat); + +/*--------------------------------------*\ + |* Private *| + \*-------------------------------------*/ + +template +static __device__ void reductionIntraBlock(T* tabBlock); + +template +static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat); + +/*----------------------------------------------------------------------*\ + |* Implementation *| + \*---------------------------------------------------------------------*/ + +/*--------------------------------------*\ + |* Public *| + \*-------------------------------------*/ + +template +__device__ void reductionADD(T* tabBlock, T* ptrDevResultat) + { + reductionIntraBlock(tabBlock); + + __syncthreads(); + + reductionInterblock(tabBlock, ptrDevResultat); + } + +/*--------------------------------------*\ + |* Private *| + \*-------------------------------------*/ + +/** + * Hyp : + * (H1) length(tabBlock) est une puissance de 2 ! ie blockDim.x est une puissance de 2 + * (H2) gride et Block de dim 1 + * (H3) T est un type simple suppoter par atomicAdd + * + */ +template +__device__ void reductionIntraBlock(T* tabBlock) + { + + //v1 +// int midle = blockDim.x / 2; +// int tidLocal = threadIdx.x; +// +// // int tidLocal = Indice1D::tidLocal() +// +// while (midle >= 1) +// { +// +// if (tidLocal < midle) +// { +// tabBlock[tidLocal] += tabBlock[tidLocal + midle]; +// } +// +// __syncthreads(); +// +// //midle /= 2; +// midle>>=1; +// } + +//v2 + int midle = blockDim.x / 2; + int tidLocal = threadIdx.x; + + //a 64 on ne divise plus et on a besoin de 32 thread pour finir de reduire le 64 premières cases + while (midle >= 64) + { + + if (tidLocal < midle) + { + tabBlock[tidLocal] += tabBlock[tidLocal + midle]; + } + + __syncthreads(); + + //midle /= 2; + midle>>=1; + } + + // Utilisation des 32 thread d'un warp pour finir la reduction + if(tidLocal<32) + { + // no __syncthreads() necessary after exah of the following lines as long as we acces the data via a pointzer decalred as volatile + // because teh 32 therad in each warp execute in a locked-step with each other + volatile T* ptrData=tabBlock; + + ptrData[tidLocal]+=ptrData[tidLocal+32]; + ptrData[tidLocal]+=ptrData[tidLocal+16]; + ptrData[tidLocal]+=ptrData[tidLocal+8]; + ptrData[tidLocal]+=ptrData[tidLocal+4]; + ptrData[tidLocal]+=ptrData[tidLocal+2]; + ptrData[tidLocal]+=ptrData[tidLocal+1]; + } + + } + +/** + * Hyp : ptrDevResultat iniotaiuliasé avec 0 !! + */ +template +__device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat) + { + if (threadIdx.x == 0) + { + atomicAdd(ptrDevResultat, tabBlock[0]); // autant d'acces que de block + } + } + +#endif + +/*----------------------------------------------------------------------*\ + |* End *| + \*---------------------------------------------------------------------*/