#ifndef REDUCTION_ADD_H_
#define REDUCTION_ADD_H_

/*----------------------------------------------------------------------*\
|*                          Declaration                                  *|
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
|*             Imported                 *|
\*-------------------------------------*/

#include "Indice1D.h"

/*--------------------------------------*\
|*              Public                  *|
\*-------------------------------------*/

/**
 * Hyp:
 * (H1) atomicAdd must exist for T; otherwise use the version in reductionADDLock.h
 * (H2) tabBlock size is a power of 2
 * (H3) tabBlock is already filled with data
 */
template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat);

/*--------------------------------------*\
|*              Private                 *|
\*-------------------------------------*/

template <typename T>
static __device__ void reductionIntraBlock(T* tabBlock);

template <typename T>
static __device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat);

/*----------------------------------------------------------------------*\
|*                          Implementation                               *|
\*---------------------------------------------------------------------*/

/*--------------------------------------*\
|*              Public                  *|
\*-------------------------------------*/

template <typename T>
__device__ void reductionADD(T* tabBlock, T* ptrDevResultat)
{
    // Reduce tabBlock[0..blockDim.x-1] into tabBlock[0]
    reductionIntraBlock(tabBlock);

    __syncthreads();

    // Thread 0 adds the block result into *ptrDevResultat
    reductionInterblock(tabBlock, ptrDevResultat);
}
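
// Usage sketch (illustrative addition, not part of the original file): a
// hypothetical kernel that sums n values into *ptrDevResultat. The names
// kernelSumExample and ptrDevTab are assumptions for the example. Shared
// memory must be sized to blockDim.x elements at launch, blockDim.x must be
// a power of 2 (H2), and *ptrDevResultat must be zeroed beforehand.
//
//     __global__ void kernelSumExample(int* ptrDevTab, int n, int* ptrDevResultat)
//     {
//         extern __shared__ int tabBlock[];
//
//         // Each thread accumulates its slice of the input (grid-stride loop).
//         const int NB_THREAD = gridDim.x * blockDim.x;
//         const int TID = blockIdx.x * blockDim.x + threadIdx.x;
//         int sum = 0;
//         for (int i = TID; i < n; i += NB_THREAD)
//         {
//             sum += ptrDevTab[i];
//         }
//
//         tabBlock[threadIdx.x] = sum;        // (H3) tabBlock filled with data
//         __syncthreads();
//
//         reductionADD<int>(tabBlock, ptrDevResultat);
//     }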

/*--------------------------------------*\
|*              Private                 *|
\*-------------------------------------*/

/**
 * Hyp:
 * (H1) length(tabBlock) is a power of 2, i.e. blockDim.x is a power of 2
 * (H2) grid and block are 1D
 * (H3) T is a simple type supported by atomicAdd
 */
template <typename T>
__device__ void reductionIntraBlock(T* tabBlock)
{
    // v1: fully synchronized tree reduction
    //    int midle = blockDim.x / 2;
    //    int tidLocal = threadIdx.x;
    //
    //    // int tidLocal = Indice1D::tidLocal()
    //
    //    while (midle >= 1)
    //    {
    //        if (tidLocal < midle)
    //        {
    //            tabBlock[tidLocal] += tabBlock[tidLocal + midle];
    //        }
    //
    //        __syncthreads();
    //
    //        // midle /= 2;
    //        midle >>= 1;
    //    }

    // v2: tree reduction with the last steps unrolled inside a single warp
    int midle = blockDim.x / 2;
    int tidLocal = threadIdx.x;

    // Stop halving at midle == 64: the 32 threads of one warp are enough to
    // finish reducing the first 64 cells below (this assumes blockDim.x >= 64).
    while (midle >= 64)
    {
        if (tidLocal < midle)
        {
            tabBlock[tidLocal] += tabBlock[tidLocal + midle];
        }

        __syncthreads();

        // midle /= 2;
        midle >>= 1;
    }

    // Use the 32 threads of a single warp to finish the reduction
    if (tidLocal < 32)
    {
        // No __syncthreads() is needed after each of the following lines as long
        // as the data is accessed through a pointer declared volatile, because
        // the 32 threads of a warp execute in lock-step with each other
        // (this relies on lock-step warp execution, i.e. pre-independent-thread-scheduling behaviour).
        volatile T* ptrData = tabBlock;

        ptrData[tidLocal] += ptrData[tidLocal + 32];
        ptrData[tidLocal] += ptrData[tidLocal + 16];
        ptrData[tidLocal] += ptrData[tidLocal + 8];
        ptrData[tidLocal] += ptrData[tidLocal + 4];
        ptrData[tidLocal] += ptrData[tidLocal + 2];
        ptrData[tidLocal] += ptrData[tidLocal + 1];
    }
}
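
// Worked trace (illustrative addition): with blockDim.x == 256 (an assumption
// for the example), reductionIntraBlock proceeds as follows:
//   midle = 128 : threads 0..127 fold tabBlock[128..255] into tabBlock[0..127]
//   midle =  64 : threads 0..63  fold tabBlock[64..127]  into tabBlock[0..63]
//   loop exits (midle == 32 < 64); the 32 threads of one warp then fold
//   64 -> 32 -> 16 -> 8 -> 4 -> 2 -> 1 cells, leaving the block sum in tabBlock[0].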

/**
 * Hyp: ptrDevResultat initialized to 0!
 */
template <typename T>
__device__ void reductionInterblock(T* tabBlock, T* ptrDevResultat)
{
    if (threadIdx.x == 0)
    {
        atomicAdd(ptrDevResultat, tabBlock[0]); // as many atomic accesses as there are blocks
    }
}
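
// Host-side sketch (illustrative addition, not part of the original file):
// *ptrDevResultat must be zeroed before the launch, and shared memory must be
// sized to blockDim.x elements. The names resultDev, tabDev, n, dg, db and
// kernelSumExample are assumptions for the example.
//
//     int* resultDev;
//     cudaMalloc(&resultDev, sizeof(int));
//     cudaMemset(resultDev, 0, sizeof(int));      // Hyp of reductionInterblock
//
//     dim3 dg(256, 1, 1);                         // 1D grid (H2)
//     dim3 db(1024, 1, 1);                        // blockDim.x is a power of 2 (H1)
//     size_t sharedSize = db.x * sizeof(int);     // one shared cell per thread
//     kernelSumExample<<<dg, db, sharedSize>>>(tabDev, n, resultDev);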

#endif

/*----------------------------------------------------------------------*\
|*                              End                                      *|
\*---------------------------------------------------------------------*/