WCudaMSE/BilatTools_Cuda/src/core/cudatools/header/device/reduction/reductionMinMax.h

   1 #ifndef REDUCTION_MIN_MAX_H_
   2 #define REDUCTION_MIN_MAX_H_
   3
   4 #ifndef MIN
   5 #define MIN(X,Y) ((X)<(Y)?(X):(Y))
   6 #endif
   7
   8 #ifndef MAX
   9 #define MAX(X,Y) ((X)>(Y)?(X):(Y))
  10 #endif
  11
  12
  13 /*----------------------------------------------------------------------*\
  14  |*                     Declaration                                     *|
  15  \*---------------------------------------------------------------------*/
  16
  17 /*--------------------------------------*\
  18  |*             Imported                *|
  19  \*-------------------------------------*/
  20
  21 #include "Indice1D.h"
  22
  23 /*--------------------------------------*\
  24  |*             Public                  *|
  25  \*-------------------------------------*/
  26
  27 /**
  28  * Hyp
  29  *      (H1) atomicMin existe pour T , sinon utiliser la version in reductionMinMaxLock.h
  30  *      (H2) tabBlock size is a power of 2
  31  *      (H3) tabBlock is already fill with data
  32  */
  33 template <typename T>
  34 __device__ void reduction_Min(T* tabBlock, T* ptrDevResultat);
  35
  36 /**
  37  * Hyp
  38  *      (H1) atomicMax existe pour T , sinon utiliser la version in reductionMinMaxLock.h
  39  *      (H2) tabBlock size is a power of 2
  40  *      (H3) tabBlock is already fill with data
  41  */
  42 template <typename T>
  43 __device__ void reduction_Max(T* tabBlock, T* ptrDevResultat);
  44
  45 /*--------------------------------------*\
  46  |*             Private                 *|
  47  \*-------------------------------------*/
  48
  49 /*-----------------*\
  50  |*     Intra      *|
  51  \*-----------------*/
  52
  53 template <typename T>
  54 static __device__ void reduction_IntraBlock_Min(T* tabBlock);
  55
  56 template <typename T>
  57 static __device__ void reduction_IntraBlock_Max(T* tabBlock);
  58
  59 /*-----------------*\
  60  |*     Inter      *|
  61  \*-----------------*/
  62
  63 template <typename T>
  64 static __device__ void reduction_Interblock_Min(T* tabBlock, T* ptrDevResultat);
  65
  66 template <typename T>
  67 static __device__ void reduction_Interblock_Max(T* tabBlock, T* ptrDevResultat);
  68
  69 /*----------------------------------------------------------------------*\
  70  |*                     Implementation                                  *|
  71  \*---------------------------------------------------------------------*/
  72
  73
  74
  75 /*--------------------------------------*\
  76  |*             Public                  *|
  77  \*-------------------------------------*/
  78
  79 template <typename T>
  80 __device__ void reduction_Min(T* tabBlock, T* ptrDevResultat)
  81     {
  82     reduction_IntraBlock_Min(tabBlock);
  83
  84     __syncthreads();
  85
  86     reduction_Interblock_Min(tabBlock, ptrDevResultat);
  87     }
  88
  89 template <typename T>
  90 __device__ void reduction_Max(T* tabBlock, T* ptrDevResultat)
  91     {
  92     reduction_IntraBlock_Max(tabBlock);
  93
  94     __syncthreads();
  95
  96     reduction_Interblock_Max(tabBlock, ptrDevResultat);
  97     }
  98
  99 /*--------------------------------------*\
 100  |*             Private                 *|
 101  \*-------------------------------------*/
 102
 103
 104
 105 /*-----------------*\
 106  |*     Intra      *|
 107  \*-----------------*/
 108
 109
 110 /**
 111  * Hyp :
 112  *      (H1)    length(tabBlock) est une puissance de 2 ! ie blockDim.x est une puissance de 2
 113  *      (H2)    gride et Block de dim 1
 114  *      (H3)    T est un type simple suppoter par atomicMin (sinon use reductionMinMaxLock.h)
 115  */
 116 template <typename T>
 117 __device__ void reduction_IntraBlock_Min(T* tabBlock)
 118     {
 119     // v1
 120 //    int midle = blockDim.x / 2;
 121 //    int tidLocal = threadIdx.x;
 122 //
 123 //    // int tidLocal =   Indice1D::tidLocal()
 124 //
 125 //    while (midle >= 1)
 126 //      {
 127 //
 128 //      if (tidLocal < midle)
 129 //          {
 130 //          tabBlock[tidLocal] =MIN( tabBlock[tidLocal] ,tabBlock[tidLocal + midle]);
 131 //          }
 132 //
 133 //      __syncthreads(); // threads d'un meme block
 134 //
 135 //      //midle /= 2;
 136 //      midle>>=1;
 137 //      }
 138
 139     //v2
 140         int midle = blockDim.x / 2;
 141         int tidLocal = threadIdx.x;
 142
 143         //a 64 on ne divise plus et on a besoin de 32 thread pour finir de reduire le 64 premières cases
 144         while (midle >= 64)
 145         {
 146
 147         if (tidLocal < midle)
 148             {
 149             tabBlock[tidLocal] =MIN( tabBlock[tidLocal], tabBlock[tidLocal + midle]);
 150 }
 151
 152         __syncthreads();
 153
 154         //midle /= 2;
 155         midle>>=1;
 156         }
 157
 158     // Utilisation des 32 thread d'un warp pour finir la reduction
 159         if(tidLocal<32)
 160         {
 161         // no __syncthreads() necessary after exah of the following lines as long as  we acces the data via a pointzer decalred as volatile
 162         // because teh 32 therad in each warp execute in a locked-step with each other
 163         volatile T* ptrData=tabBlock;
 164
 165         ptrData[tidLocal]=MIN( tabBlock[tidLocal], tabBlock[tidLocal + 32]);
 166 ptrData[tidLocal]=MIN( tabBlock[tidLocal], tabBlock[tidLocal + 16]);
 167 ptrData[tidLocal]=MIN( tabBlock[tidLocal], tabBlock[tidLocal + 8]);
 168 ptrData[tidLocal]=MIN( tabBlock[tidLocal], tabBlock[tidLocal + 4]);
 169 ptrData[tidLocal]=MIN( tabBlock[tidLocal], tabBlock[tidLocal + 2]);
 170 ptrData[tidLocal]=MIN( tabBlock[tidLocal], tabBlock[tidLocal + 1]);
 171 }
 172     }
 173
 174 /**
 175  * Hyp :
 176  *      (H1)    length(tabBlock) est une puissance de 2 ! ie blockDim.x est une puissance de 2
 177  *      (H2)    gride et Block de dim 1
 178  *      (H3)    T est un type simple suppoter par atomicMax  (sinon use reductionMinMaxLock.h)
 179  */
 180 template <typename T>
 181 __device__ void reduction_IntraBlock_Max(T* tabBlock)
 182     {
 183     //v1
 184 //    int midle = blockDim.x / 2;
 185 //    int tidLocal = threadIdx.x;
 186 //
 187 //    // int tidLocal =   Indice1D::tidLocal()
 188 //
 189 //    while (midle >= 1)
 190 //      {
 191 //
 192 //      if (tidLocal < midle)
 193 //          {
 194 //          tabBlock[tidLocal] =MAX( tabBlock[tidLocal] ,tabBlock[tidLocal + midle]);
 195 //          }
 196 //
 197 //      __syncthreads(); // threads d'un meme block
 198 //
 199 //      midle /= 2;
 200 //      }
 201
 202     //v2
 203            int midle = blockDim.x / 2;
 204            int tidLocal = threadIdx.x;
 205
 206            //a 64 on ne divise plus et on a besoin de 32 thread pour finir de reduire le 64 premières cases
 207            while (midle >= 64)
 208         {
 209
 210         if (tidLocal < midle)
 211             {
 212             tabBlock[tidLocal] =MAX( tabBlock[tidLocal], tabBlock[tidLocal + midle]);
 213 }
 214
 215         __syncthreads();
 216
 217         //midle /= 2;
 218         midle>>=1;
 219         }
 220
 221        // Utilisation des 32 thread d'un warp pour finir la reduction
 222            if(tidLocal<32)
 223         {
 224         // no __syncthreads() necessary after exah of the following lines as long as  we acces the data via a pointzer decalred as volatile
 225         // because teh 32 therad in each warp execute in a locked-step with each other
 226         volatile float* ptrData=tabBlock;
 227
 228         ptrData[tidLocal]=MAX( tabBlock[tidLocal], tabBlock[tidLocal + 32]);
 229 ptrData[tidLocal]=MAX( tabBlock[tidLocal], tabBlock[tidLocal + 16]);
 230 ptrData[tidLocal]=MAX( tabBlock[tidLocal], tabBlock[tidLocal + 8]);
 231 ptrData[tidLocal]=MAX( tabBlock[tidLocal], tabBlock[tidLocal + 4]);
 232 ptrData[tidLocal]=MAX( tabBlock[tidLocal], tabBlock[tidLocal + 2]);
 233 ptrData[tidLocal]=MAX( tabBlock[tidLocal], tabBlock[tidLocal + 1]);
 234 }
 235 }
 236
 237 /*-----------------*\
 238  |*     Inter      *|
 239  \*-----------------*/
 240
 241 /**
 242  * Hyp :
 243  *      (H1)    length(tabBlock) est une puissance de 2 ! ie blockDim.x est une puissance de 2
 244  *      (H2)    gride et Block de dim 1
 245  *      (H3)    T est un type simple suppoter par atomicMin
 246  *      (H4)    ptrDevResultat a été initialisé avec une valeur intelligente
 247  */
 248 template <typename T>
 249 __device__ void reduction_Interblock_Min(T* tabBlock, T* ptrDevResultat)
 250 {
 251 if (threadIdx.x == 0)
 252 {
 253 atomicMin(ptrDevResultat, tabBlock[0]); // autant d'acces que de block
 254 }
 255 }
 256
 257 /**
 258  * Hyp :
 259  *      (H1)    length(tabBlock) est une puissance de 2 ! ie blockDim.x est une puissance de 2
 260  *      (H2)    gride et Block de dim 1
 261  *      (H3)    T est un type simple suppoter par atomicMax
 262  *      (H4)    ptrDevResultat a été initialisé avec une valeur intelligente
 263  */
 264 template <typename T>
 265 __device__ void reduction_Interblock_Max(T* tabBlock, T* ptrDevResultat)
 266 {
 267 if (threadIdx.x == 0)
 268 {
 269 atomicMax(ptrDevResultat, tabBlock[0]); // autant d'acces que de block
 270 }
 271 }
 272
 273 #endif
 274
 275 /*----------------------------------------------------------------------*\
 276  |*                     End                                             *|
 277  \*---------------------------------------------------------------------*/