src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  36
  37 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  38 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  39
  40 #define WARP_SIZE_POW2_EXPONENT     (5)
  41 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  42 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  43 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  44
  45 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  46  *  Original idea: OpenMM
  47  */
  48 static inline __device__
  49 float interpolate_coulomb_force_r(float r, float scale)
  50 {
  51     float   normalized = scale * r;
  52     int     index = (int) normalized;
  53     float   fract2 = normalized - index;
  54     float   fract1 = 1.0f - fract2;
  55
  56     return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
  57             + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
  58 }
  59
  60 /*! Final j-force reduction; this generic implementation works with
  61  *  arbitrary array sizes.
  62  */
  63 static inline __device__
  64 void reduce_force_j_generic(float *f_buf, float3 *fout,
  65                             int tidxi, int tidxj, int aidx)
  66 {
  67     if (tidxi == 0)
  68     {
  69         float3 f = make_float3(0.0f);
  70         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
  71         {
  72             f.x += f_buf[                  j];
  73             f.y += f_buf[    FBUF_STRIDE + j];
  74             f.z += f_buf[2 * FBUF_STRIDE + j];
  75         }
  76
  77         atomicAdd(&fout[aidx], f);
  78     }
  79 }
  80
  81 /*! Final j-force reduction; this implementation only with power of two
  82  *  array sizes and with sm >= 3.0
  83  */
  84 #if __CUDA_ARCH__ >= 300
  85 static inline __device__
  86 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
  87                               int tidxi, int aidx)
  88 {
  89     int i;
  90
  91 #pragma unroll 3
  92     for (i = 0; i < 3; i++)
  93     {
  94         f.x += __shfl_down(f.x, 1<<i);
  95         f.y += __shfl_down(f.y, 1<<i);
  96         f.z += __shfl_down(f.z, 1<<i);
  97     }
  98
  99     /* Write the reduced j-force on one thread for each j */
 100     if (tidxi == 0)
 101     {
 102         atomicAdd(&fout[aidx], f);
 103     }
 104 }
 105 #endif
 106
 107 /*! Final i-force reduction; this generic implementation works with
 108  *  arbitrary array sizes.
 109  */
 110 static inline __device__
 111 void reduce_force_i_generic(float *f_buf, float3 *fout,
 112                             float3 *fshift_buf, bool bCalcFshift,
 113                             int tidxi, int tidxj, int aidx)
 114 {
 115     if (tidxj == 0)
 116     {
 117         float3 f = make_float3(0.0f);
 118         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 119         {
 120             f.x += f_buf[                  j];
 121             f.y += f_buf[    FBUF_STRIDE + j];
 122             f.z += f_buf[2 * FBUF_STRIDE + j];
 123         }
 124
 125         atomicAdd(&fout[aidx], f);
 126
 127         if (bCalcFshift)
 128         {
 129             *fshift_buf += f;
 130         }
 131     }
 132 }
 133
 134 /*! Final i-force reduction; this implementation works only with power of two
 135  *  array sizes.
 136  */
 137 static inline __device__
 138 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 139                          float3 *fshift_buf, bool bCalcFshift,
 140                          int tidxi, int tidxj, int aidx)
 141 {
 142     int     i, j;
 143     float3  f = make_float3(0.0f);
 144
 145     /* Reduce the initial CL_SIZE values for each i atom to half
 146      * every step by using CL_SIZE * i threads.
 147      * Can't just use i as loop variable because than nvcc refuses to unroll.
 148      */
 149     i = CL_SIZE/2;
 150     # pragma unroll 5
 151     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 152     {
 153         if (tidxj < i)
 154         {
 155
 156             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 157             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 158             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 159         }
 160         i >>= 1;
 161     }
 162
 163     /* i == 1, last reduction step, writing to global mem */
 164     if (tidxj == 0)
 165     {
 166         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 167         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 168         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 169
 170         atomicAdd(&fout[aidx], f);
 171
 172         if (bCalcFshift)
 173         {
 174             *fshift_buf += f;
 175         }
 176     }
 177 }
 178
 179 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 180  *  on whether the size of the array to be reduced is power of two or not.
 181  */
 182 static inline __device__
 183 void reduce_force_i(float *f_buf, float3 *f,
 184                     float3 *fshift_buf, bool bCalcFshift,
 185                     int tidxi, int tidxj, int ai)
 186 {
 187     if ((CL_SIZE & (CL_SIZE - 1)))
 188     {
 189         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 190     }
 191     else
 192     {
 193         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 194     }
 195 }
 196
 197 /*! Final i-force reduction; this implementation works only with power of two
 198  *  array sizes and with sm >= 3.0
 199  */
 200 #if __CUDA_ARCH__ >= 300
 201 static inline __device__
 202 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 203                               float3 *fshift_buf, bool bCalcFshift,
 204                               int tidxj, int aidx)
 205 {
 206     int j;
 207
 208 #pragma unroll 2
 209     for (j = 0; j < 2; j++)
 210     {
 211         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 212         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 213         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 214     }
 215
 216     /* The first thread in the warp writes the reduced force */
 217     if (tidxj == 0 || tidxj == 4)
 218     {
 219         atomicAdd(&fout[aidx], fin);
 220
 221         if (bCalcFshift)
 222         {
 223             fshift_buf->x += fin.x;
 224             fshift_buf->y += fin.y;
 225             fshift_buf->z += fin.z;
 226         }
 227     }
 228 }
 229 #endif
 230
 231 /*! Energy reduction; this implementation works only with power of two
 232  *  array sizes.
 233  */
 234 static inline __device__
 235 void reduce_energy_pow2(volatile float *buf,
 236                         float *e_lj, float *e_el,
 237                         unsigned int tidx)
 238 {
 239     int     i, j;
 240     float   e1, e2;
 241
 242     i = WARP_SIZE/2;
 243
 244     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 245 # pragma unroll 10
 246     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 247     {
 248         if (tidx < i)
 249         {
 250             buf[              tidx] += buf[              tidx + i];
 251             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 252         }
 253         i >>= 1;
 254     }
 255
 256     /* last reduction step, writing to global mem */
 257     if (tidx == 0)
 258     {
 259         e1 = buf[              tidx] + buf[              tidx + i];
 260         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 261
 262         atomicAdd(e_lj, e1);
 263         atomicAdd(e_el, e2);
 264     }
 265 }
 266
 267 /*! Energy reduction; this implementation works only with power of two
 268  *  array sizes and with sm >= 3.0
 269  */
 270 #if __CUDA_ARCH__ >= 300
 271 static inline __device__
 272 void reduce_energy_warp_shfl(float E_lj, float E_el,
 273                              float *e_lj, float *e_el,
 274                              int tidx)
 275 {
 276     int i, sh;
 277
 278     sh = 1;
 279 #pragma unroll 5
 280     for (i = 0; i < 5; i++)
 281     {
 282         E_lj += __shfl_down(E_lj,sh);
 283         E_el += __shfl_down(E_el,sh);
 284         sh += sh;
 285     }
 286
 287     /* The first thread in the warp writes the reduced energies */
 288     if (tidx == 0 || tidx == WARP_SIZE)
 289     {
 290         atomicAdd(e_lj,E_lj);
 291         atomicAdd(e_el,E_el);
 292     }
 293 }
 294 #endif /* __CUDA_ARCH__ */
 295
 296 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */