src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,  by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *  \brief
  38  *  Utility constant and function declaration for the CUDA non-bonded kernels.
  39  *  This header should be included once at the top level, just before the
  40  *  kernels are included (has to be preceded by nbnxn_cuda_types.h).
  41  *
  42  *  \author Szilárd Páll <pall.szilard@gmail.com>
  43  *  \ingroup module_mdlib
  44  */
  45 #include "config.h"
  46
  47 /* Note that floating-point constants in CUDA code should be suffixed
  48  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  49  * code that is in double precision.
  50  */
  51
  52 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  53 #include "gromacs/gpu_utils/vectype_ops.cuh"
  54
  55 #include "nbnxn_cuda_types.h"
  56
  57 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  58 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  59
  60 /* Use texture objects if supported by the target hardware. */
  61 #if GMX_PTX_ARCH >= 300
  62 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
  63 #define USE_TEXOBJ
  64 #endif
  65
  66 #define WARP_SIZE_POW2_EXPONENT     (5)
  67 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  68 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  69 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  70
  71 #define ONE_SIXTH_F     0.16666667f
  72 #define ONE_TWELVETH_F  0.08333333f
  73
  74 /* With multiple compilation units this ensures that texture refs are available
  75    in the the kernels' compilation units. */
  76 #if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
  77 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
  78 extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
  79
  80 /*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
  81 extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  82
  83 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  84 extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  85 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  86
  87 /*! Apply force switch,  force + energy version. */
  88 static inline __device__
  89 void calculate_force_switch_F(const  cu_nbparam_t nbparam,
  90                               float               c6,
  91                               float               c12,
  92                               float               inv_r,
  93                               float               r2,
  94                               float              *F_invr)
  95 {
  96     float r, r_switch;
  97
  98     /* force switch constants */
  99     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 100     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 101     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 102     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 103
 104     r         = r2 * inv_r;
 105     r_switch  = r - nbparam.rvdw_switch;
 106     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 107
 108     *F_invr  +=
 109         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 110         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 111 }
 112
 113 /*! Apply force switch, force-only version. */
 114 static inline __device__
 115 void calculate_force_switch_F_E(const  cu_nbparam_t nbparam,
 116                                 float               c6,
 117                                 float               c12,
 118                                 float               inv_r,
 119                                 float               r2,
 120                                 float              *F_invr,
 121                                 float              *E_lj)
 122 {
 123     float r, r_switch;
 124
 125     /* force switch constants */
 126     float disp_shift_V2 = nbparam.dispersion_shift.c2;
 127     float disp_shift_V3 = nbparam.dispersion_shift.c3;
 128     float repu_shift_V2 = nbparam.repulsion_shift.c2;
 129     float repu_shift_V3 = nbparam.repulsion_shift.c3;
 130
 131     float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
 132     float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
 133     float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
 134     float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
 135
 136     r         = r2 * inv_r;
 137     r_switch  = r - nbparam.rvdw_switch;
 138     r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
 139
 140     *F_invr  +=
 141         -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
 142         c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
 143     *E_lj    +=
 144         c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
 145         c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
 146 }
 147
 148 /*! Apply potential switch, force-only version. */
 149 static inline __device__
 150 void calculate_potential_switch_F(const  cu_nbparam_t nbparam,
 151                                   float               c6,
 152                                   float               c12,
 153                                   float               inv_r,
 154                                   float               r2,
 155                                   float              *F_invr,
 156                                   float              *E_lj)
 157 {
 158     float r, r_switch;
 159     float sw, dsw;
 160
 161     /* potential switch constants */
 162     float switch_V3 = nbparam.vdw_switch.c3;
 163     float switch_V4 = nbparam.vdw_switch.c4;
 164     float switch_V5 = nbparam.vdw_switch.c5;
 165     float switch_F2 = 3*nbparam.vdw_switch.c3;
 166     float switch_F3 = 4*nbparam.vdw_switch.c4;
 167     float switch_F4 = 5*nbparam.vdw_switch.c5;
 168
 169     r        = r2 * inv_r;
 170     r_switch = r - nbparam.rvdw_switch;
 171
 172     /* Unlike in the F+E kernel, conditional is faster here */
 173     if (r_switch > 0.0f)
 174     {
 175         sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 176         dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 177
 178         *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 179     }
 180 }
 181
 182 /*! Apply potential switch, force + energy version. */
 183 static inline __device__
 184 void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 185                                     float               c6,
 186                                     float               c12,
 187                                     float               inv_r,
 188                                     float               r2,
 189                                     float              *F_invr,
 190                                     float              *E_lj)
 191 {
 192     float r, r_switch;
 193     float sw, dsw;
 194
 195     /* potential switch constants */
 196     float switch_V3 = nbparam.vdw_switch.c3;
 197     float switch_V4 = nbparam.vdw_switch.c4;
 198     float switch_V5 = nbparam.vdw_switch.c5;
 199     float switch_F2 = 3*nbparam.vdw_switch.c3;
 200     float switch_F3 = 4*nbparam.vdw_switch.c4;
 201     float switch_F4 = 5*nbparam.vdw_switch.c5;
 202
 203     r        = r2 * inv_r;
 204     r_switch = r - nbparam.rvdw_switch;
 205     r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
 206
 207     /* Unlike in the F-only kernel, masking is faster here */
 208     sw       = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
 209     dsw      = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
 210
 211     *F_invr  = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
 212     *E_lj   *= sw;
 213 }
 214
 215 /*! Calculate LJ-PME grid force contribution with
 216  *  geometric combination rule.
 217  */
 218 static inline __device__
 219 void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
 220                                     int                typei,
 221                                     int                typej,
 222                                     float              r2,
 223                                     float              inv_r2,
 224                                     float              lje_coeff2,
 225                                     float              lje_coeff6_6,
 226                                     float             *F_invr)
 227 {
 228     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 229
 230 #ifdef USE_TEXOBJ
 231     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 232 #else
 233     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 234 #endif /* USE_TEXOBJ */
 235
 236     /* Recalculate inv_r6 without exclusion mask */
 237     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 238     cr2       = lje_coeff2*r2;
 239     expmcr2   = expf(-cr2);
 240     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 241
 242     /* Subtract the grid force from the total LJ force */
 243     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 244 }
 245
 246 /*! Calculate LJ-PME grid force + energy contribution with
 247  *  geometric combination rule.
 248  */
 249 static inline __device__
 250 void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
 251                                       int                typei,
 252                                       int                typej,
 253                                       float              r2,
 254                                       float              inv_r2,
 255                                       float              lje_coeff2,
 256                                       float              lje_coeff6_6,
 257                                       float              int_bit,
 258                                       float             *F_invr,
 259                                       float             *E_lj)
 260 {
 261     float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
 262
 263 #ifdef USE_TEXOBJ
 264     c6grid    = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 265 #else
 266     c6grid    = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 267 #endif /* USE_TEXOBJ */
 268
 269     /* Recalculate inv_r6 without exclusion mask */
 270     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 271     cr2       = lje_coeff2*r2;
 272     expmcr2   = expf(-cr2);
 273     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 274
 275     /* Subtract the grid force from the total LJ force */
 276     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 277
 278     /* Shift should be applied only to real LJ pairs */
 279     sh_mask   = nbparam.sh_lj_ewald*int_bit;
 280     *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 281 }
 282
 283 /*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
 284  *  Lorentz-Berthelot combination rule.
 285  *  We use a single F+E kernel with conditional because the performance impact
 286  *  of this is pretty small and LB on the CPU is anyway very slow.
 287  */
 288 static inline __device__
 289 void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 290                                     int                typei,
 291                                     int                typej,
 292                                     float              r2,
 293                                     float              inv_r2,
 294                                     float              lje_coeff2,
 295                                     float              lje_coeff6_6,
 296                                     float              int_bit,
 297                                     float             *F_invr,
 298                                     float             *E_lj)
 299 {
 300     float c6grid, inv_r6_nm, cr2, expmcr2, poly;
 301     float sigma, sigma2, epsilon;
 302
 303     /* sigma and epsilon are scaled to give 6*C6 */
 304 #ifdef USE_TEXOBJ
 305     sigma   = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei    ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej    );
 306     epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
 307 #else
 308     sigma   = tex1Dfetch(nbfp_comb_texref, 2*typei    ) + tex1Dfetch(nbfp_comb_texref, 2*typej    );
 309     epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
 310 #endif /* USE_TEXOBJ */
 311     sigma2  = sigma*sigma;
 312     c6grid  = epsilon*sigma2*sigma2*sigma2;
 313
 314     /* Recalculate inv_r6 without exclusion mask */
 315     inv_r6_nm = inv_r2*inv_r2*inv_r2;
 316     cr2       = lje_coeff2*r2;
 317     expmcr2   = expf(-cr2);
 318     poly      = 1.0f + cr2 + 0.5f*cr2*cr2;
 319
 320     /* Subtract the grid force from the total LJ force */
 321     *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
 322
 323     if (E_lj != NULL)
 324     {
 325         float sh_mask;
 326
 327         /* Shift should be applied only to real LJ pairs */
 328         sh_mask   = nbparam.sh_lj_ewald*int_bit;
 329         *E_lj    += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 330     }
 331 }
 332
 333 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
 334  *  Original idea: from the OpenMM project
 335  */
 336 static inline __device__
 337 float interpolate_coulomb_force_r(float r, float scale)
 338 {
 339     float   normalized = scale * r;
 340     int     index      = (int) normalized;
 341     float   fract2     = normalized - index;
 342     float   fract1     = 1.0f - fract2;
 343
 344     return fract1 * tex1Dfetch(coulomb_tab_texref, index)
 345            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
 346 }
 347
 348 static inline __device__
 349 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
 350                                   float r, float scale)
 351 {
 352     float   normalized = scale * r;
 353     int     index      = (int) normalized;
 354     float   fract2     = normalized - index;
 355     float   fract1     = 1.0f - fract2;
 356
 357     return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
 358            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
 359 }
 360
 361 /*! Calculate analytical Ewald correction term. */
 362 static inline __device__
 363 float pmecorrF(float z2)
 364 {
 365     const float FN6 = -1.7357322914161492954e-8f;
 366     const float FN5 = 1.4703624142580877519e-6f;
 367     const float FN4 = -0.000053401640219807709149f;
 368     const float FN3 = 0.0010054721316683106153f;
 369     const float FN2 = -0.019278317264888380590f;
 370     const float FN1 = 0.069670166153766424023f;
 371     const float FN0 = -0.75225204789749321333f;
 372
 373     const float FD4 = 0.0011193462567257629232f;
 374     const float FD3 = 0.014866955030185295499f;
 375     const float FD2 = 0.11583842382862377919f;
 376     const float FD1 = 0.50736591960530292870f;
 377     const float FD0 = 1.0f;
 378
 379     float       z4;
 380     float       polyFN0, polyFN1, polyFD0, polyFD1;
 381
 382     z4          = z2*z2;
 383
 384     polyFD0     = FD4*z4 + FD2;
 385     polyFD1     = FD3*z4 + FD1;
 386     polyFD0     = polyFD0*z4 + FD0;
 387     polyFD0     = polyFD1*z2 + polyFD0;
 388
 389     polyFD0     = 1.0f/polyFD0;
 390
 391     polyFN0     = FN6*z4 + FN4;
 392     polyFN1     = FN5*z4 + FN3;
 393     polyFN0     = polyFN0*z4 + FN2;
 394     polyFN1     = polyFN1*z4 + FN1;
 395     polyFN0     = polyFN0*z4 + FN0;
 396     polyFN0     = polyFN1*z2 + polyFN0;
 397
 398     return polyFN0*polyFD0;
 399 }
 400
 401 /*! Final j-force reduction; this generic implementation works with
 402  *  arbitrary array sizes.
 403  */
 404 static inline __device__
 405 void reduce_force_j_generic(float *f_buf, float3 *fout,
 406                             int tidxi, int tidxj, int aidx)
 407 {
 408     if (tidxi < 3)
 409     {
 410         float f = 0.0f;
 411         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 412         {
 413             f += f_buf[FBUF_STRIDE * tidxi + j];
 414         }
 415
 416         atomicAdd((&fout[aidx].x)+tidxi, f);
 417     }
 418 }
 419
 420 /*! Final j-force reduction; this implementation only with power of two
 421  *  array sizes and with sm >= 3.0
 422  */
 423 #if GMX_PTX_ARCH >= 300
 424 static inline __device__
 425 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 426                               int tidxi, int aidx)
 427 {
 428     f.x += __shfl_down(f.x, 1);
 429     f.y += __shfl_up  (f.y, 1);
 430     f.z += __shfl_down(f.z, 1);
 431
 432     if (tidxi & 1)
 433     {
 434         f.x = f.y;
 435     }
 436
 437     f.x += __shfl_down(f.x, 2);
 438     f.z += __shfl_up  (f.z, 2);
 439
 440     if (tidxi & 2)
 441     {
 442         f.x = f.z;
 443     }
 444
 445     f.x += __shfl_down(f.x, 4);
 446
 447     if (tidxi < 3)
 448     {
 449         atomicAdd((&fout[aidx].x) + tidxi, f.x);
 450     }
 451 }
 452 #endif
 453
 454 /*! Final i-force reduction; this generic implementation works with
 455  *  arbitrary array sizes.
 456  * TODO: add the tidxi < 3 trick
 457  */
 458 static inline __device__
 459 void reduce_force_i_generic(float *f_buf, float3 *fout,
 460                             float *fshift_buf, bool bCalcFshift,
 461                             int tidxi, int tidxj, int aidx)
 462 {
 463     if (tidxj < 3)
 464     {
 465         float f = 0.0f;
 466         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 467         {
 468             f += f_buf[tidxj * FBUF_STRIDE + j];
 469         }
 470
 471         atomicAdd(&fout[aidx].x + tidxj, f);
 472
 473         if (bCalcFshift)
 474         {
 475             *fshift_buf += f;
 476         }
 477     }
 478 }
 479
 480 /*! Final i-force reduction; this implementation works only with power of two
 481  *  array sizes.
 482  */
 483 static inline __device__
 484 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 485                          float *fshift_buf, bool bCalcFshift,
 486                          int tidxi, int tidxj, int aidx)
 487 {
 488     int     i, j;
 489     float   f;
 490
 491     /* Reduce the initial CL_SIZE values for each i atom to half
 492      * every step by using CL_SIZE * i threads.
 493      * Can't just use i as loop variable because than nvcc refuses to unroll.
 494      */
 495     i = CL_SIZE/2;
 496     # pragma unroll 5
 497     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 498     {
 499         if (tidxj < i)
 500         {
 501
 502             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 503             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 504             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 505         }
 506         i >>= 1;
 507     }
 508
 509     /* i == 1, last reduction step, writing to global mem */
 510     if (tidxj < 3)
 511     {
 512         /* tidxj*FBUF_STRIDE selects x, y or z */
 513         f = f_buf[tidxj * FBUF_STRIDE               + tidxi] +
 514             f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
 515
 516         atomicAdd(&(fout[aidx].x) + tidxj, f);
 517
 518         if (bCalcFshift)
 519         {
 520             *fshift_buf += f;
 521         }
 522     }
 523
 524 }
 525
 526 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 527  *  on whether the size of the array to be reduced is power of two or not.
 528  */
 529 static inline __device__
 530 void reduce_force_i(float *f_buf, float3 *f,
 531                     float *fshift_buf, bool bCalcFshift,
 532                     int tidxi, int tidxj, int ai)
 533 {
 534     if ((CL_SIZE & (CL_SIZE - 1)))
 535     {
 536         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 537     }
 538     else
 539     {
 540         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 541     }
 542 }
 543
 544 /*! Final i-force reduction; this implementation works only with power of two
 545  *  array sizes and with sm >= 3.0
 546  */
 547 #if GMX_PTX_ARCH >= 300
 548 static inline __device__
 549 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 550                               float *fshift_buf, bool bCalcFshift,
 551                               int tidxj, int aidx)
 552 {
 553     fin.x += __shfl_down(fin.x, CL_SIZE);
 554     fin.y += __shfl_up  (fin.y, CL_SIZE);
 555     fin.z += __shfl_down(fin.z, CL_SIZE);
 556
 557     if (tidxj & 1)
 558     {
 559         fin.x = fin.y;
 560     }
 561
 562     fin.x += __shfl_down(fin.x, 2*CL_SIZE);
 563     fin.z += __shfl_up  (fin.z, 2*CL_SIZE);
 564
 565     if (tidxj & 2)
 566     {
 567         fin.x = fin.z;
 568     }
 569
 570     /* Threads 0,1,2 and 4,5,6 increment x,y,z for their warp */
 571     if ((tidxj & 3) < 3)
 572     {
 573         atomicAdd(&fout[aidx].x + (tidxj & ~4), fin.x);
 574
 575         if (bCalcFshift)
 576         {
 577             *fshift_buf += fin.x;
 578         }
 579     }
 580 }
 581 #endif
 582
 583 /*! Energy reduction; this implementation works only with power of two
 584  *  array sizes.
 585  */
 586 static inline __device__
 587 void reduce_energy_pow2(volatile float *buf,
 588                         float *e_lj, float *e_el,
 589                         unsigned int tidx)
 590 {
 591     int     i, j;
 592     float   e1, e2;
 593
 594     i = WARP_SIZE/2;
 595
 596     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 597 # pragma unroll 10
 598     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 599     {
 600         if (tidx < i)
 601         {
 602             buf[              tidx] += buf[              tidx + i];
 603             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 604         }
 605         i >>= 1;
 606     }
 607
 608     // TODO do this on two threads - requires e_lj and e_el to be stored on adjascent
 609     // memory locations to make sense
 610     /* last reduction step, writing to global mem */
 611     if (tidx == 0)
 612     {
 613         e1 = buf[              tidx] + buf[              tidx + i];
 614         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 615
 616         atomicAdd(e_lj, e1);
 617         atomicAdd(e_el, e2);
 618     }
 619 }
 620
 621 /*! Energy reduction; this implementation works only with power of two
 622  *  array sizes and with sm >= 3.0
 623  */
 624 #if GMX_PTX_ARCH >= 300
 625 static inline __device__
 626 void reduce_energy_warp_shfl(float E_lj, float E_el,
 627                              float *e_lj, float *e_el,
 628                              int tidx)
 629 {
 630     int i, sh;
 631
 632     sh = 1;
 633 #pragma unroll 5
 634     for (i = 0; i < 5; i++)
 635     {
 636         E_lj += __shfl_down(E_lj, sh);
 637         E_el += __shfl_down(E_el, sh);
 638         sh   += sh;
 639     }
 640
 641     /* The first thread in the warp writes the reduced energies */
 642     if (tidx == 0 || tidx == WARP_SIZE)
 643     {
 644         atomicAdd(e_lj, E_lj);
 645         atomicAdd(e_el, E_el);
 646     }
 647 }
 648 #endif /* GMX_PTX_ARCH */
 649
 650 #undef USE_TEXOBJ
 651
 652 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */