src/gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /* This is the innermost loop contents for the 4 x N atom simd kernel.
  38  * This flavor of the kernel duplicates the data for N j-particles in
  39  * 2xN wide simd registers to do operate on 2 i-particles at once.
  40  * This leads to 4/2=2 sets of most instructions. Therefore we call
  41  * this kernel 2x(N+N) = 2xnn
  42  *
  43  * This 2xnn kernel is basically the 4xn equivalent with half the registers
  44  * and instructions removed.
  45  *
  46  * An alternative would be to load to different cluster of N j-particles
  47  * into simd registers, giving a 4x(N+N) kernel. This doubles the amount
  48  * of instructions, which could lead to better scheduling. But we actually
  49  * observed worse scheduling for the AVX-256 4x8 normal analytical PME
  50  * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
  51  * It could be worth trying this option, but it takes some more effort.
  52  * This 2xnn kernel is basically the 4xn equivalent with
  53  */
  54
  55
  56 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
  57  * forces on excluded atom pairs here in the non-bonded loops.
  58  * But when energies and/or virial is required we calculate them
  59  * separately to as then it is easier to separate the energy and virial
  60  * contributions.
  61  */
  62 #if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
  63 #    define EXCL_FORCES
  64 #endif
  65
  66 {
  67     int cj, aj, ajx, ajy, ajz;
  68
  69 #ifdef ENERGY_GROUPS
  70     /* Energy group indices for two atoms packed into one int */
  71     int egp_jj[UNROLLJ / 2];
  72 #endif
  73
  74 #ifdef CHECK_EXCLS
  75     /* Interaction (non-exclusion) mask of all 1's or 0's */
  76     SimdBool interact_S0;
  77     SimdBool interact_S2;
  78 #endif
  79
  80     SimdReal jx_S, jy_S, jz_S;
  81     SimdReal dx_S0, dy_S0, dz_S0;
  82     SimdReal dx_S2, dy_S2, dz_S2;
  83     SimdReal tx_S0, ty_S0, tz_S0;
  84     SimdReal tx_S2, ty_S2, tz_S2;
  85     SimdReal rsq_S0, rinv_S0, rinvsq_S0;
  86     SimdReal rsq_S2, rinv_S2, rinvsq_S2;
  87     /* wco: within cut-off, mask of all 1's or 0's */
  88     SimdBool wco_S0;
  89     SimdBool wco_S2;
  90 #ifdef VDW_CUTOFF_CHECK
  91     SimdBool wco_vdw_S0;
  92 #    ifndef HALF_LJ
  93     SimdBool wco_vdw_S2;
  94 #    endif
  95 #endif
  96
  97 #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
  98     SimdReal                                                        r_S0;
  99 #    if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
 100     SimdReal                                                        r_S2;
 101 #    endif
 102 #endif
 103
 104 #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 105     SimdReal rsw_S0, rsw2_S0;
 106 #    ifndef HALF_LJ
 107     SimdReal rsw_S2, rsw2_S2;
 108 #    endif
 109 #endif
 110
 111 #ifdef CALC_COULOMB
 112 #    ifdef CHECK_EXCLS
 113     /* 1/r masked with the interaction mask */
 114     SimdReal rinv_ex_S0;
 115     SimdReal rinv_ex_S2;
 116 #    endif
 117     SimdReal jq_S;
 118     SimdReal qq_S0;
 119     SimdReal qq_S2;
 120 #    ifdef CALC_COUL_TAB
 121     /* The force (PME mesh force) we need to subtract from 1/r^2 */
 122     SimdReal fsub_S0;
 123     SimdReal fsub_S2;
 124 #    endif
 125 #    ifdef CALC_COUL_EWALD
 126     SimdReal brsq_S0, brsq_S2;
 127     SimdReal ewcorr_S0, ewcorr_S2;
 128 #    endif
 129
 130     /* frcoul = (1/r - fsub)*r */
 131     SimdReal frcoul_S0;
 132     SimdReal frcoul_S2;
 133 #    ifdef CALC_COUL_TAB
 134     /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
 135     SimdReal rs_S0, rf_S0, frac_S0;
 136     SimdReal rs_S2, rf_S2, frac_S2;
 137     /* Table index: rs truncated to an int */
 138     SimdInt32 ti_S0, ti_S2;
 139     /* Linear force table values */
 140     SimdReal ctab0_S0, ctab1_S0;
 141     SimdReal ctab0_S2, ctab1_S2;
 142 #        ifdef CALC_ENERGIES
 143     /* Quadratic energy table value */
 144     SimdReal ctabv_S0, dum_S0;
 145     SimdReal ctabv_S2, dum_S2;
 146 #        endif
 147 #    endif
 148 #    if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 149     /* The potential (PME mesh) we need to subtract from 1/r */
 150     SimdReal vc_sub_S0;
 151     SimdReal vc_sub_S2;
 152 #    endif
 153 #    ifdef CALC_ENERGIES
 154     /* Electrostatic potential */
 155     SimdReal vcoul_S0;
 156     SimdReal vcoul_S2;
 157 #    endif
 158 #endif
 159     /* The force times 1/r */
 160     SimdReal fscal_S0;
 161     SimdReal fscal_S2;
 162
 163 #ifdef CALC_LJ
 164 #    ifdef LJ_COMB_LB
 165     /* LJ sigma_j/2 and sqrt(epsilon_j) */
 166     SimdReal hsig_j_S, seps_j_S;
 167     /* LJ sigma_ij and epsilon_ij */
 168     SimdReal sig_S0, eps_S0;
 169 #        ifndef HALF_LJ
 170     SimdReal sig_S2, eps_S2;
 171 #        endif
 172 #        ifdef CALC_ENERGIES
 173     SimdReal sig2_S0, sig6_S0;
 174 #            ifndef HALF_LJ
 175     SimdReal sig2_S2, sig6_S2;
 176 #            endif
 177 #        endif /* LJ_COMB_LB */
 178 #    endif     /* CALC_LJ */
 179
 180 #    ifdef LJ_COMB_GEOM
 181     SimdReal c6s_j_S, c12s_j_S;
 182 #    endif
 183
 184 #    if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
 185     /* Index for loading LJ parameters, complicated when interleaving */
 186     int aj2;
 187 #    endif
 188
 189     /* Intermediate variables for LJ calculation */
 190 #    ifndef LJ_COMB_LB
 191     SimdReal rinvsix_S0;
 192 #        ifndef HALF_LJ
 193     SimdReal rinvsix_S2;
 194 #        endif
 195 #    endif
 196 #    ifdef LJ_COMB_LB
 197     SimdReal sir_S0, sir2_S0, sir6_S0;
 198 #        ifndef HALF_LJ
 199     SimdReal sir_S2, sir2_S2, sir6_S2;
 200 #        endif
 201 #    endif
 202
 203     SimdReal FrLJ6_S0, FrLJ12_S0, frLJ_S0;
 204 #    ifndef HALF_LJ
 205     SimdReal FrLJ6_S2, FrLJ12_S2, frLJ_S2;
 206 #    endif
 207 #endif /* CALC_LJ */
 208
 209     /* j-cluster index */
 210     cj = l_cj[cjind].cj;
 211
 212     /* Atom indices (of the first atom in the cluster) */
 213     aj = cj * UNROLLJ;
 214 #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
 215     aj2 = aj * 2;
 216 #endif
 217     ajx = aj * DIM;
 218     ajy = ajx + STRIDE;
 219     ajz = ajy + STRIDE;
 220
 221 #ifdef CHECK_EXCLS
 222     gmx_load_simd_2xnn_interactions(static_cast<int>(l_cj[cjind].excl), filter_S0, filter_S2,
 223                                     &interact_S0, &interact_S2);
 224 #endif /* CHECK_EXCLS */
 225
 226     /* load j atom coordinates */
 227     jx_S = loadDuplicateHsimd(x + ajx);
 228     jy_S = loadDuplicateHsimd(x + ajy);
 229     jz_S = loadDuplicateHsimd(x + ajz);
 230
 231     /* Calculate distance */
 232     dx_S0 = ix_S0 - jx_S;
 233     dy_S0 = iy_S0 - jy_S;
 234     dz_S0 = iz_S0 - jz_S;
 235     dx_S2 = ix_S2 - jx_S;
 236     dy_S2 = iy_S2 - jy_S;
 237     dz_S2 = iz_S2 - jz_S;
 238
 239     /* rsq = dx*dx+dy*dy+dz*dz */
 240     rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
 241     rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
 242
 243     /* Do the cut-off check */
 244     wco_S0 = (rsq_S0 < rc2_S);
 245     wco_S2 = (rsq_S2 < rc2_S);
 246
 247 #ifdef CHECK_EXCLS
 248 #    ifdef EXCL_FORCES
 249     /* Only remove the (sub-)diagonal to avoid double counting */
 250 #        if UNROLLJ == UNROLLI
 251     if (cj == ci_sh)
 252     {
 253         wco_S0 = wco_S0 && diagonal_mask_S0;
 254         wco_S2 = wco_S2 && diagonal_mask_S2;
 255     }
 256 #        else
 257 #            if UNROLLJ == 2 * UNROLLI
 258     if (cj * 2 == ci_sh)
 259     {
 260         wco_S0 = wco_S0 && diagonal_mask0_S0;
 261         wco_S2 = wco_S2 && diagonal_mask0_S2;
 262     }
 263     else if (cj * 2 + 1 == ci_sh)
 264     {
 265         wco_S0 = wco_S0 && diagonal_mask1_S0;
 266         wco_S2 = wco_S2 && diagonal_mask1_S2;
 267     }
 268 #            else
 269 #                error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
 270 #            endif
 271 #        endif
 272 #    else /* EXCL_FORCES */
 273     /* No exclusion forces: remove all excluded atom pairs from the list */
 274     wco_S0 = wco_S0 && interact_S0;
 275     wco_S2 = wco_S2 && interact_S2;
 276 #    endif
 277 #endif
 278
 279 #ifdef COUNT_PAIRS
 280     {
 281         int                              i, j;
 282         alignas(GMX_SIMD_ALIGNMENT) real tmp[GMX_SIMD_REAL_WIDTH];
 283
 284         for (i = 0; i < UNROLLI; i += 2)
 285         {
 286             store(tmp, rc2_S - (i == 0 ? rsq_S0 : rsq_S2));
 287             for (j = 0; j < 2 * UNROLLJ; j++)
 288             {
 289                 if (tmp[j] >= 0)
 290                 {
 291                     npair++;
 292                 }
 293             }
 294         }
 295     }
 296 #endif
 297
 298     // Ensure the distances do not fall below the limit where r^-12 overflows.
 299     // This should never happen for normal interactions.
 300     rsq_S0 = max(rsq_S0, minRsq_S);
 301     rsq_S2 = max(rsq_S2, minRsq_S);
 302
 303     /* Calculate 1/r */
 304     rinv_S0 = invsqrt(rsq_S0);
 305     rinv_S2 = invsqrt(rsq_S2);
 306
 307 #ifdef CALC_COULOMB
 308     /* Load parameters for j atom */
 309     jq_S  = loadDuplicateHsimd(q + aj);
 310     qq_S0 = iq_S0 * jq_S;
 311     qq_S2 = iq_S2 * jq_S;
 312 #endif
 313
 314 #ifdef CALC_LJ
 315 #    if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
 316     SimdReal                                                     c6_S0, c12_S0;
 317     gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp0, nbfp1, type + aj, &c6_S0, &c12_S0);
 318 #        ifndef HALF_LJ
 319     SimdReal c6_S2, c12_S2;
 320     gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp2, nbfp3, type + aj, &c6_S2, &c12_S2);
 321 #        endif
 322 #    endif /* not defined any LJ rule */
 323
 324 #    ifdef LJ_COMB_GEOM
 325     c6s_j_S        = loadDuplicateHsimd(ljc + aj2);
 326     c12s_j_S       = loadDuplicateHsimd(ljc + aj2 + STRIDE);
 327     SimdReal c6_S0 = c6s_S0 * c6s_j_S;
 328 #        ifndef HALF_LJ
 329     SimdReal c6_S2 = c6s_S2 * c6s_j_S;
 330 #        endif
 331     SimdReal c12_S0 = c12s_S0 * c12s_j_S;
 332 #        ifndef HALF_LJ
 333     SimdReal c12_S2 = c12s_S2 * c12s_j_S;
 334 #        endif
 335 #    endif /* LJ_COMB_GEOM */
 336
 337 #    ifdef LJ_COMB_LB
 338     hsig_j_S = loadDuplicateHsimd(ljc + aj2);
 339     seps_j_S = loadDuplicateHsimd(ljc + aj2 + STRIDE);
 340
 341     sig_S0 = hsig_i_S0 + hsig_j_S;
 342     eps_S0 = seps_i_S0 * seps_j_S;
 343 #        ifndef HALF_LJ
 344     sig_S2 = hsig_i_S2 + hsig_j_S;
 345     eps_S2 = seps_i_S2 * seps_j_S;
 346 #        endif
 347 #    endif /* LJ_COMB_LB */
 348
 349 #endif /* CALC_LJ */
 350
 351     /* Set rinv to zero for r beyond the cut-off */
 352     rinv_S0 = selectByMask(rinv_S0, wco_S0);
 353     rinv_S2 = selectByMask(rinv_S2, wco_S2);
 354
 355     rinvsq_S0 = rinv_S0 * rinv_S0;
 356     rinvsq_S2 = rinv_S2 * rinv_S2;
 357
 358 #ifdef CALC_COULOMB
 359     /* Note that here we calculate force*r, not the usual force/r.
 360      * This allows avoiding masking the reaction-field contribution,
 361      * as frcoul is later multiplied by rinvsq which has been
 362      * masked with the cut-off check.
 363      */
 364
 365 #    ifdef EXCL_FORCES
 366     /* Only add 1/r for non-excluded atom pairs */
 367     rinv_ex_S0 = selectByMask(rinv_S0, interact_S0);
 368     rinv_ex_S2 = selectByMask(rinv_S2, interact_S2);
 369 #    else
 370     /* No exclusion forces, we always need 1/r */
 371 #        define rinv_ex_S0 rinv_S0
 372 #        define rinv_ex_S2 rinv_S2
 373 #    endif
 374
 375 #    ifdef CALC_COUL_RF
 376     /* Electrostatic interactions */
 377     frcoul_S0 = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
 378     frcoul_S2 = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
 379
 380 #        ifdef CALC_ENERGIES
 381     vcoul_S0 = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
 382     vcoul_S2 = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
 383 #        endif
 384 #    endif
 385
 386 #    ifdef CALC_COUL_EWALD
 387     /* We need to mask (or limit) rsq for the cut-off,
 388      * as large distances can cause an overflow in gmx_pmecorrF/V.
 389      */
 390     brsq_S0   = beta2_S * selectByMask(rsq_S0, wco_S0);
 391     brsq_S2   = beta2_S * selectByMask(rsq_S2, wco_S2);
 392     ewcorr_S0 = beta_S * pmeForceCorrection(brsq_S0);
 393     ewcorr_S2 = beta_S * pmeForceCorrection(brsq_S2);
 394     frcoul_S0 = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
 395     frcoul_S2 = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
 396
 397 #        ifdef CALC_ENERGIES
 398     vc_sub_S0 = beta_S * pmePotentialCorrection(brsq_S0);
 399     vc_sub_S2 = beta_S * pmePotentialCorrection(brsq_S2);
 400 #        endif
 401
 402 #    endif /* CALC_COUL_EWALD */
 403
 404 #    ifdef CALC_COUL_TAB
 405     /* Electrostatic interactions */
 406     r_S0 = rsq_S0 * rinv_S0;
 407     r_S2 = rsq_S2 * rinv_S2;
 408     /* Convert r to scaled table units */
 409     rs_S0 = r_S0 * invtsp_S;
 410     rs_S2 = r_S2 * invtsp_S;
 411     /* Truncate scaled r to an int */
 412     ti_S0 = cvttR2I(rs_S0);
 413     ti_S2 = cvttR2I(rs_S2);
 414
 415     rf_S0 = trunc(rs_S0);
 416     rf_S2 = trunc(rs_S2);
 417
 418     frac_S0 = rs_S0 - rf_S0;
 419     frac_S2 = rs_S2 - rf_S2;
 420
 421     /* Load and interpolate table forces and possibly energies.
 422      * Force and energy can be combined in one table, stride 4: FDV0
 423      * or in two separate tables with stride 1: F and V
 424      * Currently single precision uses FDV0, double F and V.
 425      */
 426 #        ifndef CALC_ENERGIES
 427 #            ifdef TAB_FDV0
 428     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 429     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 430 #            else
 431     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 432     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 433     ctab1_S0  = ctab1_S0 - ctab0_S0;
 434     ctab1_S2  = ctab1_S2 - ctab0_S2;
 435 #            endif
 436 #        else
 437 #            ifdef TAB_FDV0
 438     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
 439     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
 440 #            else
 441     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 442     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
 443     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 444     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
 445     ctab1_S0 = ctab1_S0 - ctab0_S0;
 446     ctab1_S2 = ctab1_S2 - ctab0_S2;
 447 #            endif
 448 #        endif
 449     fsub_S0   = fma(frac_S0, ctab1_S0, ctab0_S0);
 450     fsub_S2   = fma(frac_S2, ctab1_S2, ctab0_S2);
 451     frcoul_S0 = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
 452     frcoul_S2 = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
 453
 454 #        ifdef CALC_ENERGIES
 455     vc_sub_S0 = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
 456     vc_sub_S2 = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
 457 #        endif
 458 #    endif /* CALC_COUL_TAB */
 459
 460 #    if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 461 #        ifndef NO_SHIFT_EWALD
 462     /* Add Ewald potential shift to vc_sub for convenience */
 463 #            ifdef CHECK_EXCLS
 464     vc_sub_S0 = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
 465     vc_sub_S2 = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
 466 #            else
 467     vc_sub_S0 = vc_sub_S0 + sh_ewald_S;
 468     vc_sub_S2 = vc_sub_S2 + sh_ewald_S;
 469 #            endif
 470 #        endif
 471
 472     vcoul_S0 = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
 473     vcoul_S2 = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
 474
 475 #    endif
 476
 477 #    ifdef CALC_ENERGIES
 478     /* Mask energy for cut-off and diagonal */
 479     vcoul_S0 = selectByMask(vcoul_S0, wco_S0);
 480     vcoul_S2 = selectByMask(vcoul_S2, wco_S2);
 481 #    endif
 482
 483 #endif /* CALC_COULOMB */
 484
 485 #ifdef CALC_LJ
 486     /* Lennard-Jones interaction */
 487
 488 #    ifdef VDW_CUTOFF_CHECK
 489     wco_vdw_S0 = (rsq_S0 < rcvdw2_S);
 490 #        ifndef HALF_LJ
 491     wco_vdw_S2 = (rsq_S2 < rcvdw2_S);
 492 #        endif
 493 #    else
 494     /* Same cut-off for Coulomb and VdW, reuse the registers */
 495 #        define wco_vdw_S0 wco_S0
 496 #        define wco_vdw_S2 wco_S2
 497 #    endif
 498
 499 #    ifndef LJ_COMB_LB
 500     rinvsix_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 501 #        ifdef EXCL_FORCES
 502     rinvsix_S0 = selectByMask(rinvsix_S0, interact_S0);
 503 #        endif
 504 #        ifndef HALF_LJ
 505     rinvsix_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 506 #            ifdef EXCL_FORCES
 507     rinvsix_S2 = selectByMask(rinvsix_S2, interact_S2);
 508 #            endif
 509 #        endif
 510
 511 #        if defined LJ_CUT || defined LJ_POT_SWITCH
 512     /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
 513     FrLJ6_S0 = c6_S0 * rinvsix_S0;
 514 #            ifndef HALF_LJ
 515     FrLJ6_S2 = c6_S2 * rinvsix_S2;
 516 #            endif
 517     FrLJ12_S0 = c12_S0 * rinvsix_S0 * rinvsix_S0;
 518 #            ifndef HALF_LJ
 519     FrLJ12_S2 = c12_S2 * rinvsix_S2 * rinvsix_S2;
 520 #            endif
 521 #        endif
 522
 523 #        if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 524     /* We switch the LJ force */
 525     r_S0    = rsq_S0 * rinv_S0;
 526     rsw_S0  = max(r_S0 - rswitch_S, zero_S);
 527     rsw2_S0 = rsw_S0 * rsw_S0;
 528 #            ifndef HALF_LJ
 529     r_S2    = rsq_S2 * rinv_S2;
 530     rsw_S2  = max(r_S2 - rswitch_S, zero_S);
 531     rsw2_S2 = rsw_S2 * rsw_S2;
 532 #            endif
 533 #        endif
 534
 535 #        ifdef LJ_FORCE_SWITCH
 536
 537 #            define add_fr_switch(fr, rsw, rsw2_r, c2, c3) fma(fma(c3, rsw, c2), rsw2_r, fr)
 538     SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
 539     FrLJ6_S0           = c6_S0 * add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
 540 #            ifndef HALF_LJ
 541     SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
 542     FrLJ6_S2           = c6_S2 * add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
 543 #            endif
 544     FrLJ12_S0 = c12_S0 * add_fr_switch(rinvsix_S0 * rinvsix_S0, rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
 545 #            ifndef HALF_LJ
 546     FrLJ12_S2 = c12_S2 * add_fr_switch(rinvsix_S2 * rinvsix_S2, rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
 547 #            endif
 548 #            undef add_fr_switch
 549 #        endif /* LJ_FORCE_SWITCH */
 550
 551 #    endif /* not LJ_COMB_LB */
 552
 553 #    ifdef LJ_COMB_LB
 554     sir_S0 = sig_S0 * rinv_S0;
 555 #        ifndef HALF_LJ
 556     sir_S2 = sig_S2 * rinv_S2;
 557 #        endif
 558     sir2_S0 = sir_S0 * sir_S0;
 559 #        ifndef HALF_LJ
 560     sir2_S2 = sir_S2 * sir_S2;
 561 #        endif
 562     sir6_S0 = sir2_S0 * sir2_S0 * sir2_S0;
 563 #        ifdef EXCL_FORCES
 564     sir6_S0 = selectByMask(sir6_S0, interact_S0);
 565 #        endif
 566 #        ifndef HALF_LJ
 567     sir6_S2 = sir2_S2 * sir2_S2 * sir2_S2;
 568 #            ifdef EXCL_FORCES
 569     sir6_S2 = selectByMask(sir6_S2, interact_S2);
 570 #            endif
 571 #        endif
 572 #        ifdef VDW_CUTOFF_CHECK
 573     sir6_S0 = selectByMask(sir6_S0, wco_vdw_S0);
 574 #            ifndef HALF_LJ
 575     sir6_S2 = selectByMask(sir6_S2, wco_vdw_S2);
 576 #            endif
 577 #        endif
 578     FrLJ6_S0 = eps_S0 * sir6_S0;
 579 #        ifndef HALF_LJ
 580     FrLJ6_S2 = eps_S2 * sir6_S2;
 581 #        endif
 582     FrLJ12_S0 = FrLJ6_S0 * sir6_S0;
 583 #        ifndef HALF_LJ
 584     FrLJ12_S2 = FrLJ6_S2 * sir6_S2;
 585 #        endif
 586 #        if defined CALC_ENERGIES
 587     /* We need C6 and C12 to calculate the LJ potential shift */
 588     sig2_S0 = sig_S0 * sig_S0;
 589 #            ifndef HALF_LJ
 590     sig2_S2 = sig_S2 * sig_S2;
 591 #            endif
 592     sig6_S0 = sig2_S0 * sig2_S0 * sig2_S0;
 593 #            ifndef HALF_LJ
 594     sig6_S2 = sig2_S2 * sig2_S2 * sig2_S2;
 595 #            endif
 596     SimdReal c6_S0 = eps_S0 * sig6_S0;
 597 #            ifndef HALF_LJ
 598     SimdReal c6_S2 = eps_S2 * sig6_S2;
 599 #            endif
 600     SimdReal c12_S0 = c6_S0 * sig6_S0;
 601 #            ifndef HALF_LJ
 602     SimdReal c12_S2 = c6_S2 * sig6_S2;
 603 #            endif
 604 #        endif
 605 #    endif /* LJ_COMB_LB */
 606
 607     /* Determine the total scalar LJ force*r */
 608     frLJ_S0 = FrLJ12_S0 - FrLJ6_S0;
 609 #    ifndef HALF_LJ
 610     frLJ_S2 = FrLJ12_S2 - FrLJ6_S2;
 611 #    endif
 612
 613 #    if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
 614
 615 #        ifdef LJ_CUT
 616     /* Calculate the LJ energies, with constant potential shift */
 617     SimdReal VLJ6_S0 = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
 618 #            ifndef HALF_LJ
 619     SimdReal VLJ6_S2 = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
 620 #            endif
 621     SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
 622 #            ifndef HALF_LJ
 623     SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
 624 #            endif
 625 #        endif /* LJ_CUT */
 626
 627 #        ifdef LJ_FORCE_SWITCH
 628 #            define v_fswitch_pr(rsw, rsw2, c0, c3, c4) fma(fma(c4, rsw, c3), (rsw2) * (rsw), c0)
 629
 630     SimdReal VLJ6_S0 =
 631             c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 632 #            ifndef HALF_LJ
 633     SimdReal VLJ6_S2 =
 634             c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 635 #            endif
 636     SimdReal VLJ12_S0 = c12_S0
 637                         * fma(twelveth_S, rinvsix_S0 * rinvsix_S0,
 638                               v_fswitch_pr(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 639 #            ifndef HALF_LJ
 640     SimdReal VLJ12_S2 = c12_S2
 641                         * fma(twelveth_S, rinvsix_S2 * rinvsix_S2,
 642                               v_fswitch_pr(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 643 #            endif
 644 #            undef v_fswitch_pr
 645 #        endif /* LJ_FORCE_SWITCH */
 646
 647     /* Add up the repulsion and dispersion */
 648     SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
 649 #        ifndef HALF_LJ
 650     SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
 651 #        endif
 652
 653 #    endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
 654
 655 #    ifdef LJ_POT_SWITCH
 656     /* We always need the potential, since it is needed for the force */
 657     SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
 658 #        ifndef HALF_LJ
 659     SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
 660 #        endif
 661
 662     {
 663         SimdReal sw_S0, dsw_S0;
 664 #        ifndef HALF_LJ
 665         SimdReal sw_S2, dsw_S2;
 666 #        endif
 667
 668 #        define switch_pr(rsw, rsw2, c3, c4, c5) \
 669             fma(fma(fma(c5, rsw, c4), rsw, c3), (rsw2) * (rsw), one_S)
 670 #        define dswitch_pr(rsw, rsw2, c2, c3, c4) fma(fma(c4, rsw, c3), rsw, c2) * (rsw2)
 671
 672         sw_S0  = switch_pr(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
 673         dsw_S0 = dswitch_pr(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
 674 #        ifndef HALF_LJ
 675         sw_S2  = switch_pr(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
 676         dsw_S2 = dswitch_pr(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
 677 #        endif
 678         frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
 679 #        ifndef HALF_LJ
 680         frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
 681 #        endif
 682 #        ifdef CALC_ENERGIES
 683         VLJ_S0 = sw_S0 * VLJ_S0;
 684 #            ifndef HALF_LJ
 685         VLJ_S2 = sw_S2 * VLJ_S2;
 686 #            endif
 687 #        endif
 688
 689 #        undef switch_pr
 690 #        undef dswitch_pr
 691     }
 692 #    endif /* LJ_POT_SWITCH */
 693
 694 #    if defined CALC_ENERGIES && defined CHECK_EXCLS
 695     /* The potential shift should be removed for excluded pairs */
 696     VLJ_S0 = selectByMask(VLJ_S0, interact_S0);
 697 #        ifndef HALF_LJ
 698     VLJ_S2 = selectByMask(VLJ_S2, interact_S2);
 699 #        endif
 700 #    endif
 701
 702 #    ifdef LJ_EWALD_GEOM
 703     {
 704         SimdReal c6s_j_S;
 705         SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
 706 #        ifndef HALF_LJ
 707         SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
 708 #        endif
 709 #        ifdef CALC_ENERGIES
 710         SimdReal sh_mask_S0;
 711 #            ifndef HALF_LJ
 712         SimdReal sh_mask_S2;
 713 #            endif
 714 #        endif
 715
 716         /* Determine C6 for the grid using the geometric combination rule */
 717         c6s_j_S   = loadDuplicateHsimd(ljc + aj2);
 718         c6grid_S0 = c6s_S0 * c6s_j_S;
 719 #        ifndef HALF_LJ
 720         c6grid_S2 = c6s_S2 * c6s_j_S;
 721 #        endif
 722
 723 #        ifdef CHECK_EXCLS
 724         /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
 725         rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 726 #            ifndef HALF_LJ
 727         rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 728 #            endif
 729 #        else
 730         /* We didn't use a mask, so we can copy */
 731         rinvsix_nm_S0 = rinvsix_S0;
 732 #            ifndef HALF_LJ
 733         rinvsix_nm_S2 = rinvsix_S2;
 734 #            endif
 735 #        endif
 736
 737         /* Mask for the cut-off to avoid overflow of cr2^2 */
 738         cr2_S0 = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
 739 #        ifndef HALF_LJ
 740         cr2_S2 = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
 741 #        endif
 742         // Unsafe version of our exp() should be fine, since these arguments should never
 743         // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
 744         expmcr2_S0 = exp<MathOptimization::Unsafe>(-cr2_S0);
 745 #        ifndef HALF_LJ
 746         expmcr2_S2 = exp<MathOptimization::Unsafe>(-cr2_S2);
 747 #        endif
 748
 749         /* 1 + cr2 + 1/2*cr2^2 */
 750         poly_S0 = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
 751 #        ifndef HALF_LJ
 752         poly_S2 = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
 753 #        endif
 754
 755         /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
 756          * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
 757          */
 758         frLJ_S0 = fma(c6grid_S0,
 759                       fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
 760 #        ifndef HALF_LJ
 761         frLJ_S2 = fma(c6grid_S2,
 762                       fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
 763 #        endif
 764
 765 #        ifdef CALC_ENERGIES
 766 #            ifdef CHECK_EXCLS
 767         sh_mask_S0 = selectByMask(lje_vc_S, interact_S0);
 768 #                ifndef HALF_LJ
 769         sh_mask_S2 = selectByMask(lje_vc_S, interact_S2);
 770 #                endif
 771 #            else
 772         sh_mask_S0 = lje_vc_S;
 773 #                ifndef HALF_LJ
 774         sh_mask_S2 = lje_vc_S;
 775 #                endif
 776 #            endif
 777
 778         VLJ_S0 = fma(sixth_S * c6grid_S0,
 779                      fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
 780 #            ifndef HALF_LJ
 781         VLJ_S2 = fma(sixth_S * c6grid_S2,
 782                      fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
 783 #            endif
 784 #        endif /* CALC_ENERGIES */
 785     }
 786 #    endif /* LJ_EWALD_GEOM */
 787
 788 #    if defined VDW_CUTOFF_CHECK
 789     /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
 790      * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
 791      */
 792     frLJ_S0 = selectByMask(frLJ_S0, wco_vdw_S0);
 793 #        ifndef HALF_LJ
 794     frLJ_S2 = selectByMask(frLJ_S2, wco_vdw_S2);
 795 #        endif
 796 #    endif
 797
 798 #    ifdef CALC_ENERGIES
 799     /* The potential shift should be removed for pairs beyond cut-off */
 800     VLJ_S0 = selectByMask(VLJ_S0, wco_vdw_S0);
 801 #        ifndef HALF_LJ
 802     VLJ_S2 = selectByMask(VLJ_S2, wco_vdw_S2);
 803 #        endif
 804 #    endif
 805
 806 #endif /* CALC_LJ */
 807
 808 #ifdef CALC_ENERGIES
 809 #    ifdef ENERGY_GROUPS
 810     /* Extract the group pair index per j pair.
 811      * Energy groups are stored per i-cluster, so things get
 812      * complicated when the i- and j-cluster size don't match.
 813      */
 814     {
 815         int egps_j;
 816 #        if UNROLLJ == 2
 817         egps_j    = nbatParams.energrp[cj >> 1];
 818         egp_jj[0] = ((egps_j >> ((cj & 1) * egps_jshift)) & egps_jmask) * egps_jstride;
 819 #        else
 820         /* We assume UNROLLI <= UNROLLJ */
 821         int jdi;
 822         for (jdi = 0; jdi < UNROLLJ / UNROLLI; jdi++)
 823         {
 824             int jj;
 825             egps_j = nbatParams.energrp[cj * (UNROLLJ / UNROLLI) + jdi];
 826             for (jj = 0; jj < (UNROLLI / 2); jj++)
 827             {
 828                 egp_jj[jdi * (UNROLLI / 2) + jj] =
 829                         ((egps_j >> (jj * egps_jshift)) & egps_jmask) * egps_jstride;
 830             }
 831         }
 832 #        endif
 833     }
 834 #    endif
 835
 836 #    ifdef CALC_COULOMB
 837 #        ifndef ENERGY_GROUPS
 838     vctot_S = vctot_S + vcoul_S0 + vcoul_S2;
 839 #        else
 840     add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
 841     add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
 842 #        endif
 843 #    endif
 844
 845 #    ifdef CALC_LJ
 846 #        ifndef ENERGY_GROUPS
 847     Vvdwtot_S = Vvdwtot_S + VLJ_S0
 848 #            ifndef HALF_LJ
 849                 + VLJ_S2
 850 #            endif
 851             ;
 852 #        else
 853     add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
 854 #            ifndef HALF_LJ
 855     add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
 856 #            endif
 857 #        endif
 858 #    endif /* CALC_LJ */
 859 #endif     /* CALC_ENERGIES */
 860
 861 #ifdef CALC_LJ
 862 #    ifdef CALC_COULOMB
 863     fscal_S0 = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
 864 #    else
 865     fscal_S0 = rinvsq_S0 * frLJ_S0;
 866 #    endif
 867 #else
 868     fscal_S0 = rinvsq_S0 * frcoul_S0;
 869 #endif /* CALC_LJ */
 870 #if defined CALC_LJ && !defined HALF_LJ
 871 #    ifdef CALC_COULOMB
 872     fscal_S2 = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
 873 #    else
 874     fscal_S2 = rinvsq_S2 * frLJ_S2;
 875 #    endif
 876 #else
 877     /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
 878     fscal_S2 = rinvsq_S2 * frcoul_S2;
 879 #endif
 880
 881     /* Calculate temporary vectorial force */
 882     tx_S0 = fscal_S0 * dx_S0;
 883     tx_S2 = fscal_S2 * dx_S2;
 884     ty_S0 = fscal_S0 * dy_S0;
 885     ty_S2 = fscal_S2 * dy_S2;
 886     tz_S0 = fscal_S0 * dz_S0;
 887     tz_S2 = fscal_S2 * dz_S2;
 888
 889     /* Increment i atom force */
 890     fix_S0 = fix_S0 + tx_S0;
 891     fix_S2 = fix_S2 + tx_S2;
 892     fiy_S0 = fiy_S0 + ty_S0;
 893     fiy_S2 = fiy_S2 + ty_S2;
 894     fiz_S0 = fiz_S0 + tz_S0;
 895     fiz_S2 = fiz_S2 + tz_S2;
 896
 897     /* Decrement j atom force */
 898     decrHsimd(f + ajx, tx_S0 + tx_S2);
 899     decrHsimd(f + ajy, ty_S0 + ty_S2);
 900     decrHsimd(f + ajz, tz_S0 + tz_S2);
 901 }
 902
 903 #undef rinv_ex_S0
 904 #undef rinv_ex_S2
 905
 906 #undef wco_vdw_S0
 907 #undef wco_vdw_S2
 908
 909 #undef EXCL_FORCES