Split lines with many copyright years
[gromacs.git] / src / gromacs / nbnxm / kernels_simd_2xmm / kernel_inner.h
bloba2dd3b605d9d220fa24549f3278affb962d5e64e
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5 * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7 * and including many others, as listed in the AUTHORS file in the
8 * top-level source directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
37 /* This is the innermost loop contents for the 4 x N atom simd kernel.
38 * This flavor of the kernel duplicates the data for N j-particles in
39 * 2xN wide simd registers to do operate on 2 i-particles at once.
40 * This leads to 4/2=2 sets of most instructions. Therefore we call
41 * this kernel 2x(N+N) = 2xnn
43 * This 2xnn kernel is basically the 4xn equivalent with half the registers
44 * and instructions removed.
46 * An alternative would be to load to different cluster of N j-particles
47 * into simd registers, giving a 4x(N+N) kernel. This doubles the amount
48 * of instructions, which could lead to better scheduling. But we actually
49 * observed worse scheduling for the AVX-256 4x8 normal analytical PME
50 * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
51 * It could be worth trying this option, but it takes some more effort.
52 * This 2xnn kernel is basically the 4xn equivalent with
56 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
57 * forces on excluded atom pairs here in the non-bonded loops.
58 * But when energies and/or virial is required we calculate them
59 * separately to as then it is easier to separate the energy and virial
60 * contributions.
62 #if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
63 # define EXCL_FORCES
64 #endif
67 int cj, aj, ajx, ajy, ajz;
69 #ifdef ENERGY_GROUPS
70 /* Energy group indices for two atoms packed into one int */
71 int egp_jj[UNROLLJ / 2];
72 #endif
74 #ifdef CHECK_EXCLS
75 /* Interaction (non-exclusion) mask of all 1's or 0's */
76 SimdBool interact_S0;
77 SimdBool interact_S2;
78 #endif
80 SimdReal jx_S, jy_S, jz_S;
81 SimdReal dx_S0, dy_S0, dz_S0;
82 SimdReal dx_S2, dy_S2, dz_S2;
83 SimdReal tx_S0, ty_S0, tz_S0;
84 SimdReal tx_S2, ty_S2, tz_S2;
85 SimdReal rsq_S0, rinv_S0, rinvsq_S0;
86 SimdReal rsq_S2, rinv_S2, rinvsq_S2;
87 /* wco: within cut-off, mask of all 1's or 0's */
88 SimdBool wco_S0;
89 SimdBool wco_S2;
90 #ifdef VDW_CUTOFF_CHECK
91 SimdBool wco_vdw_S0;
92 # ifndef HALF_LJ
93 SimdBool wco_vdw_S2;
94 # endif
95 #endif
97 #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
98 SimdReal r_S0;
99 # if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
100 SimdReal r_S2;
101 # endif
102 #endif
104 #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
105 SimdReal rsw_S0, rsw2_S0;
106 # ifndef HALF_LJ
107 SimdReal rsw_S2, rsw2_S2;
108 # endif
109 #endif
111 #ifdef CALC_COULOMB
112 # ifdef CHECK_EXCLS
113 /* 1/r masked with the interaction mask */
114 SimdReal rinv_ex_S0;
115 SimdReal rinv_ex_S2;
116 # endif
117 SimdReal jq_S;
118 SimdReal qq_S0;
119 SimdReal qq_S2;
120 # ifdef CALC_COUL_TAB
121 /* The force (PME mesh force) we need to subtract from 1/r^2 */
122 SimdReal fsub_S0;
123 SimdReal fsub_S2;
124 # endif
125 # ifdef CALC_COUL_EWALD
126 SimdReal brsq_S0, brsq_S2;
127 SimdReal ewcorr_S0, ewcorr_S2;
128 # endif
130 /* frcoul = (1/r - fsub)*r */
131 SimdReal frcoul_S0;
132 SimdReal frcoul_S2;
133 # ifdef CALC_COUL_TAB
134 /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
135 SimdReal rs_S0, rf_S0, frac_S0;
136 SimdReal rs_S2, rf_S2, frac_S2;
137 /* Table index: rs truncated to an int */
138 SimdInt32 ti_S0, ti_S2;
139 /* Linear force table values */
140 SimdReal ctab0_S0, ctab1_S0;
141 SimdReal ctab0_S2, ctab1_S2;
142 # ifdef CALC_ENERGIES
143 /* Quadratic energy table value */
144 SimdReal ctabv_S0, dum_S0;
145 SimdReal ctabv_S2, dum_S2;
146 # endif
147 # endif
148 # if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
149 /* The potential (PME mesh) we need to subtract from 1/r */
150 SimdReal vc_sub_S0;
151 SimdReal vc_sub_S2;
152 # endif
153 # ifdef CALC_ENERGIES
154 /* Electrostatic potential */
155 SimdReal vcoul_S0;
156 SimdReal vcoul_S2;
157 # endif
158 #endif
159 /* The force times 1/r */
160 SimdReal fscal_S0;
161 SimdReal fscal_S2;
163 #ifdef CALC_LJ
164 # ifdef LJ_COMB_LB
165 /* LJ sigma_j/2 and sqrt(epsilon_j) */
166 SimdReal hsig_j_S, seps_j_S;
167 /* LJ sigma_ij and epsilon_ij */
168 SimdReal sig_S0, eps_S0;
169 # ifndef HALF_LJ
170 SimdReal sig_S2, eps_S2;
171 # endif
172 # ifdef CALC_ENERGIES
173 SimdReal sig2_S0, sig6_S0;
174 # ifndef HALF_LJ
175 SimdReal sig2_S2, sig6_S2;
176 # endif
177 # endif /* LJ_COMB_LB */
178 # endif /* CALC_LJ */
180 # ifdef LJ_COMB_GEOM
181 SimdReal c6s_j_S, c12s_j_S;
182 # endif
184 # if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
185 /* Index for loading LJ parameters, complicated when interleaving */
186 int aj2;
187 # endif
189 /* Intermediate variables for LJ calculation */
190 # ifndef LJ_COMB_LB
191 SimdReal rinvsix_S0;
192 # ifndef HALF_LJ
193 SimdReal rinvsix_S2;
194 # endif
195 # endif
196 # ifdef LJ_COMB_LB
197 SimdReal sir_S0, sir2_S0, sir6_S0;
198 # ifndef HALF_LJ
199 SimdReal sir_S2, sir2_S2, sir6_S2;
200 # endif
201 # endif
203 SimdReal FrLJ6_S0, FrLJ12_S0, frLJ_S0;
204 # ifndef HALF_LJ
205 SimdReal FrLJ6_S2, FrLJ12_S2, frLJ_S2;
206 # endif
207 #endif /* CALC_LJ */
209 /* j-cluster index */
210 cj = l_cj[cjind].cj;
212 /* Atom indices (of the first atom in the cluster) */
213 aj = cj * UNROLLJ;
214 #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
215 aj2 = aj * 2;
216 #endif
217 ajx = aj * DIM;
218 ajy = ajx + STRIDE;
219 ajz = ajy + STRIDE;
221 #ifdef CHECK_EXCLS
222 gmx_load_simd_2xnn_interactions(static_cast<int>(l_cj[cjind].excl), filter_S0, filter_S2,
223 &interact_S0, &interact_S2);
224 #endif /* CHECK_EXCLS */
226 /* load j atom coordinates */
227 jx_S = loadDuplicateHsimd(x + ajx);
228 jy_S = loadDuplicateHsimd(x + ajy);
229 jz_S = loadDuplicateHsimd(x + ajz);
231 /* Calculate distance */
232 dx_S0 = ix_S0 - jx_S;
233 dy_S0 = iy_S0 - jy_S;
234 dz_S0 = iz_S0 - jz_S;
235 dx_S2 = ix_S2 - jx_S;
236 dy_S2 = iy_S2 - jy_S;
237 dz_S2 = iz_S2 - jz_S;
239 /* rsq = dx*dx+dy*dy+dz*dz */
240 rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
241 rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
243 /* Do the cut-off check */
244 wco_S0 = (rsq_S0 < rc2_S);
245 wco_S2 = (rsq_S2 < rc2_S);
247 #ifdef CHECK_EXCLS
248 # ifdef EXCL_FORCES
249 /* Only remove the (sub-)diagonal to avoid double counting */
250 # if UNROLLJ == UNROLLI
251 if (cj == ci_sh)
253 wco_S0 = wco_S0 && diagonal_mask_S0;
254 wco_S2 = wco_S2 && diagonal_mask_S2;
256 # else
257 # if UNROLLJ == 2 * UNROLLI
258 if (cj * 2 == ci_sh)
260 wco_S0 = wco_S0 && diagonal_mask0_S0;
261 wco_S2 = wco_S2 && diagonal_mask0_S2;
263 else if (cj * 2 + 1 == ci_sh)
265 wco_S0 = wco_S0 && diagonal_mask1_S0;
266 wco_S2 = wco_S2 && diagonal_mask1_S2;
268 # else
269 # error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
270 # endif
271 # endif
272 # else /* EXCL_FORCES */
273 /* No exclusion forces: remove all excluded atom pairs from the list */
274 wco_S0 = wco_S0 && interact_S0;
275 wco_S2 = wco_S2 && interact_S2;
276 # endif
277 #endif
279 #ifdef COUNT_PAIRS
281 int i, j;
282 alignas(GMX_SIMD_ALIGNMENT) real tmp[GMX_SIMD_REAL_WIDTH];
284 for (i = 0; i < UNROLLI; i += 2)
286 store(tmp, rc2_S - (i == 0 ? rsq_S0 : rsq_S2));
287 for (j = 0; j < 2 * UNROLLJ; j++)
289 if (tmp[j] >= 0)
291 npair++;
296 #endif
298 // Ensure the distances do not fall below the limit where r^-12 overflows.
299 // This should never happen for normal interactions.
300 rsq_S0 = max(rsq_S0, minRsq_S);
301 rsq_S2 = max(rsq_S2, minRsq_S);
303 /* Calculate 1/r */
304 rinv_S0 = invsqrt(rsq_S0);
305 rinv_S2 = invsqrt(rsq_S2);
307 #ifdef CALC_COULOMB
308 /* Load parameters for j atom */
309 jq_S = loadDuplicateHsimd(q + aj);
310 qq_S0 = iq_S0 * jq_S;
311 qq_S2 = iq_S2 * jq_S;
312 #endif
314 #ifdef CALC_LJ
315 # if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
316 SimdReal c6_S0, c12_S0;
317 gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp0, nbfp1, type + aj, &c6_S0, &c12_S0);
318 # ifndef HALF_LJ
319 SimdReal c6_S2, c12_S2;
320 gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp2, nbfp3, type + aj, &c6_S2, &c12_S2);
321 # endif
322 # endif /* not defined any LJ rule */
324 # ifdef LJ_COMB_GEOM
325 c6s_j_S = loadDuplicateHsimd(ljc + aj2);
326 c12s_j_S = loadDuplicateHsimd(ljc + aj2 + STRIDE);
327 SimdReal c6_S0 = c6s_S0 * c6s_j_S;
328 # ifndef HALF_LJ
329 SimdReal c6_S2 = c6s_S2 * c6s_j_S;
330 # endif
331 SimdReal c12_S0 = c12s_S0 * c12s_j_S;
332 # ifndef HALF_LJ
333 SimdReal c12_S2 = c12s_S2 * c12s_j_S;
334 # endif
335 # endif /* LJ_COMB_GEOM */
337 # ifdef LJ_COMB_LB
338 hsig_j_S = loadDuplicateHsimd(ljc + aj2);
339 seps_j_S = loadDuplicateHsimd(ljc + aj2 + STRIDE);
341 sig_S0 = hsig_i_S0 + hsig_j_S;
342 eps_S0 = seps_i_S0 * seps_j_S;
343 # ifndef HALF_LJ
344 sig_S2 = hsig_i_S2 + hsig_j_S;
345 eps_S2 = seps_i_S2 * seps_j_S;
346 # endif
347 # endif /* LJ_COMB_LB */
349 #endif /* CALC_LJ */
351 /* Set rinv to zero for r beyond the cut-off */
352 rinv_S0 = selectByMask(rinv_S0, wco_S0);
353 rinv_S2 = selectByMask(rinv_S2, wco_S2);
355 rinvsq_S0 = rinv_S0 * rinv_S0;
356 rinvsq_S2 = rinv_S2 * rinv_S2;
358 #ifdef CALC_COULOMB
359 /* Note that here we calculate force*r, not the usual force/r.
360 * This allows avoiding masking the reaction-field contribution,
361 * as frcoul is later multiplied by rinvsq which has been
362 * masked with the cut-off check.
365 # ifdef EXCL_FORCES
366 /* Only add 1/r for non-excluded atom pairs */
367 rinv_ex_S0 = selectByMask(rinv_S0, interact_S0);
368 rinv_ex_S2 = selectByMask(rinv_S2, interact_S2);
369 # else
370 /* No exclusion forces, we always need 1/r */
371 # define rinv_ex_S0 rinv_S0
372 # define rinv_ex_S2 rinv_S2
373 # endif
375 # ifdef CALC_COUL_RF
376 /* Electrostatic interactions */
377 frcoul_S0 = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
378 frcoul_S2 = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
380 # ifdef CALC_ENERGIES
381 vcoul_S0 = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
382 vcoul_S2 = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
383 # endif
384 # endif
386 # ifdef CALC_COUL_EWALD
387 /* We need to mask (or limit) rsq for the cut-off,
388 * as large distances can cause an overflow in gmx_pmecorrF/V.
390 brsq_S0 = beta2_S * selectByMask(rsq_S0, wco_S0);
391 brsq_S2 = beta2_S * selectByMask(rsq_S2, wco_S2);
392 ewcorr_S0 = beta_S * pmeForceCorrection(brsq_S0);
393 ewcorr_S2 = beta_S * pmeForceCorrection(brsq_S2);
394 frcoul_S0 = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
395 frcoul_S2 = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
397 # ifdef CALC_ENERGIES
398 vc_sub_S0 = beta_S * pmePotentialCorrection(brsq_S0);
399 vc_sub_S2 = beta_S * pmePotentialCorrection(brsq_S2);
400 # endif
402 # endif /* CALC_COUL_EWALD */
404 # ifdef CALC_COUL_TAB
405 /* Electrostatic interactions */
406 r_S0 = rsq_S0 * rinv_S0;
407 r_S2 = rsq_S2 * rinv_S2;
408 /* Convert r to scaled table units */
409 rs_S0 = r_S0 * invtsp_S;
410 rs_S2 = r_S2 * invtsp_S;
411 /* Truncate scaled r to an int */
412 ti_S0 = cvttR2I(rs_S0);
413 ti_S2 = cvttR2I(rs_S2);
415 rf_S0 = trunc(rs_S0);
416 rf_S2 = trunc(rs_S2);
418 frac_S0 = rs_S0 - rf_S0;
419 frac_S2 = rs_S2 - rf_S2;
421 /* Load and interpolate table forces and possibly energies.
422 * Force and energy can be combined in one table, stride 4: FDV0
423 * or in two separate tables with stride 1: F and V
424 * Currently single precision uses FDV0, double F and V.
426 # ifndef CALC_ENERGIES
427 # ifdef TAB_FDV0
428 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
429 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
430 # else
431 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
432 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
433 ctab1_S0 = ctab1_S0 - ctab0_S0;
434 ctab1_S2 = ctab1_S2 - ctab0_S2;
435 # endif
436 # else
437 # ifdef TAB_FDV0
438 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
439 gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
440 # else
441 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
442 gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
443 gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
444 gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
445 ctab1_S0 = ctab1_S0 - ctab0_S0;
446 ctab1_S2 = ctab1_S2 - ctab0_S2;
447 # endif
448 # endif
449 fsub_S0 = fma(frac_S0, ctab1_S0, ctab0_S0);
450 fsub_S2 = fma(frac_S2, ctab1_S2, ctab0_S2);
451 frcoul_S0 = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
452 frcoul_S2 = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
454 # ifdef CALC_ENERGIES
455 vc_sub_S0 = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
456 vc_sub_S2 = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
457 # endif
458 # endif /* CALC_COUL_TAB */
460 # if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
461 # ifndef NO_SHIFT_EWALD
462 /* Add Ewald potential shift to vc_sub for convenience */
463 # ifdef CHECK_EXCLS
464 vc_sub_S0 = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
465 vc_sub_S2 = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
466 # else
467 vc_sub_S0 = vc_sub_S0 + sh_ewald_S;
468 vc_sub_S2 = vc_sub_S2 + sh_ewald_S;
469 # endif
470 # endif
472 vcoul_S0 = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
473 vcoul_S2 = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
475 # endif
477 # ifdef CALC_ENERGIES
478 /* Mask energy for cut-off and diagonal */
479 vcoul_S0 = selectByMask(vcoul_S0, wco_S0);
480 vcoul_S2 = selectByMask(vcoul_S2, wco_S2);
481 # endif
483 #endif /* CALC_COULOMB */
485 #ifdef CALC_LJ
486 /* Lennard-Jones interaction */
488 # ifdef VDW_CUTOFF_CHECK
489 wco_vdw_S0 = (rsq_S0 < rcvdw2_S);
490 # ifndef HALF_LJ
491 wco_vdw_S2 = (rsq_S2 < rcvdw2_S);
492 # endif
493 # else
494 /* Same cut-off for Coulomb and VdW, reuse the registers */
495 # define wco_vdw_S0 wco_S0
496 # define wco_vdw_S2 wco_S2
497 # endif
499 # ifndef LJ_COMB_LB
500 rinvsix_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
501 # ifdef EXCL_FORCES
502 rinvsix_S0 = selectByMask(rinvsix_S0, interact_S0);
503 # endif
504 # ifndef HALF_LJ
505 rinvsix_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
506 # ifdef EXCL_FORCES
507 rinvsix_S2 = selectByMask(rinvsix_S2, interact_S2);
508 # endif
509 # endif
511 # if defined LJ_CUT || defined LJ_POT_SWITCH
512 /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
513 FrLJ6_S0 = c6_S0 * rinvsix_S0;
514 # ifndef HALF_LJ
515 FrLJ6_S2 = c6_S2 * rinvsix_S2;
516 # endif
517 FrLJ12_S0 = c12_S0 * rinvsix_S0 * rinvsix_S0;
518 # ifndef HALF_LJ
519 FrLJ12_S2 = c12_S2 * rinvsix_S2 * rinvsix_S2;
520 # endif
521 # endif
523 # if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
524 /* We switch the LJ force */
525 r_S0 = rsq_S0 * rinv_S0;
526 rsw_S0 = max(r_S0 - rswitch_S, zero_S);
527 rsw2_S0 = rsw_S0 * rsw_S0;
528 # ifndef HALF_LJ
529 r_S2 = rsq_S2 * rinv_S2;
530 rsw_S2 = max(r_S2 - rswitch_S, zero_S);
531 rsw2_S2 = rsw_S2 * rsw_S2;
532 # endif
533 # endif
535 # ifdef LJ_FORCE_SWITCH
537 # define add_fr_switch(fr, rsw, rsw2_r, c2, c3) fma(fma(c3, rsw, c2), rsw2_r, fr)
538 SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
539 FrLJ6_S0 = c6_S0 * add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
540 # ifndef HALF_LJ
541 SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
542 FrLJ6_S2 = c6_S2 * add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
543 # endif
544 FrLJ12_S0 = c12_S0 * add_fr_switch(rinvsix_S0 * rinvsix_S0, rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
545 # ifndef HALF_LJ
546 FrLJ12_S2 = c12_S2 * add_fr_switch(rinvsix_S2 * rinvsix_S2, rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
547 # endif
548 # undef add_fr_switch
549 # endif /* LJ_FORCE_SWITCH */
551 # endif /* not LJ_COMB_LB */
553 # ifdef LJ_COMB_LB
554 sir_S0 = sig_S0 * rinv_S0;
555 # ifndef HALF_LJ
556 sir_S2 = sig_S2 * rinv_S2;
557 # endif
558 sir2_S0 = sir_S0 * sir_S0;
559 # ifndef HALF_LJ
560 sir2_S2 = sir_S2 * sir_S2;
561 # endif
562 sir6_S0 = sir2_S0 * sir2_S0 * sir2_S0;
563 # ifdef EXCL_FORCES
564 sir6_S0 = selectByMask(sir6_S0, interact_S0);
565 # endif
566 # ifndef HALF_LJ
567 sir6_S2 = sir2_S2 * sir2_S2 * sir2_S2;
568 # ifdef EXCL_FORCES
569 sir6_S2 = selectByMask(sir6_S2, interact_S2);
570 # endif
571 # endif
572 # ifdef VDW_CUTOFF_CHECK
573 sir6_S0 = selectByMask(sir6_S0, wco_vdw_S0);
574 # ifndef HALF_LJ
575 sir6_S2 = selectByMask(sir6_S2, wco_vdw_S2);
576 # endif
577 # endif
578 FrLJ6_S0 = eps_S0 * sir6_S0;
579 # ifndef HALF_LJ
580 FrLJ6_S2 = eps_S2 * sir6_S2;
581 # endif
582 FrLJ12_S0 = FrLJ6_S0 * sir6_S0;
583 # ifndef HALF_LJ
584 FrLJ12_S2 = FrLJ6_S2 * sir6_S2;
585 # endif
586 # if defined CALC_ENERGIES
587 /* We need C6 and C12 to calculate the LJ potential shift */
588 sig2_S0 = sig_S0 * sig_S0;
589 # ifndef HALF_LJ
590 sig2_S2 = sig_S2 * sig_S2;
591 # endif
592 sig6_S0 = sig2_S0 * sig2_S0 * sig2_S0;
593 # ifndef HALF_LJ
594 sig6_S2 = sig2_S2 * sig2_S2 * sig2_S2;
595 # endif
596 SimdReal c6_S0 = eps_S0 * sig6_S0;
597 # ifndef HALF_LJ
598 SimdReal c6_S2 = eps_S2 * sig6_S2;
599 # endif
600 SimdReal c12_S0 = c6_S0 * sig6_S0;
601 # ifndef HALF_LJ
602 SimdReal c12_S2 = c6_S2 * sig6_S2;
603 # endif
604 # endif
605 # endif /* LJ_COMB_LB */
607 /* Determine the total scalar LJ force*r */
608 frLJ_S0 = FrLJ12_S0 - FrLJ6_S0;
609 # ifndef HALF_LJ
610 frLJ_S2 = FrLJ12_S2 - FrLJ6_S2;
611 # endif
613 # if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
615 # ifdef LJ_CUT
616 /* Calculate the LJ energies, with constant potential shift */
617 SimdReal VLJ6_S0 = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
618 # ifndef HALF_LJ
619 SimdReal VLJ6_S2 = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
620 # endif
621 SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
622 # ifndef HALF_LJ
623 SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
624 # endif
625 # endif /* LJ_CUT */
627 # ifdef LJ_FORCE_SWITCH
628 # define v_fswitch_pr(rsw, rsw2, c0, c3, c4) fma(fma(c4, rsw, c3), (rsw2) * (rsw), c0)
630 SimdReal VLJ6_S0 =
631 c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
632 # ifndef HALF_LJ
633 SimdReal VLJ6_S2 =
634 c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
635 # endif
636 SimdReal VLJ12_S0 = c12_S0
637 * fma(twelveth_S, rinvsix_S0 * rinvsix_S0,
638 v_fswitch_pr(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
639 # ifndef HALF_LJ
640 SimdReal VLJ12_S2 = c12_S2
641 * fma(twelveth_S, rinvsix_S2 * rinvsix_S2,
642 v_fswitch_pr(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
643 # endif
644 # undef v_fswitch_pr
645 # endif /* LJ_FORCE_SWITCH */
647 /* Add up the repulsion and dispersion */
648 SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
649 # ifndef HALF_LJ
650 SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
651 # endif
653 # endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
655 # ifdef LJ_POT_SWITCH
656 /* We always need the potential, since it is needed for the force */
657 SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
658 # ifndef HALF_LJ
659 SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
660 # endif
663 SimdReal sw_S0, dsw_S0;
664 # ifndef HALF_LJ
665 SimdReal sw_S2, dsw_S2;
666 # endif
668 # define switch_pr(rsw, rsw2, c3, c4, c5) \
669 fma(fma(fma(c5, rsw, c4), rsw, c3), (rsw2) * (rsw), one_S)
670 # define dswitch_pr(rsw, rsw2, c2, c3, c4) fma(fma(c4, rsw, c3), rsw, c2) * (rsw2)
672 sw_S0 = switch_pr(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
673 dsw_S0 = dswitch_pr(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
674 # ifndef HALF_LJ
675 sw_S2 = switch_pr(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
676 dsw_S2 = dswitch_pr(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
677 # endif
678 frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
679 # ifndef HALF_LJ
680 frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
681 # endif
682 # ifdef CALC_ENERGIES
683 VLJ_S0 = sw_S0 * VLJ_S0;
684 # ifndef HALF_LJ
685 VLJ_S2 = sw_S2 * VLJ_S2;
686 # endif
687 # endif
689 # undef switch_pr
690 # undef dswitch_pr
692 # endif /* LJ_POT_SWITCH */
694 # if defined CALC_ENERGIES && defined CHECK_EXCLS
695 /* The potential shift should be removed for excluded pairs */
696 VLJ_S0 = selectByMask(VLJ_S0, interact_S0);
697 # ifndef HALF_LJ
698 VLJ_S2 = selectByMask(VLJ_S2, interact_S2);
699 # endif
700 # endif
702 # ifdef LJ_EWALD_GEOM
704 SimdReal c6s_j_S;
705 SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
706 # ifndef HALF_LJ
707 SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
708 # endif
709 # ifdef CALC_ENERGIES
710 SimdReal sh_mask_S0;
711 # ifndef HALF_LJ
712 SimdReal sh_mask_S2;
713 # endif
714 # endif
716 /* Determine C6 for the grid using the geometric combination rule */
717 c6s_j_S = loadDuplicateHsimd(ljc + aj2);
718 c6grid_S0 = c6s_S0 * c6s_j_S;
719 # ifndef HALF_LJ
720 c6grid_S2 = c6s_S2 * c6s_j_S;
721 # endif
723 # ifdef CHECK_EXCLS
724 /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
725 rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
726 # ifndef HALF_LJ
727 rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
728 # endif
729 # else
730 /* We didn't use a mask, so we can copy */
731 rinvsix_nm_S0 = rinvsix_S0;
732 # ifndef HALF_LJ
733 rinvsix_nm_S2 = rinvsix_S2;
734 # endif
735 # endif
737 /* Mask for the cut-off to avoid overflow of cr2^2 */
738 cr2_S0 = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
739 # ifndef HALF_LJ
740 cr2_S2 = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
741 # endif
742 // Unsafe version of our exp() should be fine, since these arguments should never
743 // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
744 expmcr2_S0 = exp<MathOptimization::Unsafe>(-cr2_S0);
745 # ifndef HALF_LJ
746 expmcr2_S2 = exp<MathOptimization::Unsafe>(-cr2_S2);
747 # endif
749 /* 1 + cr2 + 1/2*cr2^2 */
750 poly_S0 = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
751 # ifndef HALF_LJ
752 poly_S2 = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
753 # endif
755 /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
756 * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
758 frLJ_S0 = fma(c6grid_S0,
759 fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
760 # ifndef HALF_LJ
761 frLJ_S2 = fma(c6grid_S2,
762 fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
763 # endif
765 # ifdef CALC_ENERGIES
766 # ifdef CHECK_EXCLS
767 sh_mask_S0 = selectByMask(lje_vc_S, interact_S0);
768 # ifndef HALF_LJ
769 sh_mask_S2 = selectByMask(lje_vc_S, interact_S2);
770 # endif
771 # else
772 sh_mask_S0 = lje_vc_S;
773 # ifndef HALF_LJ
774 sh_mask_S2 = lje_vc_S;
775 # endif
776 # endif
778 VLJ_S0 = fma(sixth_S * c6grid_S0,
779 fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
780 # ifndef HALF_LJ
781 VLJ_S2 = fma(sixth_S * c6grid_S2,
782 fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
783 # endif
784 # endif /* CALC_ENERGIES */
786 # endif /* LJ_EWALD_GEOM */
788 # if defined VDW_CUTOFF_CHECK
789 /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
790 * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
792 frLJ_S0 = selectByMask(frLJ_S0, wco_vdw_S0);
793 # ifndef HALF_LJ
794 frLJ_S2 = selectByMask(frLJ_S2, wco_vdw_S2);
795 # endif
796 # endif
798 # ifdef CALC_ENERGIES
799 /* The potential shift should be removed for pairs beyond cut-off */
800 VLJ_S0 = selectByMask(VLJ_S0, wco_vdw_S0);
801 # ifndef HALF_LJ
802 VLJ_S2 = selectByMask(VLJ_S2, wco_vdw_S2);
803 # endif
804 # endif
806 #endif /* CALC_LJ */
808 #ifdef CALC_ENERGIES
809 # ifdef ENERGY_GROUPS
810 /* Extract the group pair index per j pair.
811 * Energy groups are stored per i-cluster, so things get
812 * complicated when the i- and j-cluster size don't match.
815 int egps_j;
816 # if UNROLLJ == 2
817 egps_j = nbatParams.energrp[cj >> 1];
818 egp_jj[0] = ((egps_j >> ((cj & 1) * egps_jshift)) & egps_jmask) * egps_jstride;
819 # else
820 /* We assume UNROLLI <= UNROLLJ */
821 int jdi;
822 for (jdi = 0; jdi < UNROLLJ / UNROLLI; jdi++)
824 int jj;
825 egps_j = nbatParams.energrp[cj * (UNROLLJ / UNROLLI) + jdi];
826 for (jj = 0; jj < (UNROLLI / 2); jj++)
828 egp_jj[jdi * (UNROLLI / 2) + jj] =
829 ((egps_j >> (jj * egps_jshift)) & egps_jmask) * egps_jstride;
832 # endif
834 # endif
836 # ifdef CALC_COULOMB
837 # ifndef ENERGY_GROUPS
838 vctot_S = vctot_S + vcoul_S0 + vcoul_S2;
839 # else
840 add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
841 add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
842 # endif
843 # endif
845 # ifdef CALC_LJ
846 # ifndef ENERGY_GROUPS
847 Vvdwtot_S = Vvdwtot_S + VLJ_S0
848 # ifndef HALF_LJ
849 + VLJ_S2
850 # endif
852 # else
853 add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
854 # ifndef HALF_LJ
855 add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
856 # endif
857 # endif
858 # endif /* CALC_LJ */
859 #endif /* CALC_ENERGIES */
861 #ifdef CALC_LJ
862 # ifdef CALC_COULOMB
863 fscal_S0 = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
864 # else
865 fscal_S0 = rinvsq_S0 * frLJ_S0;
866 # endif
867 #else
868 fscal_S0 = rinvsq_S0 * frcoul_S0;
869 #endif /* CALC_LJ */
870 #if defined CALC_LJ && !defined HALF_LJ
871 # ifdef CALC_COULOMB
872 fscal_S2 = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
873 # else
874 fscal_S2 = rinvsq_S2 * frLJ_S2;
875 # endif
876 #else
877 /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
878 fscal_S2 = rinvsq_S2 * frcoul_S2;
879 #endif
881 /* Calculate temporary vectorial force */
882 tx_S0 = fscal_S0 * dx_S0;
883 tx_S2 = fscal_S2 * dx_S2;
884 ty_S0 = fscal_S0 * dy_S0;
885 ty_S2 = fscal_S2 * dy_S2;
886 tz_S0 = fscal_S0 * dz_S0;
887 tz_S2 = fscal_S2 * dz_S2;
889 /* Increment i atom force */
890 fix_S0 = fix_S0 + tx_S0;
891 fix_S2 = fix_S2 + tx_S2;
892 fiy_S0 = fiy_S0 + ty_S0;
893 fiy_S2 = fiy_S2 + ty_S2;
894 fiz_S0 = fiz_S0 + tz_S0;
895 fiz_S2 = fiz_S2 + tz_S2;
897 /* Decrement j atom force */
898 decrHsimd(f + ajx, tx_S0 + tx_S2);
899 decrHsimd(f + ajy, ty_S0 + ty_S2);
900 decrHsimd(f + ajz, tz_S0 + tz_S2);
903 #undef rinv_ex_S0
904 #undef rinv_ex_S2
906 #undef wco_vdw_S0
907 #undef wco_vdw_S2
909 #undef EXCL_FORCES