2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrE
,jnrF
,jnrG
,jnrH
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
79 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
81 real
*shiftvec
,*fshift
,*x
,*f
;
82 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
84 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 real
* vdwioffsetptr0
;
86 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 real
* vdwioffsetptr1
;
88 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 real
* vdwioffsetptr2
;
90 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
91 real
* vdwioffsetptr3
;
92 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
93 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
94 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
95 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
96 __m256 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
97 __m256 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
98 __m256 dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
99 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
102 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
105 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
106 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
108 __m128i ewitab_lo
,ewitab_hi
;
109 __m256 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
110 __m256 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
112 __m256 dummy_mask
,cutoff_mask
;
113 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
114 __m256 one
= _mm256_set1_ps(1.0);
115 __m256 two
= _mm256_set1_ps(2.0);
121 jindex
= nlist
->jindex
;
123 shiftidx
= nlist
->shift
;
125 shiftvec
= fr
->shift_vec
[0];
126 fshift
= fr
->fshift
[0];
127 facel
= _mm256_set1_ps(fr
->ic
->epsfac
);
128 charge
= mdatoms
->chargeA
;
129 nvdwtype
= fr
->ntype
;
131 vdwtype
= mdatoms
->typeA
;
133 sh_ewald
= _mm256_set1_ps(fr
->ic
->sh_ewald
);
134 beta
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_q
);
135 beta2
= _mm256_mul_ps(beta
,beta
);
136 beta3
= _mm256_mul_ps(beta
,beta2
);
138 ewtab
= fr
->ic
->tabq_coul_FDV0
;
139 ewtabscale
= _mm256_set1_ps(fr
->ic
->tabq_scale
);
140 ewtabhalfspace
= _mm256_set1_ps(0.5/fr
->ic
->tabq_scale
);
142 /* Setup water-specific parameters */
143 inr
= nlist
->iinr
[0];
144 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
145 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
146 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
147 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar
= fr
->ic
->rcoulomb
;
151 rcutoff
= _mm256_set1_ps(rcutoff_scalar
);
152 rcutoff2
= _mm256_mul_ps(rcutoff
,rcutoff
);
154 sh_vdw_invrcut6
= _mm256_set1_ps(fr
->ic
->sh_invrc6
);
155 rvdw
= _mm256_set1_ps(fr
->ic
->rvdw
);
157 /* Zero-initialize the j indices to silence compiler warnings */
158 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
171 for(iidx
=0;iidx
<4*DIM
;iidx
++)
176 /* Start outer loop over neighborlists */
177 for(iidx
=0; iidx
<nri
; iidx
++)
179 /* Load shift vector for this list */
180 i_shift_offset
= DIM
*shiftidx
[iidx
];
182 /* Load limits for loop over neighbors */
183 j_index_start
= jindex
[iidx
];
184 j_index_end
= jindex
[iidx
+1];
186 /* Get outer coordinate index */
188 i_coord_offset
= DIM
*inr
;
190 /* Load i particle coords and add shift vector */
191 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
192 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
194 fix0
= _mm256_setzero_ps();
195 fiy0
= _mm256_setzero_ps();
196 fiz0
= _mm256_setzero_ps();
197 fix1
= _mm256_setzero_ps();
198 fiy1
= _mm256_setzero_ps();
199 fiz1
= _mm256_setzero_ps();
200 fix2
= _mm256_setzero_ps();
201 fiy2
= _mm256_setzero_ps();
202 fiz2
= _mm256_setzero_ps();
203 fix3
= _mm256_setzero_ps();
204 fiy3
= _mm256_setzero_ps();
205 fiz3
= _mm256_setzero_ps();
207 /* Reset potential sums */
208 velecsum
= _mm256_setzero_ps();
209 vvdwsum
= _mm256_setzero_ps();
211 /* Start inner kernel loop */
212 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
215 /* Get j neighbor index, and coordinate index */
224 j_coord_offsetA
= DIM
*jnrA
;
225 j_coord_offsetB
= DIM
*jnrB
;
226 j_coord_offsetC
= DIM
*jnrC
;
227 j_coord_offsetD
= DIM
*jnrD
;
228 j_coord_offsetE
= DIM
*jnrE
;
229 j_coord_offsetF
= DIM
*jnrF
;
230 j_coord_offsetG
= DIM
*jnrG
;
231 j_coord_offsetH
= DIM
*jnrH
;
233 /* load j atom coordinates */
234 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
235 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
236 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
237 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
240 /* Calculate displacement vector */
241 dx00
= _mm256_sub_ps(ix0
,jx0
);
242 dy00
= _mm256_sub_ps(iy0
,jy0
);
243 dz00
= _mm256_sub_ps(iz0
,jz0
);
244 dx10
= _mm256_sub_ps(ix1
,jx0
);
245 dy10
= _mm256_sub_ps(iy1
,jy0
);
246 dz10
= _mm256_sub_ps(iz1
,jz0
);
247 dx20
= _mm256_sub_ps(ix2
,jx0
);
248 dy20
= _mm256_sub_ps(iy2
,jy0
);
249 dz20
= _mm256_sub_ps(iz2
,jz0
);
250 dx30
= _mm256_sub_ps(ix3
,jx0
);
251 dy30
= _mm256_sub_ps(iy3
,jy0
);
252 dz30
= _mm256_sub_ps(iz3
,jz0
);
254 /* Calculate squared distance and things based on it */
255 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
256 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
257 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
258 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
260 rinv10
= avx256_invsqrt_f(rsq10
);
261 rinv20
= avx256_invsqrt_f(rsq20
);
262 rinv30
= avx256_invsqrt_f(rsq30
);
264 rinvsq00
= avx256_inv_f(rsq00
);
265 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
266 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
267 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
269 /* Load parameters for j particles */
270 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
271 charge
+jnrC
+0,charge
+jnrD
+0,
272 charge
+jnrE
+0,charge
+jnrF
+0,
273 charge
+jnrG
+0,charge
+jnrH
+0);
274 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
275 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
276 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
277 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
278 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
279 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
280 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
281 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
283 fjx0
= _mm256_setzero_ps();
284 fjy0
= _mm256_setzero_ps();
285 fjz0
= _mm256_setzero_ps();
287 /**************************
288 * CALCULATE INTERACTIONS *
289 **************************/
291 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
294 /* Compute parameters for interactions between i and j atoms */
295 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
296 vdwioffsetptr0
+vdwjidx0B
,
297 vdwioffsetptr0
+vdwjidx0C
,
298 vdwioffsetptr0
+vdwjidx0D
,
299 vdwioffsetptr0
+vdwjidx0E
,
300 vdwioffsetptr0
+vdwjidx0F
,
301 vdwioffsetptr0
+vdwjidx0G
,
302 vdwioffsetptr0
+vdwjidx0H
,
305 /* LENNARD-JONES DISPERSION/REPULSION */
307 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
308 vvdw6
= _mm256_mul_ps(c6_00
,rinvsix
);
309 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
310 vvdw
= _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12
, _mm256_mul_ps(c12_00
,_mm256_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
311 _mm256_mul_ps( _mm256_sub_ps(vvdw6
,_mm256_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
312 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
314 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
316 /* Update potential sum for this i atom from the interaction with this j atom. */
317 vvdw
= _mm256_and_ps(vvdw
,cutoff_mask
);
318 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
322 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
324 /* Calculate temporary vectorial force */
325 tx
= _mm256_mul_ps(fscal
,dx00
);
326 ty
= _mm256_mul_ps(fscal
,dy00
);
327 tz
= _mm256_mul_ps(fscal
,dz00
);
329 /* Update vectorial force */
330 fix0
= _mm256_add_ps(fix0
,tx
);
331 fiy0
= _mm256_add_ps(fiy0
,ty
);
332 fiz0
= _mm256_add_ps(fiz0
,tz
);
334 fjx0
= _mm256_add_ps(fjx0
,tx
);
335 fjy0
= _mm256_add_ps(fjy0
,ty
);
336 fjz0
= _mm256_add_ps(fjz0
,tz
);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
347 r10
= _mm256_mul_ps(rsq10
,rinv10
);
349 /* Compute parameters for interactions between i and j atoms */
350 qq10
= _mm256_mul_ps(iq1
,jq0
);
352 /* EWALD ELECTROSTATICS */
354 /* Analytical PME correction */
355 zeta2
= _mm256_mul_ps(beta2
,rsq10
);
356 rinv3
= _mm256_mul_ps(rinvsq10
,rinv10
);
357 pmecorrF
= avx256_pmecorrF_f(zeta2
);
358 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
359 felec
= _mm256_mul_ps(qq10
,felec
);
360 pmecorrV
= avx256_pmecorrV_f(zeta2
);
361 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
362 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv10
,sh_ewald
),pmecorrV
);
363 velec
= _mm256_mul_ps(qq10
,velec
);
365 cutoff_mask
= _mm256_cmp_ps(rsq10
,rcutoff2
,_CMP_LT_OQ
);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velec
= _mm256_and_ps(velec
,cutoff_mask
);
369 velecsum
= _mm256_add_ps(velecsum
,velec
);
373 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
375 /* Calculate temporary vectorial force */
376 tx
= _mm256_mul_ps(fscal
,dx10
);
377 ty
= _mm256_mul_ps(fscal
,dy10
);
378 tz
= _mm256_mul_ps(fscal
,dz10
);
380 /* Update vectorial force */
381 fix1
= _mm256_add_ps(fix1
,tx
);
382 fiy1
= _mm256_add_ps(fiy1
,ty
);
383 fiz1
= _mm256_add_ps(fiz1
,tz
);
385 fjx0
= _mm256_add_ps(fjx0
,tx
);
386 fjy0
= _mm256_add_ps(fjy0
,ty
);
387 fjz0
= _mm256_add_ps(fjz0
,tz
);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
398 r20
= _mm256_mul_ps(rsq20
,rinv20
);
400 /* Compute parameters for interactions between i and j atoms */
401 qq20
= _mm256_mul_ps(iq2
,jq0
);
403 /* EWALD ELECTROSTATICS */
405 /* Analytical PME correction */
406 zeta2
= _mm256_mul_ps(beta2
,rsq20
);
407 rinv3
= _mm256_mul_ps(rinvsq20
,rinv20
);
408 pmecorrF
= avx256_pmecorrF_f(zeta2
);
409 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
410 felec
= _mm256_mul_ps(qq20
,felec
);
411 pmecorrV
= avx256_pmecorrV_f(zeta2
);
412 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
413 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv20
,sh_ewald
),pmecorrV
);
414 velec
= _mm256_mul_ps(qq20
,velec
);
416 cutoff_mask
= _mm256_cmp_ps(rsq20
,rcutoff2
,_CMP_LT_OQ
);
418 /* Update potential sum for this i atom from the interaction with this j atom. */
419 velec
= _mm256_and_ps(velec
,cutoff_mask
);
420 velecsum
= _mm256_add_ps(velecsum
,velec
);
424 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
426 /* Calculate temporary vectorial force */
427 tx
= _mm256_mul_ps(fscal
,dx20
);
428 ty
= _mm256_mul_ps(fscal
,dy20
);
429 tz
= _mm256_mul_ps(fscal
,dz20
);
431 /* Update vectorial force */
432 fix2
= _mm256_add_ps(fix2
,tx
);
433 fiy2
= _mm256_add_ps(fiy2
,ty
);
434 fiz2
= _mm256_add_ps(fiz2
,tz
);
436 fjx0
= _mm256_add_ps(fjx0
,tx
);
437 fjy0
= _mm256_add_ps(fjy0
,ty
);
438 fjz0
= _mm256_add_ps(fjz0
,tz
);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 if (gmx_mm256_any_lt(rsq30
,rcutoff2
))
449 r30
= _mm256_mul_ps(rsq30
,rinv30
);
451 /* Compute parameters for interactions between i and j atoms */
452 qq30
= _mm256_mul_ps(iq3
,jq0
);
454 /* EWALD ELECTROSTATICS */
456 /* Analytical PME correction */
457 zeta2
= _mm256_mul_ps(beta2
,rsq30
);
458 rinv3
= _mm256_mul_ps(rinvsq30
,rinv30
);
459 pmecorrF
= avx256_pmecorrF_f(zeta2
);
460 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
461 felec
= _mm256_mul_ps(qq30
,felec
);
462 pmecorrV
= avx256_pmecorrV_f(zeta2
);
463 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
464 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv30
,sh_ewald
),pmecorrV
);
465 velec
= _mm256_mul_ps(qq30
,velec
);
467 cutoff_mask
= _mm256_cmp_ps(rsq30
,rcutoff2
,_CMP_LT_OQ
);
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velec
= _mm256_and_ps(velec
,cutoff_mask
);
471 velecsum
= _mm256_add_ps(velecsum
,velec
);
475 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
477 /* Calculate temporary vectorial force */
478 tx
= _mm256_mul_ps(fscal
,dx30
);
479 ty
= _mm256_mul_ps(fscal
,dy30
);
480 tz
= _mm256_mul_ps(fscal
,dz30
);
482 /* Update vectorial force */
483 fix3
= _mm256_add_ps(fix3
,tx
);
484 fiy3
= _mm256_add_ps(fiy3
,ty
);
485 fiz3
= _mm256_add_ps(fiz3
,tz
);
487 fjx0
= _mm256_add_ps(fjx0
,tx
);
488 fjy0
= _mm256_add_ps(fjy0
,ty
);
489 fjz0
= _mm256_add_ps(fjz0
,tz
);
493 fjptrA
= f
+j_coord_offsetA
;
494 fjptrB
= f
+j_coord_offsetB
;
495 fjptrC
= f
+j_coord_offsetC
;
496 fjptrD
= f
+j_coord_offsetD
;
497 fjptrE
= f
+j_coord_offsetE
;
498 fjptrF
= f
+j_coord_offsetF
;
499 fjptrG
= f
+j_coord_offsetG
;
500 fjptrH
= f
+j_coord_offsetH
;
502 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
504 /* Inner loop uses 371 flops */
510 /* Get j neighbor index, and coordinate index */
511 jnrlistA
= jjnr
[jidx
];
512 jnrlistB
= jjnr
[jidx
+1];
513 jnrlistC
= jjnr
[jidx
+2];
514 jnrlistD
= jjnr
[jidx
+3];
515 jnrlistE
= jjnr
[jidx
+4];
516 jnrlistF
= jjnr
[jidx
+5];
517 jnrlistG
= jjnr
[jidx
+6];
518 jnrlistH
= jjnr
[jidx
+7];
519 /* Sign of each element will be negative for non-real atoms.
520 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
521 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
523 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
524 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
526 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
527 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
528 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
529 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
530 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
531 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
532 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
533 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
534 j_coord_offsetA
= DIM
*jnrA
;
535 j_coord_offsetB
= DIM
*jnrB
;
536 j_coord_offsetC
= DIM
*jnrC
;
537 j_coord_offsetD
= DIM
*jnrD
;
538 j_coord_offsetE
= DIM
*jnrE
;
539 j_coord_offsetF
= DIM
*jnrF
;
540 j_coord_offsetG
= DIM
*jnrG
;
541 j_coord_offsetH
= DIM
*jnrH
;
543 /* load j atom coordinates */
544 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
545 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
546 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
547 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
550 /* Calculate displacement vector */
551 dx00
= _mm256_sub_ps(ix0
,jx0
);
552 dy00
= _mm256_sub_ps(iy0
,jy0
);
553 dz00
= _mm256_sub_ps(iz0
,jz0
);
554 dx10
= _mm256_sub_ps(ix1
,jx0
);
555 dy10
= _mm256_sub_ps(iy1
,jy0
);
556 dz10
= _mm256_sub_ps(iz1
,jz0
);
557 dx20
= _mm256_sub_ps(ix2
,jx0
);
558 dy20
= _mm256_sub_ps(iy2
,jy0
);
559 dz20
= _mm256_sub_ps(iz2
,jz0
);
560 dx30
= _mm256_sub_ps(ix3
,jx0
);
561 dy30
= _mm256_sub_ps(iy3
,jy0
);
562 dz30
= _mm256_sub_ps(iz3
,jz0
);
564 /* Calculate squared distance and things based on it */
565 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
566 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
567 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
568 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
570 rinv10
= avx256_invsqrt_f(rsq10
);
571 rinv20
= avx256_invsqrt_f(rsq20
);
572 rinv30
= avx256_invsqrt_f(rsq30
);
574 rinvsq00
= avx256_inv_f(rsq00
);
575 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
576 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
577 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
579 /* Load parameters for j particles */
580 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
581 charge
+jnrC
+0,charge
+jnrD
+0,
582 charge
+jnrE
+0,charge
+jnrF
+0,
583 charge
+jnrG
+0,charge
+jnrH
+0);
584 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
585 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
586 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
587 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
588 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
589 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
590 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
591 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
593 fjx0
= _mm256_setzero_ps();
594 fjy0
= _mm256_setzero_ps();
595 fjz0
= _mm256_setzero_ps();
597 /**************************
598 * CALCULATE INTERACTIONS *
599 **************************/
601 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
604 /* Compute parameters for interactions between i and j atoms */
605 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
606 vdwioffsetptr0
+vdwjidx0B
,
607 vdwioffsetptr0
+vdwjidx0C
,
608 vdwioffsetptr0
+vdwjidx0D
,
609 vdwioffsetptr0
+vdwjidx0E
,
610 vdwioffsetptr0
+vdwjidx0F
,
611 vdwioffsetptr0
+vdwjidx0G
,
612 vdwioffsetptr0
+vdwjidx0H
,
615 /* LENNARD-JONES DISPERSION/REPULSION */
617 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
618 vvdw6
= _mm256_mul_ps(c6_00
,rinvsix
);
619 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
620 vvdw
= _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12
, _mm256_mul_ps(c12_00
,_mm256_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
621 _mm256_mul_ps( _mm256_sub_ps(vvdw6
,_mm256_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
622 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
624 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 vvdw
= _mm256_and_ps(vvdw
,cutoff_mask
);
628 vvdw
= _mm256_andnot_ps(dummy_mask
,vvdw
);
629 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
633 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
635 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
637 /* Calculate temporary vectorial force */
638 tx
= _mm256_mul_ps(fscal
,dx00
);
639 ty
= _mm256_mul_ps(fscal
,dy00
);
640 tz
= _mm256_mul_ps(fscal
,dz00
);
642 /* Update vectorial force */
643 fix0
= _mm256_add_ps(fix0
,tx
);
644 fiy0
= _mm256_add_ps(fiy0
,ty
);
645 fiz0
= _mm256_add_ps(fiz0
,tz
);
647 fjx0
= _mm256_add_ps(fjx0
,tx
);
648 fjy0
= _mm256_add_ps(fjy0
,ty
);
649 fjz0
= _mm256_add_ps(fjz0
,tz
);
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
660 r10
= _mm256_mul_ps(rsq10
,rinv10
);
661 r10
= _mm256_andnot_ps(dummy_mask
,r10
);
663 /* Compute parameters for interactions between i and j atoms */
664 qq10
= _mm256_mul_ps(iq1
,jq0
);
666 /* EWALD ELECTROSTATICS */
668 /* Analytical PME correction */
669 zeta2
= _mm256_mul_ps(beta2
,rsq10
);
670 rinv3
= _mm256_mul_ps(rinvsq10
,rinv10
);
671 pmecorrF
= avx256_pmecorrF_f(zeta2
);
672 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
673 felec
= _mm256_mul_ps(qq10
,felec
);
674 pmecorrV
= avx256_pmecorrV_f(zeta2
);
675 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
676 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv10
,sh_ewald
),pmecorrV
);
677 velec
= _mm256_mul_ps(qq10
,velec
);
679 cutoff_mask
= _mm256_cmp_ps(rsq10
,rcutoff2
,_CMP_LT_OQ
);
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velec
= _mm256_and_ps(velec
,cutoff_mask
);
683 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
684 velecsum
= _mm256_add_ps(velecsum
,velec
);
688 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
690 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
692 /* Calculate temporary vectorial force */
693 tx
= _mm256_mul_ps(fscal
,dx10
);
694 ty
= _mm256_mul_ps(fscal
,dy10
);
695 tz
= _mm256_mul_ps(fscal
,dz10
);
697 /* Update vectorial force */
698 fix1
= _mm256_add_ps(fix1
,tx
);
699 fiy1
= _mm256_add_ps(fiy1
,ty
);
700 fiz1
= _mm256_add_ps(fiz1
,tz
);
702 fjx0
= _mm256_add_ps(fjx0
,tx
);
703 fjy0
= _mm256_add_ps(fjy0
,ty
);
704 fjz0
= _mm256_add_ps(fjz0
,tz
);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
715 r20
= _mm256_mul_ps(rsq20
,rinv20
);
716 r20
= _mm256_andnot_ps(dummy_mask
,r20
);
718 /* Compute parameters for interactions between i and j atoms */
719 qq20
= _mm256_mul_ps(iq2
,jq0
);
721 /* EWALD ELECTROSTATICS */
723 /* Analytical PME correction */
724 zeta2
= _mm256_mul_ps(beta2
,rsq20
);
725 rinv3
= _mm256_mul_ps(rinvsq20
,rinv20
);
726 pmecorrF
= avx256_pmecorrF_f(zeta2
);
727 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
728 felec
= _mm256_mul_ps(qq20
,felec
);
729 pmecorrV
= avx256_pmecorrV_f(zeta2
);
730 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
731 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv20
,sh_ewald
),pmecorrV
);
732 velec
= _mm256_mul_ps(qq20
,velec
);
734 cutoff_mask
= _mm256_cmp_ps(rsq20
,rcutoff2
,_CMP_LT_OQ
);
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec
= _mm256_and_ps(velec
,cutoff_mask
);
738 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
739 velecsum
= _mm256_add_ps(velecsum
,velec
);
743 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
745 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
747 /* Calculate temporary vectorial force */
748 tx
= _mm256_mul_ps(fscal
,dx20
);
749 ty
= _mm256_mul_ps(fscal
,dy20
);
750 tz
= _mm256_mul_ps(fscal
,dz20
);
752 /* Update vectorial force */
753 fix2
= _mm256_add_ps(fix2
,tx
);
754 fiy2
= _mm256_add_ps(fiy2
,ty
);
755 fiz2
= _mm256_add_ps(fiz2
,tz
);
757 fjx0
= _mm256_add_ps(fjx0
,tx
);
758 fjy0
= _mm256_add_ps(fjy0
,ty
);
759 fjz0
= _mm256_add_ps(fjz0
,tz
);
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 if (gmx_mm256_any_lt(rsq30
,rcutoff2
))
770 r30
= _mm256_mul_ps(rsq30
,rinv30
);
771 r30
= _mm256_andnot_ps(dummy_mask
,r30
);
773 /* Compute parameters for interactions between i and j atoms */
774 qq30
= _mm256_mul_ps(iq3
,jq0
);
776 /* EWALD ELECTROSTATICS */
778 /* Analytical PME correction */
779 zeta2
= _mm256_mul_ps(beta2
,rsq30
);
780 rinv3
= _mm256_mul_ps(rinvsq30
,rinv30
);
781 pmecorrF
= avx256_pmecorrF_f(zeta2
);
782 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
783 felec
= _mm256_mul_ps(qq30
,felec
);
784 pmecorrV
= avx256_pmecorrV_f(zeta2
);
785 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
786 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv30
,sh_ewald
),pmecorrV
);
787 velec
= _mm256_mul_ps(qq30
,velec
);
789 cutoff_mask
= _mm256_cmp_ps(rsq30
,rcutoff2
,_CMP_LT_OQ
);
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 velec
= _mm256_and_ps(velec
,cutoff_mask
);
793 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
794 velecsum
= _mm256_add_ps(velecsum
,velec
);
798 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
800 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
802 /* Calculate temporary vectorial force */
803 tx
= _mm256_mul_ps(fscal
,dx30
);
804 ty
= _mm256_mul_ps(fscal
,dy30
);
805 tz
= _mm256_mul_ps(fscal
,dz30
);
807 /* Update vectorial force */
808 fix3
= _mm256_add_ps(fix3
,tx
);
809 fiy3
= _mm256_add_ps(fiy3
,ty
);
810 fiz3
= _mm256_add_ps(fiz3
,tz
);
812 fjx0
= _mm256_add_ps(fjx0
,tx
);
813 fjy0
= _mm256_add_ps(fjy0
,ty
);
814 fjz0
= _mm256_add_ps(fjz0
,tz
);
818 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
819 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
820 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
821 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
822 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
823 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
824 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
825 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
827 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
829 /* Inner loop uses 374 flops */
832 /* End of innermost loop */
834 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
835 f
+i_coord_offset
,fshift
+i_shift_offset
);
838 /* Update potential energies */
839 gmx_mm256_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
840 gmx_mm256_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
842 /* Increment number of inner iterations */
843 inneriter
+= j_index_end
- j_index_start
;
845 /* Outer loop uses 26 flops */
848 /* Increment number of outer iterations */
851 /* Update outer/inner flops */
853 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*374);
856 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_single
857 * Electrostatics interaction: Ewald
858 * VdW interaction: LennardJones
859 * Geometry: Water4-Particle
860 * Calculate force/pot: Force
863 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_single
864 (t_nblist
* gmx_restrict nlist
,
865 rvec
* gmx_restrict xx
,
866 rvec
* gmx_restrict ff
,
867 struct t_forcerec
* gmx_restrict fr
,
868 t_mdatoms
* gmx_restrict mdatoms
,
869 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
870 t_nrnb
* gmx_restrict nrnb
)
872 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
873 * just 0 for non-waters.
874 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
875 * jnr indices corresponding to data put in the eight positions in the SIMD register.
877 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
878 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
879 int jnrA
,jnrB
,jnrC
,jnrD
;
880 int jnrE
,jnrF
,jnrG
,jnrH
;
881 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
882 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
883 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
884 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
885 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
887 real
*shiftvec
,*fshift
,*x
,*f
;
888 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
890 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
891 real
* vdwioffsetptr0
;
892 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
893 real
* vdwioffsetptr1
;
894 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
895 real
* vdwioffsetptr2
;
896 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
897 real
* vdwioffsetptr3
;
898 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
899 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
900 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
901 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
902 __m256 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
903 __m256 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
904 __m256 dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
905 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
908 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
911 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
912 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
914 __m128i ewitab_lo
,ewitab_hi
;
915 __m256 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
916 __m256 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
918 __m256 dummy_mask
,cutoff_mask
;
919 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
920 __m256 one
= _mm256_set1_ps(1.0);
921 __m256 two
= _mm256_set1_ps(2.0);
927 jindex
= nlist
->jindex
;
929 shiftidx
= nlist
->shift
;
931 shiftvec
= fr
->shift_vec
[0];
932 fshift
= fr
->fshift
[0];
933 facel
= _mm256_set1_ps(fr
->ic
->epsfac
);
934 charge
= mdatoms
->chargeA
;
935 nvdwtype
= fr
->ntype
;
937 vdwtype
= mdatoms
->typeA
;
939 sh_ewald
= _mm256_set1_ps(fr
->ic
->sh_ewald
);
940 beta
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_q
);
941 beta2
= _mm256_mul_ps(beta
,beta
);
942 beta3
= _mm256_mul_ps(beta
,beta2
);
944 ewtab
= fr
->ic
->tabq_coul_F
;
945 ewtabscale
= _mm256_set1_ps(fr
->ic
->tabq_scale
);
946 ewtabhalfspace
= _mm256_set1_ps(0.5/fr
->ic
->tabq_scale
);
948 /* Setup water-specific parameters */
949 inr
= nlist
->iinr
[0];
950 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
951 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
952 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
953 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
955 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
956 rcutoff_scalar
= fr
->ic
->rcoulomb
;
957 rcutoff
= _mm256_set1_ps(rcutoff_scalar
);
958 rcutoff2
= _mm256_mul_ps(rcutoff
,rcutoff
);
960 sh_vdw_invrcut6
= _mm256_set1_ps(fr
->ic
->sh_invrc6
);
961 rvdw
= _mm256_set1_ps(fr
->ic
->rvdw
);
963 /* Avoid stupid compiler warnings */
964 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
977 for(iidx
=0;iidx
<4*DIM
;iidx
++)
982 /* Start outer loop over neighborlists */
983 for(iidx
=0; iidx
<nri
; iidx
++)
985 /* Load shift vector for this list */
986 i_shift_offset
= DIM
*shiftidx
[iidx
];
988 /* Load limits for loop over neighbors */
989 j_index_start
= jindex
[iidx
];
990 j_index_end
= jindex
[iidx
+1];
992 /* Get outer coordinate index */
994 i_coord_offset
= DIM
*inr
;
996 /* Load i particle coords and add shift vector */
997 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
998 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1000 fix0
= _mm256_setzero_ps();
1001 fiy0
= _mm256_setzero_ps();
1002 fiz0
= _mm256_setzero_ps();
1003 fix1
= _mm256_setzero_ps();
1004 fiy1
= _mm256_setzero_ps();
1005 fiz1
= _mm256_setzero_ps();
1006 fix2
= _mm256_setzero_ps();
1007 fiy2
= _mm256_setzero_ps();
1008 fiz2
= _mm256_setzero_ps();
1009 fix3
= _mm256_setzero_ps();
1010 fiy3
= _mm256_setzero_ps();
1011 fiz3
= _mm256_setzero_ps();
1013 /* Start inner kernel loop */
1014 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
1017 /* Get j neighbor index, and coordinate index */
1019 jnrB
= jjnr
[jidx
+1];
1020 jnrC
= jjnr
[jidx
+2];
1021 jnrD
= jjnr
[jidx
+3];
1022 jnrE
= jjnr
[jidx
+4];
1023 jnrF
= jjnr
[jidx
+5];
1024 jnrG
= jjnr
[jidx
+6];
1025 jnrH
= jjnr
[jidx
+7];
1026 j_coord_offsetA
= DIM
*jnrA
;
1027 j_coord_offsetB
= DIM
*jnrB
;
1028 j_coord_offsetC
= DIM
*jnrC
;
1029 j_coord_offsetD
= DIM
*jnrD
;
1030 j_coord_offsetE
= DIM
*jnrE
;
1031 j_coord_offsetF
= DIM
*jnrF
;
1032 j_coord_offsetG
= DIM
*jnrG
;
1033 j_coord_offsetH
= DIM
*jnrH
;
1035 /* load j atom coordinates */
1036 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1037 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1038 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1039 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1042 /* Calculate displacement vector */
1043 dx00
= _mm256_sub_ps(ix0
,jx0
);
1044 dy00
= _mm256_sub_ps(iy0
,jy0
);
1045 dz00
= _mm256_sub_ps(iz0
,jz0
);
1046 dx10
= _mm256_sub_ps(ix1
,jx0
);
1047 dy10
= _mm256_sub_ps(iy1
,jy0
);
1048 dz10
= _mm256_sub_ps(iz1
,jz0
);
1049 dx20
= _mm256_sub_ps(ix2
,jx0
);
1050 dy20
= _mm256_sub_ps(iy2
,jy0
);
1051 dz20
= _mm256_sub_ps(iz2
,jz0
);
1052 dx30
= _mm256_sub_ps(ix3
,jx0
);
1053 dy30
= _mm256_sub_ps(iy3
,jy0
);
1054 dz30
= _mm256_sub_ps(iz3
,jz0
);
1056 /* Calculate squared distance and things based on it */
1057 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1058 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
1059 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
1060 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
1062 rinv10
= avx256_invsqrt_f(rsq10
);
1063 rinv20
= avx256_invsqrt_f(rsq20
);
1064 rinv30
= avx256_invsqrt_f(rsq30
);
1066 rinvsq00
= avx256_inv_f(rsq00
);
1067 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
1068 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
1069 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
1071 /* Load parameters for j particles */
1072 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1073 charge
+jnrC
+0,charge
+jnrD
+0,
1074 charge
+jnrE
+0,charge
+jnrF
+0,
1075 charge
+jnrG
+0,charge
+jnrH
+0);
1076 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1077 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
1078 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
1079 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
1080 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
1081 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
1082 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
1083 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
1085 fjx0
= _mm256_setzero_ps();
1086 fjy0
= _mm256_setzero_ps();
1087 fjz0
= _mm256_setzero_ps();
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1096 /* Compute parameters for interactions between i and j atoms */
1097 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
1098 vdwioffsetptr0
+vdwjidx0B
,
1099 vdwioffsetptr0
+vdwjidx0C
,
1100 vdwioffsetptr0
+vdwjidx0D
,
1101 vdwioffsetptr0
+vdwjidx0E
,
1102 vdwioffsetptr0
+vdwjidx0F
,
1103 vdwioffsetptr0
+vdwjidx0G
,
1104 vdwioffsetptr0
+vdwjidx0H
,
1107 /* LENNARD-JONES DISPERSION/REPULSION */
1109 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1110 fvdw
= _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),c6_00
),_mm256_mul_ps(rinvsix
,rinvsq00
));
1112 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1116 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1118 /* Calculate temporary vectorial force */
1119 tx
= _mm256_mul_ps(fscal
,dx00
);
1120 ty
= _mm256_mul_ps(fscal
,dy00
);
1121 tz
= _mm256_mul_ps(fscal
,dz00
);
1123 /* Update vectorial force */
1124 fix0
= _mm256_add_ps(fix0
,tx
);
1125 fiy0
= _mm256_add_ps(fiy0
,ty
);
1126 fiz0
= _mm256_add_ps(fiz0
,tz
);
1128 fjx0
= _mm256_add_ps(fjx0
,tx
);
1129 fjy0
= _mm256_add_ps(fjy0
,ty
);
1130 fjz0
= _mm256_add_ps(fjz0
,tz
);
1134 /**************************
1135 * CALCULATE INTERACTIONS *
1136 **************************/
1138 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
1141 r10
= _mm256_mul_ps(rsq10
,rinv10
);
1143 /* Compute parameters for interactions between i and j atoms */
1144 qq10
= _mm256_mul_ps(iq1
,jq0
);
1146 /* EWALD ELECTROSTATICS */
1148 /* Analytical PME correction */
1149 zeta2
= _mm256_mul_ps(beta2
,rsq10
);
1150 rinv3
= _mm256_mul_ps(rinvsq10
,rinv10
);
1151 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1152 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1153 felec
= _mm256_mul_ps(qq10
,felec
);
1155 cutoff_mask
= _mm256_cmp_ps(rsq10
,rcutoff2
,_CMP_LT_OQ
);
1159 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1161 /* Calculate temporary vectorial force */
1162 tx
= _mm256_mul_ps(fscal
,dx10
);
1163 ty
= _mm256_mul_ps(fscal
,dy10
);
1164 tz
= _mm256_mul_ps(fscal
,dz10
);
1166 /* Update vectorial force */
1167 fix1
= _mm256_add_ps(fix1
,tx
);
1168 fiy1
= _mm256_add_ps(fiy1
,ty
);
1169 fiz1
= _mm256_add_ps(fiz1
,tz
);
1171 fjx0
= _mm256_add_ps(fjx0
,tx
);
1172 fjy0
= _mm256_add_ps(fjy0
,ty
);
1173 fjz0
= _mm256_add_ps(fjz0
,tz
);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
1184 r20
= _mm256_mul_ps(rsq20
,rinv20
);
1186 /* Compute parameters for interactions between i and j atoms */
1187 qq20
= _mm256_mul_ps(iq2
,jq0
);
1189 /* EWALD ELECTROSTATICS */
1191 /* Analytical PME correction */
1192 zeta2
= _mm256_mul_ps(beta2
,rsq20
);
1193 rinv3
= _mm256_mul_ps(rinvsq20
,rinv20
);
1194 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1195 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1196 felec
= _mm256_mul_ps(qq20
,felec
);
1198 cutoff_mask
= _mm256_cmp_ps(rsq20
,rcutoff2
,_CMP_LT_OQ
);
1202 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1204 /* Calculate temporary vectorial force */
1205 tx
= _mm256_mul_ps(fscal
,dx20
);
1206 ty
= _mm256_mul_ps(fscal
,dy20
);
1207 tz
= _mm256_mul_ps(fscal
,dz20
);
1209 /* Update vectorial force */
1210 fix2
= _mm256_add_ps(fix2
,tx
);
1211 fiy2
= _mm256_add_ps(fiy2
,ty
);
1212 fiz2
= _mm256_add_ps(fiz2
,tz
);
1214 fjx0
= _mm256_add_ps(fjx0
,tx
);
1215 fjy0
= _mm256_add_ps(fjy0
,ty
);
1216 fjz0
= _mm256_add_ps(fjz0
,tz
);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 if (gmx_mm256_any_lt(rsq30
,rcutoff2
))
1227 r30
= _mm256_mul_ps(rsq30
,rinv30
);
1229 /* Compute parameters for interactions between i and j atoms */
1230 qq30
= _mm256_mul_ps(iq3
,jq0
);
1232 /* EWALD ELECTROSTATICS */
1234 /* Analytical PME correction */
1235 zeta2
= _mm256_mul_ps(beta2
,rsq30
);
1236 rinv3
= _mm256_mul_ps(rinvsq30
,rinv30
);
1237 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1238 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1239 felec
= _mm256_mul_ps(qq30
,felec
);
1241 cutoff_mask
= _mm256_cmp_ps(rsq30
,rcutoff2
,_CMP_LT_OQ
);
1245 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1247 /* Calculate temporary vectorial force */
1248 tx
= _mm256_mul_ps(fscal
,dx30
);
1249 ty
= _mm256_mul_ps(fscal
,dy30
);
1250 tz
= _mm256_mul_ps(fscal
,dz30
);
1252 /* Update vectorial force */
1253 fix3
= _mm256_add_ps(fix3
,tx
);
1254 fiy3
= _mm256_add_ps(fiy3
,ty
);
1255 fiz3
= _mm256_add_ps(fiz3
,tz
);
1257 fjx0
= _mm256_add_ps(fjx0
,tx
);
1258 fjy0
= _mm256_add_ps(fjy0
,ty
);
1259 fjz0
= _mm256_add_ps(fjz0
,tz
);
1263 fjptrA
= f
+j_coord_offsetA
;
1264 fjptrB
= f
+j_coord_offsetB
;
1265 fjptrC
= f
+j_coord_offsetC
;
1266 fjptrD
= f
+j_coord_offsetD
;
1267 fjptrE
= f
+j_coord_offsetE
;
1268 fjptrF
= f
+j_coord_offsetF
;
1269 fjptrG
= f
+j_coord_offsetG
;
1270 fjptrH
= f
+j_coord_offsetH
;
1272 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
1274 /* Inner loop uses 210 flops */
1277 if(jidx
<j_index_end
)
1280 /* Get j neighbor index, and coordinate index */
1281 jnrlistA
= jjnr
[jidx
];
1282 jnrlistB
= jjnr
[jidx
+1];
1283 jnrlistC
= jjnr
[jidx
+2];
1284 jnrlistD
= jjnr
[jidx
+3];
1285 jnrlistE
= jjnr
[jidx
+4];
1286 jnrlistF
= jjnr
[jidx
+5];
1287 jnrlistG
= jjnr
[jidx
+6];
1288 jnrlistH
= jjnr
[jidx
+7];
1289 /* Sign of each element will be negative for non-real atoms.
1290 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1291 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1293 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
1294 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
1296 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1297 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1298 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1299 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1300 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
1301 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
1302 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
1303 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
1304 j_coord_offsetA
= DIM
*jnrA
;
1305 j_coord_offsetB
= DIM
*jnrB
;
1306 j_coord_offsetC
= DIM
*jnrC
;
1307 j_coord_offsetD
= DIM
*jnrD
;
1308 j_coord_offsetE
= DIM
*jnrE
;
1309 j_coord_offsetF
= DIM
*jnrF
;
1310 j_coord_offsetG
= DIM
*jnrG
;
1311 j_coord_offsetH
= DIM
*jnrH
;
1313 /* load j atom coordinates */
1314 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1315 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1316 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1317 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1320 /* Calculate displacement vector */
1321 dx00
= _mm256_sub_ps(ix0
,jx0
);
1322 dy00
= _mm256_sub_ps(iy0
,jy0
);
1323 dz00
= _mm256_sub_ps(iz0
,jz0
);
1324 dx10
= _mm256_sub_ps(ix1
,jx0
);
1325 dy10
= _mm256_sub_ps(iy1
,jy0
);
1326 dz10
= _mm256_sub_ps(iz1
,jz0
);
1327 dx20
= _mm256_sub_ps(ix2
,jx0
);
1328 dy20
= _mm256_sub_ps(iy2
,jy0
);
1329 dz20
= _mm256_sub_ps(iz2
,jz0
);
1330 dx30
= _mm256_sub_ps(ix3
,jx0
);
1331 dy30
= _mm256_sub_ps(iy3
,jy0
);
1332 dz30
= _mm256_sub_ps(iz3
,jz0
);
1334 /* Calculate squared distance and things based on it */
1335 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1336 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
1337 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
1338 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
1340 rinv10
= avx256_invsqrt_f(rsq10
);
1341 rinv20
= avx256_invsqrt_f(rsq20
);
1342 rinv30
= avx256_invsqrt_f(rsq30
);
1344 rinvsq00
= avx256_inv_f(rsq00
);
1345 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
1346 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
1347 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
1349 /* Load parameters for j particles */
1350 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1351 charge
+jnrC
+0,charge
+jnrD
+0,
1352 charge
+jnrE
+0,charge
+jnrF
+0,
1353 charge
+jnrG
+0,charge
+jnrH
+0);
1354 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1355 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
1356 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
1357 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
1358 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
1359 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
1360 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
1361 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
1363 fjx0
= _mm256_setzero_ps();
1364 fjy0
= _mm256_setzero_ps();
1365 fjz0
= _mm256_setzero_ps();
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1374 /* Compute parameters for interactions between i and j atoms */
1375 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
1376 vdwioffsetptr0
+vdwjidx0B
,
1377 vdwioffsetptr0
+vdwjidx0C
,
1378 vdwioffsetptr0
+vdwjidx0D
,
1379 vdwioffsetptr0
+vdwjidx0E
,
1380 vdwioffsetptr0
+vdwjidx0F
,
1381 vdwioffsetptr0
+vdwjidx0G
,
1382 vdwioffsetptr0
+vdwjidx0H
,
1385 /* LENNARD-JONES DISPERSION/REPULSION */
1387 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1388 fvdw
= _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),c6_00
),_mm256_mul_ps(rinvsix
,rinvsq00
));
1390 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1394 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1396 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1398 /* Calculate temporary vectorial force */
1399 tx
= _mm256_mul_ps(fscal
,dx00
);
1400 ty
= _mm256_mul_ps(fscal
,dy00
);
1401 tz
= _mm256_mul_ps(fscal
,dz00
);
1403 /* Update vectorial force */
1404 fix0
= _mm256_add_ps(fix0
,tx
);
1405 fiy0
= _mm256_add_ps(fiy0
,ty
);
1406 fiz0
= _mm256_add_ps(fiz0
,tz
);
1408 fjx0
= _mm256_add_ps(fjx0
,tx
);
1409 fjy0
= _mm256_add_ps(fjy0
,ty
);
1410 fjz0
= _mm256_add_ps(fjz0
,tz
);
1414 /**************************
1415 * CALCULATE INTERACTIONS *
1416 **************************/
1418 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
1421 r10
= _mm256_mul_ps(rsq10
,rinv10
);
1422 r10
= _mm256_andnot_ps(dummy_mask
,r10
);
1424 /* Compute parameters for interactions between i and j atoms */
1425 qq10
= _mm256_mul_ps(iq1
,jq0
);
1427 /* EWALD ELECTROSTATICS */
1429 /* Analytical PME correction */
1430 zeta2
= _mm256_mul_ps(beta2
,rsq10
);
1431 rinv3
= _mm256_mul_ps(rinvsq10
,rinv10
);
1432 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1433 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1434 felec
= _mm256_mul_ps(qq10
,felec
);
1436 cutoff_mask
= _mm256_cmp_ps(rsq10
,rcutoff2
,_CMP_LT_OQ
);
1440 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1442 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1444 /* Calculate temporary vectorial force */
1445 tx
= _mm256_mul_ps(fscal
,dx10
);
1446 ty
= _mm256_mul_ps(fscal
,dy10
);
1447 tz
= _mm256_mul_ps(fscal
,dz10
);
1449 /* Update vectorial force */
1450 fix1
= _mm256_add_ps(fix1
,tx
);
1451 fiy1
= _mm256_add_ps(fiy1
,ty
);
1452 fiz1
= _mm256_add_ps(fiz1
,tz
);
1454 fjx0
= _mm256_add_ps(fjx0
,tx
);
1455 fjy0
= _mm256_add_ps(fjy0
,ty
);
1456 fjz0
= _mm256_add_ps(fjz0
,tz
);
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
1467 r20
= _mm256_mul_ps(rsq20
,rinv20
);
1468 r20
= _mm256_andnot_ps(dummy_mask
,r20
);
1470 /* Compute parameters for interactions between i and j atoms */
1471 qq20
= _mm256_mul_ps(iq2
,jq0
);
1473 /* EWALD ELECTROSTATICS */
1475 /* Analytical PME correction */
1476 zeta2
= _mm256_mul_ps(beta2
,rsq20
);
1477 rinv3
= _mm256_mul_ps(rinvsq20
,rinv20
);
1478 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1479 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1480 felec
= _mm256_mul_ps(qq20
,felec
);
1482 cutoff_mask
= _mm256_cmp_ps(rsq20
,rcutoff2
,_CMP_LT_OQ
);
1486 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1488 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1490 /* Calculate temporary vectorial force */
1491 tx
= _mm256_mul_ps(fscal
,dx20
);
1492 ty
= _mm256_mul_ps(fscal
,dy20
);
1493 tz
= _mm256_mul_ps(fscal
,dz20
);
1495 /* Update vectorial force */
1496 fix2
= _mm256_add_ps(fix2
,tx
);
1497 fiy2
= _mm256_add_ps(fiy2
,ty
);
1498 fiz2
= _mm256_add_ps(fiz2
,tz
);
1500 fjx0
= _mm256_add_ps(fjx0
,tx
);
1501 fjy0
= _mm256_add_ps(fjy0
,ty
);
1502 fjz0
= _mm256_add_ps(fjz0
,tz
);
1506 /**************************
1507 * CALCULATE INTERACTIONS *
1508 **************************/
1510 if (gmx_mm256_any_lt(rsq30
,rcutoff2
))
1513 r30
= _mm256_mul_ps(rsq30
,rinv30
);
1514 r30
= _mm256_andnot_ps(dummy_mask
,r30
);
1516 /* Compute parameters for interactions between i and j atoms */
1517 qq30
= _mm256_mul_ps(iq3
,jq0
);
1519 /* EWALD ELECTROSTATICS */
1521 /* Analytical PME correction */
1522 zeta2
= _mm256_mul_ps(beta2
,rsq30
);
1523 rinv3
= _mm256_mul_ps(rinvsq30
,rinv30
);
1524 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1525 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1526 felec
= _mm256_mul_ps(qq30
,felec
);
1528 cutoff_mask
= _mm256_cmp_ps(rsq30
,rcutoff2
,_CMP_LT_OQ
);
1532 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1534 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1536 /* Calculate temporary vectorial force */
1537 tx
= _mm256_mul_ps(fscal
,dx30
);
1538 ty
= _mm256_mul_ps(fscal
,dy30
);
1539 tz
= _mm256_mul_ps(fscal
,dz30
);
1541 /* Update vectorial force */
1542 fix3
= _mm256_add_ps(fix3
,tx
);
1543 fiy3
= _mm256_add_ps(fiy3
,ty
);
1544 fiz3
= _mm256_add_ps(fiz3
,tz
);
1546 fjx0
= _mm256_add_ps(fjx0
,tx
);
1547 fjy0
= _mm256_add_ps(fjy0
,ty
);
1548 fjz0
= _mm256_add_ps(fjz0
,tz
);
1552 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1553 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1554 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1555 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1556 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
1557 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
1558 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
1559 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
1561 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
1563 /* Inner loop uses 213 flops */
1566 /* End of innermost loop */
1568 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1569 f
+i_coord_offset
,fshift
+i_shift_offset
);
1571 /* Increment number of inner iterations */
1572 inneriter
+= j_index_end
- j_index_start
;
1574 /* Outer loop uses 24 flops */
1577 /* Increment number of outer iterations */
1580 /* Update outer/inner flops */
1582 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*213);