/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            LennardJones
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
57 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_128_fma_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
89 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
91 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
92 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
93 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
94 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
95 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
96 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
97 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
98 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
101 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
106 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
109 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
112 __m128 ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
113 __m128 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
115 __m128 dummy_mask
,cutoff_mask
;
116 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one
= _mm_set1_ps(1.0);
118 __m128 two
= _mm_set1_ps(2.0);
124 jindex
= nlist
->jindex
;
126 shiftidx
= nlist
->shift
;
128 shiftvec
= fr
->shift_vec
[0];
129 fshift
= fr
->fshift
[0];
130 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
131 charge
= mdatoms
->chargeA
;
132 nvdwtype
= fr
->ntype
;
134 vdwtype
= mdatoms
->typeA
;
136 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
137 beta
= _mm_set1_ps(fr
->ic
->ewaldcoeff_q
);
138 beta2
= _mm_mul_ps(beta
,beta
);
139 beta3
= _mm_mul_ps(beta
,beta2
);
140 ewtab
= fr
->ic
->tabq_coul_FDV0
;
141 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
142 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
144 /* Setup water-specific parameters */
145 inr
= nlist
->iinr
[0];
146 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
147 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
148 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
149 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
151 jq0
= _mm_set1_ps(charge
[inr
+0]);
152 jq1
= _mm_set1_ps(charge
[inr
+1]);
153 jq2
= _mm_set1_ps(charge
[inr
+2]);
154 vdwjidx0A
= 2*vdwtype
[inr
+0];
155 qq00
= _mm_mul_ps(iq0
,jq0
);
156 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
157 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
158 qq01
= _mm_mul_ps(iq0
,jq1
);
159 qq02
= _mm_mul_ps(iq0
,jq2
);
160 qq10
= _mm_mul_ps(iq1
,jq0
);
161 qq11
= _mm_mul_ps(iq1
,jq1
);
162 qq12
= _mm_mul_ps(iq1
,jq2
);
163 qq20
= _mm_mul_ps(iq2
,jq0
);
164 qq21
= _mm_mul_ps(iq2
,jq1
);
165 qq22
= _mm_mul_ps(iq2
,jq2
);
167 /* Avoid stupid compiler warnings */
168 jnrA
= jnrB
= jnrC
= jnrD
= 0;
177 for(iidx
=0;iidx
<4*DIM
;iidx
++)
182 /* Start outer loop over neighborlists */
183 for(iidx
=0; iidx
<nri
; iidx
++)
185 /* Load shift vector for this list */
186 i_shift_offset
= DIM
*shiftidx
[iidx
];
188 /* Load limits for loop over neighbors */
189 j_index_start
= jindex
[iidx
];
190 j_index_end
= jindex
[iidx
+1];
192 /* Get outer coordinate index */
194 i_coord_offset
= DIM
*inr
;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
198 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
200 fix0
= _mm_setzero_ps();
201 fiy0
= _mm_setzero_ps();
202 fiz0
= _mm_setzero_ps();
203 fix1
= _mm_setzero_ps();
204 fiy1
= _mm_setzero_ps();
205 fiz1
= _mm_setzero_ps();
206 fix2
= _mm_setzero_ps();
207 fiy2
= _mm_setzero_ps();
208 fiz2
= _mm_setzero_ps();
210 /* Reset potential sums */
211 velecsum
= _mm_setzero_ps();
212 vvdwsum
= _mm_setzero_ps();
214 /* Start inner kernel loop */
215 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
218 /* Get j neighbor index, and coordinate index */
223 j_coord_offsetA
= DIM
*jnrA
;
224 j_coord_offsetB
= DIM
*jnrB
;
225 j_coord_offsetC
= DIM
*jnrC
;
226 j_coord_offsetD
= DIM
*jnrD
;
228 /* load j atom coordinates */
229 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
230 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
231 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
233 /* Calculate displacement vector */
234 dx00
= _mm_sub_ps(ix0
,jx0
);
235 dy00
= _mm_sub_ps(iy0
,jy0
);
236 dz00
= _mm_sub_ps(iz0
,jz0
);
237 dx01
= _mm_sub_ps(ix0
,jx1
);
238 dy01
= _mm_sub_ps(iy0
,jy1
);
239 dz01
= _mm_sub_ps(iz0
,jz1
);
240 dx02
= _mm_sub_ps(ix0
,jx2
);
241 dy02
= _mm_sub_ps(iy0
,jy2
);
242 dz02
= _mm_sub_ps(iz0
,jz2
);
243 dx10
= _mm_sub_ps(ix1
,jx0
);
244 dy10
= _mm_sub_ps(iy1
,jy0
);
245 dz10
= _mm_sub_ps(iz1
,jz0
);
246 dx11
= _mm_sub_ps(ix1
,jx1
);
247 dy11
= _mm_sub_ps(iy1
,jy1
);
248 dz11
= _mm_sub_ps(iz1
,jz1
);
249 dx12
= _mm_sub_ps(ix1
,jx2
);
250 dy12
= _mm_sub_ps(iy1
,jy2
);
251 dz12
= _mm_sub_ps(iz1
,jz2
);
252 dx20
= _mm_sub_ps(ix2
,jx0
);
253 dy20
= _mm_sub_ps(iy2
,jy0
);
254 dz20
= _mm_sub_ps(iz2
,jz0
);
255 dx21
= _mm_sub_ps(ix2
,jx1
);
256 dy21
= _mm_sub_ps(iy2
,jy1
);
257 dz21
= _mm_sub_ps(iz2
,jz1
);
258 dx22
= _mm_sub_ps(ix2
,jx2
);
259 dy22
= _mm_sub_ps(iy2
,jy2
);
260 dz22
= _mm_sub_ps(iz2
,jz2
);
262 /* Calculate squared distance and things based on it */
263 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
264 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
265 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
266 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
267 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
268 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
269 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
270 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
271 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
273 rinv00
= avx128fma_invsqrt_f(rsq00
);
274 rinv01
= avx128fma_invsqrt_f(rsq01
);
275 rinv02
= avx128fma_invsqrt_f(rsq02
);
276 rinv10
= avx128fma_invsqrt_f(rsq10
);
277 rinv11
= avx128fma_invsqrt_f(rsq11
);
278 rinv12
= avx128fma_invsqrt_f(rsq12
);
279 rinv20
= avx128fma_invsqrt_f(rsq20
);
280 rinv21
= avx128fma_invsqrt_f(rsq21
);
281 rinv22
= avx128fma_invsqrt_f(rsq22
);
283 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
284 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
285 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
286 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
287 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
288 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
289 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
290 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
291 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
293 fjx0
= _mm_setzero_ps();
294 fjy0
= _mm_setzero_ps();
295 fjz0
= _mm_setzero_ps();
296 fjx1
= _mm_setzero_ps();
297 fjy1
= _mm_setzero_ps();
298 fjz1
= _mm_setzero_ps();
299 fjx2
= _mm_setzero_ps();
300 fjy2
= _mm_setzero_ps();
301 fjz2
= _mm_setzero_ps();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 r00
= _mm_mul_ps(rsq00
,rinv00
);
309 /* EWALD ELECTROSTATICS */
311 /* Analytical PME correction */
312 zeta2
= _mm_mul_ps(beta2
,rsq00
);
313 rinv3
= _mm_mul_ps(rinvsq00
,rinv00
);
314 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
315 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
316 felec
= _mm_mul_ps(qq00
,felec
);
317 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
318 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv00
);
319 velec
= _mm_mul_ps(qq00
,velec
);
321 /* LENNARD-JONES DISPERSION/REPULSION */
323 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
324 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
325 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
326 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
327 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velecsum
= _mm_add_ps(velecsum
,velec
);
331 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
333 fscal
= _mm_add_ps(felec
,fvdw
);
335 /* Update vectorial force */
336 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
337 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
338 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
340 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
341 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
342 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
344 /**************************
345 * CALCULATE INTERACTIONS *
346 **************************/
348 r01
= _mm_mul_ps(rsq01
,rinv01
);
350 /* EWALD ELECTROSTATICS */
352 /* Analytical PME correction */
353 zeta2
= _mm_mul_ps(beta2
,rsq01
);
354 rinv3
= _mm_mul_ps(rinvsq01
,rinv01
);
355 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
356 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
357 felec
= _mm_mul_ps(qq01
,felec
);
358 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
359 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv01
);
360 velec
= _mm_mul_ps(qq01
,velec
);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum
= _mm_add_ps(velecsum
,velec
);
367 /* Update vectorial force */
368 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
369 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
370 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
372 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
373 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
374 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r02
= _mm_mul_ps(rsq02
,rinv02
);
382 /* EWALD ELECTROSTATICS */
384 /* Analytical PME correction */
385 zeta2
= _mm_mul_ps(beta2
,rsq02
);
386 rinv3
= _mm_mul_ps(rinvsq02
,rinv02
);
387 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
388 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
389 felec
= _mm_mul_ps(qq02
,felec
);
390 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
391 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv02
);
392 velec
= _mm_mul_ps(qq02
,velec
);
394 /* Update potential sum for this i atom from the interaction with this j atom. */
395 velecsum
= _mm_add_ps(velecsum
,velec
);
399 /* Update vectorial force */
400 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
401 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
402 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
404 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
405 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
406 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 r10
= _mm_mul_ps(rsq10
,rinv10
);
414 /* EWALD ELECTROSTATICS */
416 /* Analytical PME correction */
417 zeta2
= _mm_mul_ps(beta2
,rsq10
);
418 rinv3
= _mm_mul_ps(rinvsq10
,rinv10
);
419 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
420 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
421 felec
= _mm_mul_ps(qq10
,felec
);
422 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
423 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv10
);
424 velec
= _mm_mul_ps(qq10
,velec
);
426 /* Update potential sum for this i atom from the interaction with this j atom. */
427 velecsum
= _mm_add_ps(velecsum
,velec
);
431 /* Update vectorial force */
432 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
433 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
434 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
436 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
437 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
438 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
440 /**************************
441 * CALCULATE INTERACTIONS *
442 **************************/
444 r11
= _mm_mul_ps(rsq11
,rinv11
);
446 /* EWALD ELECTROSTATICS */
448 /* Analytical PME correction */
449 zeta2
= _mm_mul_ps(beta2
,rsq11
);
450 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
451 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
452 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
453 felec
= _mm_mul_ps(qq11
,felec
);
454 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
455 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv11
);
456 velec
= _mm_mul_ps(qq11
,velec
);
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum
= _mm_add_ps(velecsum
,velec
);
463 /* Update vectorial force */
464 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
465 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
466 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
468 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
469 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
470 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
476 r12
= _mm_mul_ps(rsq12
,rinv12
);
478 /* EWALD ELECTROSTATICS */
480 /* Analytical PME correction */
481 zeta2
= _mm_mul_ps(beta2
,rsq12
);
482 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
483 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
484 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
485 felec
= _mm_mul_ps(qq12
,felec
);
486 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
487 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv12
);
488 velec
= _mm_mul_ps(qq12
,velec
);
490 /* Update potential sum for this i atom from the interaction with this j atom. */
491 velecsum
= _mm_add_ps(velecsum
,velec
);
495 /* Update vectorial force */
496 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
497 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
498 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
500 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
501 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
502 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 r20
= _mm_mul_ps(rsq20
,rinv20
);
510 /* EWALD ELECTROSTATICS */
512 /* Analytical PME correction */
513 zeta2
= _mm_mul_ps(beta2
,rsq20
);
514 rinv3
= _mm_mul_ps(rinvsq20
,rinv20
);
515 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
516 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
517 felec
= _mm_mul_ps(qq20
,felec
);
518 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
519 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv20
);
520 velec
= _mm_mul_ps(qq20
,velec
);
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum
= _mm_add_ps(velecsum
,velec
);
527 /* Update vectorial force */
528 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
529 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
530 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
532 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
533 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
534 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 r21
= _mm_mul_ps(rsq21
,rinv21
);
542 /* EWALD ELECTROSTATICS */
544 /* Analytical PME correction */
545 zeta2
= _mm_mul_ps(beta2
,rsq21
);
546 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
547 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
548 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
549 felec
= _mm_mul_ps(qq21
,felec
);
550 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
551 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv21
);
552 velec
= _mm_mul_ps(qq21
,velec
);
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velecsum
= _mm_add_ps(velecsum
,velec
);
559 /* Update vectorial force */
560 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
561 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
562 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
564 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
565 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
566 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 r22
= _mm_mul_ps(rsq22
,rinv22
);
574 /* EWALD ELECTROSTATICS */
576 /* Analytical PME correction */
577 zeta2
= _mm_mul_ps(beta2
,rsq22
);
578 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
579 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
580 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
581 felec
= _mm_mul_ps(qq22
,felec
);
582 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
583 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv22
);
584 velec
= _mm_mul_ps(qq22
,velec
);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velecsum
= _mm_add_ps(velecsum
,velec
);
591 /* Update vectorial force */
592 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
593 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
594 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
596 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
597 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
598 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
600 fjptrA
= f
+j_coord_offsetA
;
601 fjptrB
= f
+j_coord_offsetB
;
602 fjptrC
= f
+j_coord_offsetC
;
603 fjptrD
= f
+j_coord_offsetD
;
605 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
606 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
608 /* Inner loop uses 273 flops */
614 /* Get j neighbor index, and coordinate index */
615 jnrlistA
= jjnr
[jidx
];
616 jnrlistB
= jjnr
[jidx
+1];
617 jnrlistC
= jjnr
[jidx
+2];
618 jnrlistD
= jjnr
[jidx
+3];
619 /* Sign of each element will be negative for non-real atoms.
620 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
621 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
623 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
624 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
625 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
626 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
627 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
628 j_coord_offsetA
= DIM
*jnrA
;
629 j_coord_offsetB
= DIM
*jnrB
;
630 j_coord_offsetC
= DIM
*jnrC
;
631 j_coord_offsetD
= DIM
*jnrD
;
633 /* load j atom coordinates */
634 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
635 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
636 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
638 /* Calculate displacement vector */
639 dx00
= _mm_sub_ps(ix0
,jx0
);
640 dy00
= _mm_sub_ps(iy0
,jy0
);
641 dz00
= _mm_sub_ps(iz0
,jz0
);
642 dx01
= _mm_sub_ps(ix0
,jx1
);
643 dy01
= _mm_sub_ps(iy0
,jy1
);
644 dz01
= _mm_sub_ps(iz0
,jz1
);
645 dx02
= _mm_sub_ps(ix0
,jx2
);
646 dy02
= _mm_sub_ps(iy0
,jy2
);
647 dz02
= _mm_sub_ps(iz0
,jz2
);
648 dx10
= _mm_sub_ps(ix1
,jx0
);
649 dy10
= _mm_sub_ps(iy1
,jy0
);
650 dz10
= _mm_sub_ps(iz1
,jz0
);
651 dx11
= _mm_sub_ps(ix1
,jx1
);
652 dy11
= _mm_sub_ps(iy1
,jy1
);
653 dz11
= _mm_sub_ps(iz1
,jz1
);
654 dx12
= _mm_sub_ps(ix1
,jx2
);
655 dy12
= _mm_sub_ps(iy1
,jy2
);
656 dz12
= _mm_sub_ps(iz1
,jz2
);
657 dx20
= _mm_sub_ps(ix2
,jx0
);
658 dy20
= _mm_sub_ps(iy2
,jy0
);
659 dz20
= _mm_sub_ps(iz2
,jz0
);
660 dx21
= _mm_sub_ps(ix2
,jx1
);
661 dy21
= _mm_sub_ps(iy2
,jy1
);
662 dz21
= _mm_sub_ps(iz2
,jz1
);
663 dx22
= _mm_sub_ps(ix2
,jx2
);
664 dy22
= _mm_sub_ps(iy2
,jy2
);
665 dz22
= _mm_sub_ps(iz2
,jz2
);
667 /* Calculate squared distance and things based on it */
668 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
669 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
670 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
671 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
672 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
673 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
674 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
675 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
676 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
678 rinv00
= avx128fma_invsqrt_f(rsq00
);
679 rinv01
= avx128fma_invsqrt_f(rsq01
);
680 rinv02
= avx128fma_invsqrt_f(rsq02
);
681 rinv10
= avx128fma_invsqrt_f(rsq10
);
682 rinv11
= avx128fma_invsqrt_f(rsq11
);
683 rinv12
= avx128fma_invsqrt_f(rsq12
);
684 rinv20
= avx128fma_invsqrt_f(rsq20
);
685 rinv21
= avx128fma_invsqrt_f(rsq21
);
686 rinv22
= avx128fma_invsqrt_f(rsq22
);
688 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
689 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
690 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
691 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
692 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
693 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
694 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
695 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
696 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
698 fjx0
= _mm_setzero_ps();
699 fjy0
= _mm_setzero_ps();
700 fjz0
= _mm_setzero_ps();
701 fjx1
= _mm_setzero_ps();
702 fjy1
= _mm_setzero_ps();
703 fjz1
= _mm_setzero_ps();
704 fjx2
= _mm_setzero_ps();
705 fjy2
= _mm_setzero_ps();
706 fjz2
= _mm_setzero_ps();
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 r00
= _mm_mul_ps(rsq00
,rinv00
);
713 r00
= _mm_andnot_ps(dummy_mask
,r00
);
715 /* EWALD ELECTROSTATICS */
717 /* Analytical PME correction */
718 zeta2
= _mm_mul_ps(beta2
,rsq00
);
719 rinv3
= _mm_mul_ps(rinvsq00
,rinv00
);
720 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
721 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
722 felec
= _mm_mul_ps(qq00
,felec
);
723 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
724 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv00
);
725 velec
= _mm_mul_ps(qq00
,velec
);
727 /* LENNARD-JONES DISPERSION/REPULSION */
729 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
730 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
731 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
732 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
733 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec
= _mm_andnot_ps(dummy_mask
,velec
);
737 velecsum
= _mm_add_ps(velecsum
,velec
);
738 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
739 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
741 fscal
= _mm_add_ps(felec
,fvdw
);
743 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
745 /* Update vectorial force */
746 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
747 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
748 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
750 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
751 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
752 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 r01
= _mm_mul_ps(rsq01
,rinv01
);
759 r01
= _mm_andnot_ps(dummy_mask
,r01
);
761 /* EWALD ELECTROSTATICS */
763 /* Analytical PME correction */
764 zeta2
= _mm_mul_ps(beta2
,rsq01
);
765 rinv3
= _mm_mul_ps(rinvsq01
,rinv01
);
766 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
767 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
768 felec
= _mm_mul_ps(qq01
,felec
);
769 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
770 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv01
);
771 velec
= _mm_mul_ps(qq01
,velec
);
773 /* Update potential sum for this i atom from the interaction with this j atom. */
774 velec
= _mm_andnot_ps(dummy_mask
,velec
);
775 velecsum
= _mm_add_ps(velecsum
,velec
);
779 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
781 /* Update vectorial force */
782 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
783 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
784 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
786 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
787 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
788 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
790 /**************************
791 * CALCULATE INTERACTIONS *
792 **************************/
794 r02
= _mm_mul_ps(rsq02
,rinv02
);
795 r02
= _mm_andnot_ps(dummy_mask
,r02
);
797 /* EWALD ELECTROSTATICS */
799 /* Analytical PME correction */
800 zeta2
= _mm_mul_ps(beta2
,rsq02
);
801 rinv3
= _mm_mul_ps(rinvsq02
,rinv02
);
802 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
803 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
804 felec
= _mm_mul_ps(qq02
,felec
);
805 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
806 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv02
);
807 velec
= _mm_mul_ps(qq02
,velec
);
809 /* Update potential sum for this i atom from the interaction with this j atom. */
810 velec
= _mm_andnot_ps(dummy_mask
,velec
);
811 velecsum
= _mm_add_ps(velecsum
,velec
);
815 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
817 /* Update vectorial force */
818 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
819 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
820 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
822 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
823 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
824 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 r10
= _mm_mul_ps(rsq10
,rinv10
);
831 r10
= _mm_andnot_ps(dummy_mask
,r10
);
833 /* EWALD ELECTROSTATICS */
835 /* Analytical PME correction */
836 zeta2
= _mm_mul_ps(beta2
,rsq10
);
837 rinv3
= _mm_mul_ps(rinvsq10
,rinv10
);
838 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
839 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
840 felec
= _mm_mul_ps(qq10
,felec
);
841 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
842 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv10
);
843 velec
= _mm_mul_ps(qq10
,velec
);
845 /* Update potential sum for this i atom from the interaction with this j atom. */
846 velec
= _mm_andnot_ps(dummy_mask
,velec
);
847 velecsum
= _mm_add_ps(velecsum
,velec
);
851 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
853 /* Update vectorial force */
854 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
855 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
856 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
858 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
859 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
860 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 r11
= _mm_mul_ps(rsq11
,rinv11
);
867 r11
= _mm_andnot_ps(dummy_mask
,r11
);
869 /* EWALD ELECTROSTATICS */
871 /* Analytical PME correction */
872 zeta2
= _mm_mul_ps(beta2
,rsq11
);
873 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
874 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
875 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
876 felec
= _mm_mul_ps(qq11
,felec
);
877 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
878 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv11
);
879 velec
= _mm_mul_ps(qq11
,velec
);
881 /* Update potential sum for this i atom from the interaction with this j atom. */
882 velec
= _mm_andnot_ps(dummy_mask
,velec
);
883 velecsum
= _mm_add_ps(velecsum
,velec
);
887 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
889 /* Update vectorial force */
890 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
891 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
892 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
894 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
895 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
896 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
898 /**************************
899 * CALCULATE INTERACTIONS *
900 **************************/
902 r12
= _mm_mul_ps(rsq12
,rinv12
);
903 r12
= _mm_andnot_ps(dummy_mask
,r12
);
905 /* EWALD ELECTROSTATICS */
907 /* Analytical PME correction */
908 zeta2
= _mm_mul_ps(beta2
,rsq12
);
909 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
910 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
911 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
912 felec
= _mm_mul_ps(qq12
,felec
);
913 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
914 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv12
);
915 velec
= _mm_mul_ps(qq12
,velec
);
917 /* Update potential sum for this i atom from the interaction with this j atom. */
918 velec
= _mm_andnot_ps(dummy_mask
,velec
);
919 velecsum
= _mm_add_ps(velecsum
,velec
);
923 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
925 /* Update vectorial force */
926 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
927 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
928 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
930 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
931 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
932 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r20
= _mm_mul_ps(rsq20
,rinv20
);
939 r20
= _mm_andnot_ps(dummy_mask
,r20
);
941 /* EWALD ELECTROSTATICS */
943 /* Analytical PME correction */
944 zeta2
= _mm_mul_ps(beta2
,rsq20
);
945 rinv3
= _mm_mul_ps(rinvsq20
,rinv20
);
946 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
947 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
948 felec
= _mm_mul_ps(qq20
,felec
);
949 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
950 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv20
);
951 velec
= _mm_mul_ps(qq20
,velec
);
953 /* Update potential sum for this i atom from the interaction with this j atom. */
954 velec
= _mm_andnot_ps(dummy_mask
,velec
);
955 velecsum
= _mm_add_ps(velecsum
,velec
);
959 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
961 /* Update vectorial force */
962 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
963 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
964 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
966 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
967 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
968 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r21
= _mm_mul_ps(rsq21
,rinv21
);
975 r21
= _mm_andnot_ps(dummy_mask
,r21
);
977 /* EWALD ELECTROSTATICS */
979 /* Analytical PME correction */
980 zeta2
= _mm_mul_ps(beta2
,rsq21
);
981 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
982 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
983 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
984 felec
= _mm_mul_ps(qq21
,felec
);
985 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
986 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv21
);
987 velec
= _mm_mul_ps(qq21
,velec
);
989 /* Update potential sum for this i atom from the interaction with this j atom. */
990 velec
= _mm_andnot_ps(dummy_mask
,velec
);
991 velecsum
= _mm_add_ps(velecsum
,velec
);
995 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
997 /* Update vectorial force */
998 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
999 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1000 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1002 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1003 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1004 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 r22
= _mm_mul_ps(rsq22
,rinv22
);
1011 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1013 /* EWALD ELECTROSTATICS */
1015 /* Analytical PME correction */
1016 zeta2
= _mm_mul_ps(beta2
,rsq22
);
1017 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
1018 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1019 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1020 felec
= _mm_mul_ps(qq22
,felec
);
1021 pmecorrV
= avx128fma_pmecorrV_f(zeta2
);
1022 velec
= _mm_nmacc_ps(pmecorrV
,beta
,rinv22
);
1023 velec
= _mm_mul_ps(qq22
,velec
);
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1027 velecsum
= _mm_add_ps(velecsum
,velec
);
1031 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1033 /* Update vectorial force */
1034 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1035 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1036 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1038 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1039 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1040 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1042 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1043 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1044 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1045 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1047 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1048 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1050 /* Inner loop uses 282 flops */
1053 /* End of innermost loop */
1055 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1056 f
+i_coord_offset
,fshift
+i_shift_offset
);
1059 /* Update potential energies */
1060 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1061 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1063 /* Increment number of inner iterations */
1064 inneriter
+= j_index_end
- j_index_start
;
1066 /* Outer loop uses 20 flops */
1069 /* Increment number of outer iterations */
1072 /* Update outer/inner flops */
1074 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*282);
1077 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single
1078 * Electrostatics interaction: Ewald
1079 * VdW interaction: LennardJones
1080 * Geometry: Water3-Water3
1081 * Calculate force/pot: Force
1084 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_128_fma_single
1085 (t_nblist
* gmx_restrict nlist
,
1086 rvec
* gmx_restrict xx
,
1087 rvec
* gmx_restrict ff
,
1088 struct t_forcerec
* gmx_restrict fr
,
1089 t_mdatoms
* gmx_restrict mdatoms
,
1090 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1091 t_nrnb
* gmx_restrict nrnb
)
1093 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1094 * just 0 for non-waters.
1095 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1096 * jnr indices corresponding to data put in the four positions in the SIMD register.
1098 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1099 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1100 int jnrA
,jnrB
,jnrC
,jnrD
;
1101 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1102 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1103 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1104 real rcutoff_scalar
;
1105 real
*shiftvec
,*fshift
,*x
,*f
;
1106 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1107 real scratch
[4*DIM
];
1108 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
1110 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1112 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1114 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1115 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1116 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1117 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1118 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1119 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1120 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1121 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1122 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1123 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1124 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1125 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1126 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1127 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1128 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1129 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1130 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1133 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1136 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1137 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1139 __m128 ewtabscale
,eweps
,twoeweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1140 __m128 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1142 __m128 dummy_mask
,cutoff_mask
;
1143 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1144 __m128 one
= _mm_set1_ps(1.0);
1145 __m128 two
= _mm_set1_ps(2.0);
1151 jindex
= nlist
->jindex
;
1153 shiftidx
= nlist
->shift
;
1155 shiftvec
= fr
->shift_vec
[0];
1156 fshift
= fr
->fshift
[0];
1157 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1158 charge
= mdatoms
->chargeA
;
1159 nvdwtype
= fr
->ntype
;
1160 vdwparam
= fr
->nbfp
;
1161 vdwtype
= mdatoms
->typeA
;
1163 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
1164 beta
= _mm_set1_ps(fr
->ic
->ewaldcoeff_q
);
1165 beta2
= _mm_mul_ps(beta
,beta
);
1166 beta3
= _mm_mul_ps(beta
,beta2
);
1167 ewtab
= fr
->ic
->tabq_coul_F
;
1168 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
1169 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
1171 /* Setup water-specific parameters */
1172 inr
= nlist
->iinr
[0];
1173 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1174 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1175 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1176 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1178 jq0
= _mm_set1_ps(charge
[inr
+0]);
1179 jq1
= _mm_set1_ps(charge
[inr
+1]);
1180 jq2
= _mm_set1_ps(charge
[inr
+2]);
1181 vdwjidx0A
= 2*vdwtype
[inr
+0];
1182 qq00
= _mm_mul_ps(iq0
,jq0
);
1183 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1184 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1185 qq01
= _mm_mul_ps(iq0
,jq1
);
1186 qq02
= _mm_mul_ps(iq0
,jq2
);
1187 qq10
= _mm_mul_ps(iq1
,jq0
);
1188 qq11
= _mm_mul_ps(iq1
,jq1
);
1189 qq12
= _mm_mul_ps(iq1
,jq2
);
1190 qq20
= _mm_mul_ps(iq2
,jq0
);
1191 qq21
= _mm_mul_ps(iq2
,jq1
);
1192 qq22
= _mm_mul_ps(iq2
,jq2
);
1194 /* Avoid stupid compiler warnings */
1195 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1196 j_coord_offsetA
= 0;
1197 j_coord_offsetB
= 0;
1198 j_coord_offsetC
= 0;
1199 j_coord_offsetD
= 0;
1204 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1206 scratch
[iidx
] = 0.0;
1209 /* Start outer loop over neighborlists */
1210 for(iidx
=0; iidx
<nri
; iidx
++)
1212 /* Load shift vector for this list */
1213 i_shift_offset
= DIM
*shiftidx
[iidx
];
1215 /* Load limits for loop over neighbors */
1216 j_index_start
= jindex
[iidx
];
1217 j_index_end
= jindex
[iidx
+1];
1219 /* Get outer coordinate index */
1221 i_coord_offset
= DIM
*inr
;
1223 /* Load i particle coords and add shift vector */
1224 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1225 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1227 fix0
= _mm_setzero_ps();
1228 fiy0
= _mm_setzero_ps();
1229 fiz0
= _mm_setzero_ps();
1230 fix1
= _mm_setzero_ps();
1231 fiy1
= _mm_setzero_ps();
1232 fiz1
= _mm_setzero_ps();
1233 fix2
= _mm_setzero_ps();
1234 fiy2
= _mm_setzero_ps();
1235 fiz2
= _mm_setzero_ps();
1237 /* Start inner kernel loop */
1238 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1241 /* Get j neighbor index, and coordinate index */
1243 jnrB
= jjnr
[jidx
+1];
1244 jnrC
= jjnr
[jidx
+2];
1245 jnrD
= jjnr
[jidx
+3];
1246 j_coord_offsetA
= DIM
*jnrA
;
1247 j_coord_offsetB
= DIM
*jnrB
;
1248 j_coord_offsetC
= DIM
*jnrC
;
1249 j_coord_offsetD
= DIM
*jnrD
;
1251 /* load j atom coordinates */
1252 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1253 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1254 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1256 /* Calculate displacement vector */
1257 dx00
= _mm_sub_ps(ix0
,jx0
);
1258 dy00
= _mm_sub_ps(iy0
,jy0
);
1259 dz00
= _mm_sub_ps(iz0
,jz0
);
1260 dx01
= _mm_sub_ps(ix0
,jx1
);
1261 dy01
= _mm_sub_ps(iy0
,jy1
);
1262 dz01
= _mm_sub_ps(iz0
,jz1
);
1263 dx02
= _mm_sub_ps(ix0
,jx2
);
1264 dy02
= _mm_sub_ps(iy0
,jy2
);
1265 dz02
= _mm_sub_ps(iz0
,jz2
);
1266 dx10
= _mm_sub_ps(ix1
,jx0
);
1267 dy10
= _mm_sub_ps(iy1
,jy0
);
1268 dz10
= _mm_sub_ps(iz1
,jz0
);
1269 dx11
= _mm_sub_ps(ix1
,jx1
);
1270 dy11
= _mm_sub_ps(iy1
,jy1
);
1271 dz11
= _mm_sub_ps(iz1
,jz1
);
1272 dx12
= _mm_sub_ps(ix1
,jx2
);
1273 dy12
= _mm_sub_ps(iy1
,jy2
);
1274 dz12
= _mm_sub_ps(iz1
,jz2
);
1275 dx20
= _mm_sub_ps(ix2
,jx0
);
1276 dy20
= _mm_sub_ps(iy2
,jy0
);
1277 dz20
= _mm_sub_ps(iz2
,jz0
);
1278 dx21
= _mm_sub_ps(ix2
,jx1
);
1279 dy21
= _mm_sub_ps(iy2
,jy1
);
1280 dz21
= _mm_sub_ps(iz2
,jz1
);
1281 dx22
= _mm_sub_ps(ix2
,jx2
);
1282 dy22
= _mm_sub_ps(iy2
,jy2
);
1283 dz22
= _mm_sub_ps(iz2
,jz2
);
1285 /* Calculate squared distance and things based on it */
1286 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1287 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1288 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1289 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1290 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1291 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1292 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1293 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1294 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1296 rinv00
= avx128fma_invsqrt_f(rsq00
);
1297 rinv01
= avx128fma_invsqrt_f(rsq01
);
1298 rinv02
= avx128fma_invsqrt_f(rsq02
);
1299 rinv10
= avx128fma_invsqrt_f(rsq10
);
1300 rinv11
= avx128fma_invsqrt_f(rsq11
);
1301 rinv12
= avx128fma_invsqrt_f(rsq12
);
1302 rinv20
= avx128fma_invsqrt_f(rsq20
);
1303 rinv21
= avx128fma_invsqrt_f(rsq21
);
1304 rinv22
= avx128fma_invsqrt_f(rsq22
);
1306 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1307 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1308 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1309 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1310 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1311 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1312 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1313 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1314 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1316 fjx0
= _mm_setzero_ps();
1317 fjy0
= _mm_setzero_ps();
1318 fjz0
= _mm_setzero_ps();
1319 fjx1
= _mm_setzero_ps();
1320 fjy1
= _mm_setzero_ps();
1321 fjz1
= _mm_setzero_ps();
1322 fjx2
= _mm_setzero_ps();
1323 fjy2
= _mm_setzero_ps();
1324 fjz2
= _mm_setzero_ps();
1326 /**************************
1327 * CALCULATE INTERACTIONS *
1328 **************************/
1330 r00
= _mm_mul_ps(rsq00
,rinv00
);
1332 /* EWALD ELECTROSTATICS */
1334 /* Analytical PME correction */
1335 zeta2
= _mm_mul_ps(beta2
,rsq00
);
1336 rinv3
= _mm_mul_ps(rinvsq00
,rinv00
);
1337 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1338 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1339 felec
= _mm_mul_ps(qq00
,felec
);
1341 /* LENNARD-JONES DISPERSION/REPULSION */
1343 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1344 fvdw
= _mm_mul_ps(_mm_msub_ps(c12_00
,rinvsix
,c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1346 fscal
= _mm_add_ps(felec
,fvdw
);
1348 /* Update vectorial force */
1349 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1350 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1351 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1353 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1354 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1355 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1357 /**************************
1358 * CALCULATE INTERACTIONS *
1359 **************************/
1361 r01
= _mm_mul_ps(rsq01
,rinv01
);
1363 /* EWALD ELECTROSTATICS */
1365 /* Analytical PME correction */
1366 zeta2
= _mm_mul_ps(beta2
,rsq01
);
1367 rinv3
= _mm_mul_ps(rinvsq01
,rinv01
);
1368 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1369 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1370 felec
= _mm_mul_ps(qq01
,felec
);
1374 /* Update vectorial force */
1375 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
1376 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
1377 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
1379 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
1380 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
1381 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
1383 /**************************
1384 * CALCULATE INTERACTIONS *
1385 **************************/
1387 r02
= _mm_mul_ps(rsq02
,rinv02
);
1389 /* EWALD ELECTROSTATICS */
1391 /* Analytical PME correction */
1392 zeta2
= _mm_mul_ps(beta2
,rsq02
);
1393 rinv3
= _mm_mul_ps(rinvsq02
,rinv02
);
1394 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1395 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1396 felec
= _mm_mul_ps(qq02
,felec
);
1400 /* Update vectorial force */
1401 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
1402 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
1403 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
1405 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
1406 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
1407 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
1409 /**************************
1410 * CALCULATE INTERACTIONS *
1411 **************************/
1413 r10
= _mm_mul_ps(rsq10
,rinv10
);
1415 /* EWALD ELECTROSTATICS */
1417 /* Analytical PME correction */
1418 zeta2
= _mm_mul_ps(beta2
,rsq10
);
1419 rinv3
= _mm_mul_ps(rinvsq10
,rinv10
);
1420 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1421 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1422 felec
= _mm_mul_ps(qq10
,felec
);
1426 /* Update vectorial force */
1427 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
1428 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
1429 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
1431 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
1432 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
1433 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 r11
= _mm_mul_ps(rsq11
,rinv11
);
1441 /* EWALD ELECTROSTATICS */
1443 /* Analytical PME correction */
1444 zeta2
= _mm_mul_ps(beta2
,rsq11
);
1445 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
1446 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1447 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1448 felec
= _mm_mul_ps(qq11
,felec
);
1452 /* Update vectorial force */
1453 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1454 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1455 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1457 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1458 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1459 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1461 /**************************
1462 * CALCULATE INTERACTIONS *
1463 **************************/
1465 r12
= _mm_mul_ps(rsq12
,rinv12
);
1467 /* EWALD ELECTROSTATICS */
1469 /* Analytical PME correction */
1470 zeta2
= _mm_mul_ps(beta2
,rsq12
);
1471 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
1472 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1473 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1474 felec
= _mm_mul_ps(qq12
,felec
);
1478 /* Update vectorial force */
1479 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1480 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1481 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1483 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1484 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1485 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 r20
= _mm_mul_ps(rsq20
,rinv20
);
1493 /* EWALD ELECTROSTATICS */
1495 /* Analytical PME correction */
1496 zeta2
= _mm_mul_ps(beta2
,rsq20
);
1497 rinv3
= _mm_mul_ps(rinvsq20
,rinv20
);
1498 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1499 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1500 felec
= _mm_mul_ps(qq20
,felec
);
1504 /* Update vectorial force */
1505 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
1506 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
1507 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
1509 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
1510 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
1511 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
1513 /**************************
1514 * CALCULATE INTERACTIONS *
1515 **************************/
1517 r21
= _mm_mul_ps(rsq21
,rinv21
);
1519 /* EWALD ELECTROSTATICS */
1521 /* Analytical PME correction */
1522 zeta2
= _mm_mul_ps(beta2
,rsq21
);
1523 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
1524 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1525 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1526 felec
= _mm_mul_ps(qq21
,felec
);
1530 /* Update vectorial force */
1531 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1532 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1533 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1535 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1536 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1537 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 r22
= _mm_mul_ps(rsq22
,rinv22
);
1545 /* EWALD ELECTROSTATICS */
1547 /* Analytical PME correction */
1548 zeta2
= _mm_mul_ps(beta2
,rsq22
);
1549 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
1550 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1551 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1552 felec
= _mm_mul_ps(qq22
,felec
);
1556 /* Update vectorial force */
1557 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1558 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1559 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1561 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1562 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1563 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1565 fjptrA
= f
+j_coord_offsetA
;
1566 fjptrB
= f
+j_coord_offsetB
;
1567 fjptrC
= f
+j_coord_offsetC
;
1568 fjptrD
= f
+j_coord_offsetD
;
1570 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1571 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1573 /* Inner loop uses 259 flops */
1576 if(jidx
<j_index_end
)
1579 /* Get j neighbor index, and coordinate index */
1580 jnrlistA
= jjnr
[jidx
];
1581 jnrlistB
= jjnr
[jidx
+1];
1582 jnrlistC
= jjnr
[jidx
+2];
1583 jnrlistD
= jjnr
[jidx
+3];
1584 /* Sign of each element will be negative for non-real atoms.
1585 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1586 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1588 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1589 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1590 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1591 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1592 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1593 j_coord_offsetA
= DIM
*jnrA
;
1594 j_coord_offsetB
= DIM
*jnrB
;
1595 j_coord_offsetC
= DIM
*jnrC
;
1596 j_coord_offsetD
= DIM
*jnrD
;
1598 /* load j atom coordinates */
1599 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1600 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1601 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1603 /* Calculate displacement vector */
1604 dx00
= _mm_sub_ps(ix0
,jx0
);
1605 dy00
= _mm_sub_ps(iy0
,jy0
);
1606 dz00
= _mm_sub_ps(iz0
,jz0
);
1607 dx01
= _mm_sub_ps(ix0
,jx1
);
1608 dy01
= _mm_sub_ps(iy0
,jy1
);
1609 dz01
= _mm_sub_ps(iz0
,jz1
);
1610 dx02
= _mm_sub_ps(ix0
,jx2
);
1611 dy02
= _mm_sub_ps(iy0
,jy2
);
1612 dz02
= _mm_sub_ps(iz0
,jz2
);
1613 dx10
= _mm_sub_ps(ix1
,jx0
);
1614 dy10
= _mm_sub_ps(iy1
,jy0
);
1615 dz10
= _mm_sub_ps(iz1
,jz0
);
1616 dx11
= _mm_sub_ps(ix1
,jx1
);
1617 dy11
= _mm_sub_ps(iy1
,jy1
);
1618 dz11
= _mm_sub_ps(iz1
,jz1
);
1619 dx12
= _mm_sub_ps(ix1
,jx2
);
1620 dy12
= _mm_sub_ps(iy1
,jy2
);
1621 dz12
= _mm_sub_ps(iz1
,jz2
);
1622 dx20
= _mm_sub_ps(ix2
,jx0
);
1623 dy20
= _mm_sub_ps(iy2
,jy0
);
1624 dz20
= _mm_sub_ps(iz2
,jz0
);
1625 dx21
= _mm_sub_ps(ix2
,jx1
);
1626 dy21
= _mm_sub_ps(iy2
,jy1
);
1627 dz21
= _mm_sub_ps(iz2
,jz1
);
1628 dx22
= _mm_sub_ps(ix2
,jx2
);
1629 dy22
= _mm_sub_ps(iy2
,jy2
);
1630 dz22
= _mm_sub_ps(iz2
,jz2
);
1632 /* Calculate squared distance and things based on it */
1633 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1634 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1635 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1636 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1637 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1638 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1639 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1640 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1641 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1643 rinv00
= avx128fma_invsqrt_f(rsq00
);
1644 rinv01
= avx128fma_invsqrt_f(rsq01
);
1645 rinv02
= avx128fma_invsqrt_f(rsq02
);
1646 rinv10
= avx128fma_invsqrt_f(rsq10
);
1647 rinv11
= avx128fma_invsqrt_f(rsq11
);
1648 rinv12
= avx128fma_invsqrt_f(rsq12
);
1649 rinv20
= avx128fma_invsqrt_f(rsq20
);
1650 rinv21
= avx128fma_invsqrt_f(rsq21
);
1651 rinv22
= avx128fma_invsqrt_f(rsq22
);
1653 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1654 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1655 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1656 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1657 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1658 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1659 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1660 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1661 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1663 fjx0
= _mm_setzero_ps();
1664 fjy0
= _mm_setzero_ps();
1665 fjz0
= _mm_setzero_ps();
1666 fjx1
= _mm_setzero_ps();
1667 fjy1
= _mm_setzero_ps();
1668 fjz1
= _mm_setzero_ps();
1669 fjx2
= _mm_setzero_ps();
1670 fjy2
= _mm_setzero_ps();
1671 fjz2
= _mm_setzero_ps();
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1677 r00
= _mm_mul_ps(rsq00
,rinv00
);
1678 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1680 /* EWALD ELECTROSTATICS */
1682 /* Analytical PME correction */
1683 zeta2
= _mm_mul_ps(beta2
,rsq00
);
1684 rinv3
= _mm_mul_ps(rinvsq00
,rinv00
);
1685 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1686 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1687 felec
= _mm_mul_ps(qq00
,felec
);
1689 /* LENNARD-JONES DISPERSION/REPULSION */
1691 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1692 fvdw
= _mm_mul_ps(_mm_msub_ps(c12_00
,rinvsix
,c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1694 fscal
= _mm_add_ps(felec
,fvdw
);
1696 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1698 /* Update vectorial force */
1699 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1700 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1701 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1703 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1704 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1705 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 r01
= _mm_mul_ps(rsq01
,rinv01
);
1712 r01
= _mm_andnot_ps(dummy_mask
,r01
);
1714 /* EWALD ELECTROSTATICS */
1716 /* Analytical PME correction */
1717 zeta2
= _mm_mul_ps(beta2
,rsq01
);
1718 rinv3
= _mm_mul_ps(rinvsq01
,rinv01
);
1719 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1720 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1721 felec
= _mm_mul_ps(qq01
,felec
);
1725 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1727 /* Update vectorial force */
1728 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
1729 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
1730 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
1732 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
1733 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
1734 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
1736 /**************************
1737 * CALCULATE INTERACTIONS *
1738 **************************/
1740 r02
= _mm_mul_ps(rsq02
,rinv02
);
1741 r02
= _mm_andnot_ps(dummy_mask
,r02
);
1743 /* EWALD ELECTROSTATICS */
1745 /* Analytical PME correction */
1746 zeta2
= _mm_mul_ps(beta2
,rsq02
);
1747 rinv3
= _mm_mul_ps(rinvsq02
,rinv02
);
1748 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1749 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1750 felec
= _mm_mul_ps(qq02
,felec
);
1754 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1756 /* Update vectorial force */
1757 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
1758 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
1759 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
1761 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
1762 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
1763 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
1765 /**************************
1766 * CALCULATE INTERACTIONS *
1767 **************************/
1769 r10
= _mm_mul_ps(rsq10
,rinv10
);
1770 r10
= _mm_andnot_ps(dummy_mask
,r10
);
1772 /* EWALD ELECTROSTATICS */
1774 /* Analytical PME correction */
1775 zeta2
= _mm_mul_ps(beta2
,rsq10
);
1776 rinv3
= _mm_mul_ps(rinvsq10
,rinv10
);
1777 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1778 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1779 felec
= _mm_mul_ps(qq10
,felec
);
1783 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1785 /* Update vectorial force */
1786 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
1787 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
1788 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
1790 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
1791 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
1792 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r11
= _mm_mul_ps(rsq11
,rinv11
);
1799 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1801 /* EWALD ELECTROSTATICS */
1803 /* Analytical PME correction */
1804 zeta2
= _mm_mul_ps(beta2
,rsq11
);
1805 rinv3
= _mm_mul_ps(rinvsq11
,rinv11
);
1806 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1807 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1808 felec
= _mm_mul_ps(qq11
,felec
);
1812 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1814 /* Update vectorial force */
1815 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1816 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1817 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1819 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1820 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1821 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1823 /**************************
1824 * CALCULATE INTERACTIONS *
1825 **************************/
1827 r12
= _mm_mul_ps(rsq12
,rinv12
);
1828 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1830 /* EWALD ELECTROSTATICS */
1832 /* Analytical PME correction */
1833 zeta2
= _mm_mul_ps(beta2
,rsq12
);
1834 rinv3
= _mm_mul_ps(rinvsq12
,rinv12
);
1835 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1836 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1837 felec
= _mm_mul_ps(qq12
,felec
);
1841 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1843 /* Update vectorial force */
1844 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1845 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1846 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1848 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1849 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1850 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 r20
= _mm_mul_ps(rsq20
,rinv20
);
1857 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1859 /* EWALD ELECTROSTATICS */
1861 /* Analytical PME correction */
1862 zeta2
= _mm_mul_ps(beta2
,rsq20
);
1863 rinv3
= _mm_mul_ps(rinvsq20
,rinv20
);
1864 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1865 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1866 felec
= _mm_mul_ps(qq20
,felec
);
1870 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1872 /* Update vectorial force */
1873 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
1874 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
1875 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
1877 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
1878 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
1879 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
1881 /**************************
1882 * CALCULATE INTERACTIONS *
1883 **************************/
1885 r21
= _mm_mul_ps(rsq21
,rinv21
);
1886 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1888 /* EWALD ELECTROSTATICS */
1890 /* Analytical PME correction */
1891 zeta2
= _mm_mul_ps(beta2
,rsq21
);
1892 rinv3
= _mm_mul_ps(rinvsq21
,rinv21
);
1893 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1894 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1895 felec
= _mm_mul_ps(qq21
,felec
);
1899 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1901 /* Update vectorial force */
1902 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1903 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1904 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1906 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1907 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1908 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1910 /**************************
1911 * CALCULATE INTERACTIONS *
1912 **************************/
1914 r22
= _mm_mul_ps(rsq22
,rinv22
);
1915 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1917 /* EWALD ELECTROSTATICS */
1919 /* Analytical PME correction */
1920 zeta2
= _mm_mul_ps(beta2
,rsq22
);
1921 rinv3
= _mm_mul_ps(rinvsq22
,rinv22
);
1922 pmecorrF
= avx128fma_pmecorrF_f(zeta2
);
1923 felec
= _mm_macc_ps(pmecorrF
,beta3
,rinv3
);
1924 felec
= _mm_mul_ps(qq22
,felec
);
1928 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1930 /* Update vectorial force */
1931 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1932 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1933 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1935 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1936 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1937 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1939 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1940 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1941 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1942 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1944 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1945 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1947 /* Inner loop uses 268 flops */
1950 /* End of innermost loop */
1952 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1953 f
+i_coord_offset
,fshift
+i_shift_offset
);
1955 /* Increment number of inner iterations */
1956 inneriter
+= j_index_end
- j_index_start
;
1958 /* Outer loop uses 18 flops */
1961 /* Increment number of outer iterations */
1964 /* Update outer/inner flops */
1966 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*268);