/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRF_VdwLJ_GeomW4W4_VF_avx_128_fma_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
97 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
98 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
105 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
106 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
107 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
108 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
111 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
114 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
116 __m128 dummy_mask
,cutoff_mask
;
117 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one
= _mm_set1_ps(1.0);
119 __m128 two
= _mm_set1_ps(2.0);
125 jindex
= nlist
->jindex
;
127 shiftidx
= nlist
->shift
;
129 shiftvec
= fr
->shift_vec
[0];
130 fshift
= fr
->fshift
[0];
131 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
132 charge
= mdatoms
->chargeA
;
133 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
134 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
135 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
136 nvdwtype
= fr
->ntype
;
138 vdwtype
= mdatoms
->typeA
;
140 /* Setup water-specific parameters */
141 inr
= nlist
->iinr
[0];
142 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
143 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
144 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
145 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
147 jq1
= _mm_set1_ps(charge
[inr
+1]);
148 jq2
= _mm_set1_ps(charge
[inr
+2]);
149 jq3
= _mm_set1_ps(charge
[inr
+3]);
150 vdwjidx0A
= 2*vdwtype
[inr
+0];
151 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
152 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
153 qq11
= _mm_mul_ps(iq1
,jq1
);
154 qq12
= _mm_mul_ps(iq1
,jq2
);
155 qq13
= _mm_mul_ps(iq1
,jq3
);
156 qq21
= _mm_mul_ps(iq2
,jq1
);
157 qq22
= _mm_mul_ps(iq2
,jq2
);
158 qq23
= _mm_mul_ps(iq2
,jq3
);
159 qq31
= _mm_mul_ps(iq3
,jq1
);
160 qq32
= _mm_mul_ps(iq3
,jq2
);
161 qq33
= _mm_mul_ps(iq3
,jq3
);
163 /* Avoid stupid compiler warnings */
164 jnrA
= jnrB
= jnrC
= jnrD
= 0;
173 for(iidx
=0;iidx
<4*DIM
;iidx
++)
178 /* Start outer loop over neighborlists */
179 for(iidx
=0; iidx
<nri
; iidx
++)
181 /* Load shift vector for this list */
182 i_shift_offset
= DIM
*shiftidx
[iidx
];
184 /* Load limits for loop over neighbors */
185 j_index_start
= jindex
[iidx
];
186 j_index_end
= jindex
[iidx
+1];
188 /* Get outer coordinate index */
190 i_coord_offset
= DIM
*inr
;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
194 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
196 fix0
= _mm_setzero_ps();
197 fiy0
= _mm_setzero_ps();
198 fiz0
= _mm_setzero_ps();
199 fix1
= _mm_setzero_ps();
200 fiy1
= _mm_setzero_ps();
201 fiz1
= _mm_setzero_ps();
202 fix2
= _mm_setzero_ps();
203 fiy2
= _mm_setzero_ps();
204 fiz2
= _mm_setzero_ps();
205 fix3
= _mm_setzero_ps();
206 fiy3
= _mm_setzero_ps();
207 fiz3
= _mm_setzero_ps();
209 /* Reset potential sums */
210 velecsum
= _mm_setzero_ps();
211 vvdwsum
= _mm_setzero_ps();
213 /* Start inner kernel loop */
214 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
217 /* Get j neighbor index, and coordinate index */
222 j_coord_offsetA
= DIM
*jnrA
;
223 j_coord_offsetB
= DIM
*jnrB
;
224 j_coord_offsetC
= DIM
*jnrC
;
225 j_coord_offsetD
= DIM
*jnrD
;
227 /* load j atom coordinates */
228 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
229 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
230 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
231 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
233 /* Calculate displacement vector */
234 dx00
= _mm_sub_ps(ix0
,jx0
);
235 dy00
= _mm_sub_ps(iy0
,jy0
);
236 dz00
= _mm_sub_ps(iz0
,jz0
);
237 dx11
= _mm_sub_ps(ix1
,jx1
);
238 dy11
= _mm_sub_ps(iy1
,jy1
);
239 dz11
= _mm_sub_ps(iz1
,jz1
);
240 dx12
= _mm_sub_ps(ix1
,jx2
);
241 dy12
= _mm_sub_ps(iy1
,jy2
);
242 dz12
= _mm_sub_ps(iz1
,jz2
);
243 dx13
= _mm_sub_ps(ix1
,jx3
);
244 dy13
= _mm_sub_ps(iy1
,jy3
);
245 dz13
= _mm_sub_ps(iz1
,jz3
);
246 dx21
= _mm_sub_ps(ix2
,jx1
);
247 dy21
= _mm_sub_ps(iy2
,jy1
);
248 dz21
= _mm_sub_ps(iz2
,jz1
);
249 dx22
= _mm_sub_ps(ix2
,jx2
);
250 dy22
= _mm_sub_ps(iy2
,jy2
);
251 dz22
= _mm_sub_ps(iz2
,jz2
);
252 dx23
= _mm_sub_ps(ix2
,jx3
);
253 dy23
= _mm_sub_ps(iy2
,jy3
);
254 dz23
= _mm_sub_ps(iz2
,jz3
);
255 dx31
= _mm_sub_ps(ix3
,jx1
);
256 dy31
= _mm_sub_ps(iy3
,jy1
);
257 dz31
= _mm_sub_ps(iz3
,jz1
);
258 dx32
= _mm_sub_ps(ix3
,jx2
);
259 dy32
= _mm_sub_ps(iy3
,jy2
);
260 dz32
= _mm_sub_ps(iz3
,jz2
);
261 dx33
= _mm_sub_ps(ix3
,jx3
);
262 dy33
= _mm_sub_ps(iy3
,jy3
);
263 dz33
= _mm_sub_ps(iz3
,jz3
);
265 /* Calculate squared distance and things based on it */
266 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
267 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
268 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
269 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
270 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
271 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
272 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
273 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
274 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
275 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
277 rinv11
= avx128fma_invsqrt_f(rsq11
);
278 rinv12
= avx128fma_invsqrt_f(rsq12
);
279 rinv13
= avx128fma_invsqrt_f(rsq13
);
280 rinv21
= avx128fma_invsqrt_f(rsq21
);
281 rinv22
= avx128fma_invsqrt_f(rsq22
);
282 rinv23
= avx128fma_invsqrt_f(rsq23
);
283 rinv31
= avx128fma_invsqrt_f(rsq31
);
284 rinv32
= avx128fma_invsqrt_f(rsq32
);
285 rinv33
= avx128fma_invsqrt_f(rsq33
);
287 rinvsq00
= avx128fma_inv_f(rsq00
);
288 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
289 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
290 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
291 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
292 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
293 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
294 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
295 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
296 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
298 fjx0
= _mm_setzero_ps();
299 fjy0
= _mm_setzero_ps();
300 fjz0
= _mm_setzero_ps();
301 fjx1
= _mm_setzero_ps();
302 fjy1
= _mm_setzero_ps();
303 fjz1
= _mm_setzero_ps();
304 fjx2
= _mm_setzero_ps();
305 fjy2
= _mm_setzero_ps();
306 fjz2
= _mm_setzero_ps();
307 fjx3
= _mm_setzero_ps();
308 fjy3
= _mm_setzero_ps();
309 fjz3
= _mm_setzero_ps();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 /* LENNARD-JONES DISPERSION/REPULSION */
317 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
318 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
319 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
320 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
321 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
323 /* Update potential sum for this i atom from the interaction with this j atom. */
324 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
328 /* Update vectorial force */
329 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
330 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
331 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
333 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
334 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
335 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 /* REACTION-FIELD ELECTROSTATICS */
342 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_macc_ps(krf
,rsq11
,rinv11
),crf
));
343 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velecsum
= _mm_add_ps(velecsum
,velec
);
350 /* Update vectorial force */
351 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
352 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
353 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
355 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
356 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
357 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 /* REACTION-FIELD ELECTROSTATICS */
364 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_macc_ps(krf
,rsq12
,rinv12
),crf
));
365 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velecsum
= _mm_add_ps(velecsum
,velec
);
372 /* Update vectorial force */
373 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
374 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
375 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
377 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
378 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
379 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 /* REACTION-FIELD ELECTROSTATICS */
386 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_macc_ps(krf
,rsq13
,rinv13
),crf
));
387 felec
= _mm_mul_ps(qq13
,_mm_msub_ps(rinv13
,rinvsq13
,krf2
));
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum
= _mm_add_ps(velecsum
,velec
);
394 /* Update vectorial force */
395 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
396 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
397 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
399 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
400 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
401 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
403 /**************************
404 * CALCULATE INTERACTIONS *
405 **************************/
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_macc_ps(krf
,rsq21
,rinv21
),crf
));
409 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velecsum
= _mm_add_ps(velecsum
,velec
);
416 /* Update vectorial force */
417 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
418 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
419 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
421 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
422 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
423 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 /* REACTION-FIELD ELECTROSTATICS */
430 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_macc_ps(krf
,rsq22
,rinv22
),crf
));
431 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum
= _mm_add_ps(velecsum
,velec
);
438 /* Update vectorial force */
439 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
440 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
441 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
443 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
444 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
445 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 /* REACTION-FIELD ELECTROSTATICS */
452 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_macc_ps(krf
,rsq23
,rinv23
),crf
));
453 felec
= _mm_mul_ps(qq23
,_mm_msub_ps(rinv23
,rinvsq23
,krf2
));
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velecsum
= _mm_add_ps(velecsum
,velec
);
460 /* Update vectorial force */
461 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
462 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
463 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
465 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
466 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
467 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
469 /**************************
470 * CALCULATE INTERACTIONS *
471 **************************/
473 /* REACTION-FIELD ELECTROSTATICS */
474 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_macc_ps(krf
,rsq31
,rinv31
),crf
));
475 felec
= _mm_mul_ps(qq31
,_mm_msub_ps(rinv31
,rinvsq31
,krf2
));
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velecsum
= _mm_add_ps(velecsum
,velec
);
482 /* Update vectorial force */
483 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
484 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
485 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
487 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
488 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
489 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 /* REACTION-FIELD ELECTROSTATICS */
496 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_macc_ps(krf
,rsq32
,rinv32
),crf
));
497 felec
= _mm_mul_ps(qq32
,_mm_msub_ps(rinv32
,rinvsq32
,krf2
));
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum
= _mm_add_ps(velecsum
,velec
);
504 /* Update vectorial force */
505 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
506 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
507 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
509 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
510 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
511 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 /* REACTION-FIELD ELECTROSTATICS */
518 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_macc_ps(krf
,rsq33
,rinv33
),crf
));
519 felec
= _mm_mul_ps(qq33
,_mm_msub_ps(rinv33
,rinvsq33
,krf2
));
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velecsum
= _mm_add_ps(velecsum
,velec
);
526 /* Update vectorial force */
527 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
528 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
529 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
531 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
532 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
533 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
535 fjptrA
= f
+j_coord_offsetA
;
536 fjptrB
= f
+j_coord_offsetB
;
537 fjptrC
= f
+j_coord_offsetC
;
538 fjptrD
= f
+j_coord_offsetD
;
540 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
541 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
542 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
544 /* Inner loop uses 353 flops */
550 /* Get j neighbor index, and coordinate index */
551 jnrlistA
= jjnr
[jidx
];
552 jnrlistB
= jjnr
[jidx
+1];
553 jnrlistC
= jjnr
[jidx
+2];
554 jnrlistD
= jjnr
[jidx
+3];
555 /* Sign of each element will be negative for non-real atoms.
556 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
557 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
559 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
560 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
561 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
562 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
563 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
564 j_coord_offsetA
= DIM
*jnrA
;
565 j_coord_offsetB
= DIM
*jnrB
;
566 j_coord_offsetC
= DIM
*jnrC
;
567 j_coord_offsetD
= DIM
*jnrD
;
569 /* load j atom coordinates */
570 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
571 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
572 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
573 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
575 /* Calculate displacement vector */
576 dx00
= _mm_sub_ps(ix0
,jx0
);
577 dy00
= _mm_sub_ps(iy0
,jy0
);
578 dz00
= _mm_sub_ps(iz0
,jz0
);
579 dx11
= _mm_sub_ps(ix1
,jx1
);
580 dy11
= _mm_sub_ps(iy1
,jy1
);
581 dz11
= _mm_sub_ps(iz1
,jz1
);
582 dx12
= _mm_sub_ps(ix1
,jx2
);
583 dy12
= _mm_sub_ps(iy1
,jy2
);
584 dz12
= _mm_sub_ps(iz1
,jz2
);
585 dx13
= _mm_sub_ps(ix1
,jx3
);
586 dy13
= _mm_sub_ps(iy1
,jy3
);
587 dz13
= _mm_sub_ps(iz1
,jz3
);
588 dx21
= _mm_sub_ps(ix2
,jx1
);
589 dy21
= _mm_sub_ps(iy2
,jy1
);
590 dz21
= _mm_sub_ps(iz2
,jz1
);
591 dx22
= _mm_sub_ps(ix2
,jx2
);
592 dy22
= _mm_sub_ps(iy2
,jy2
);
593 dz22
= _mm_sub_ps(iz2
,jz2
);
594 dx23
= _mm_sub_ps(ix2
,jx3
);
595 dy23
= _mm_sub_ps(iy2
,jy3
);
596 dz23
= _mm_sub_ps(iz2
,jz3
);
597 dx31
= _mm_sub_ps(ix3
,jx1
);
598 dy31
= _mm_sub_ps(iy3
,jy1
);
599 dz31
= _mm_sub_ps(iz3
,jz1
);
600 dx32
= _mm_sub_ps(ix3
,jx2
);
601 dy32
= _mm_sub_ps(iy3
,jy2
);
602 dz32
= _mm_sub_ps(iz3
,jz2
);
603 dx33
= _mm_sub_ps(ix3
,jx3
);
604 dy33
= _mm_sub_ps(iy3
,jy3
);
605 dz33
= _mm_sub_ps(iz3
,jz3
);
607 /* Calculate squared distance and things based on it */
608 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
609 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
610 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
611 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
612 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
613 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
614 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
615 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
616 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
617 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
619 rinv11
= avx128fma_invsqrt_f(rsq11
);
620 rinv12
= avx128fma_invsqrt_f(rsq12
);
621 rinv13
= avx128fma_invsqrt_f(rsq13
);
622 rinv21
= avx128fma_invsqrt_f(rsq21
);
623 rinv22
= avx128fma_invsqrt_f(rsq22
);
624 rinv23
= avx128fma_invsqrt_f(rsq23
);
625 rinv31
= avx128fma_invsqrt_f(rsq31
);
626 rinv32
= avx128fma_invsqrt_f(rsq32
);
627 rinv33
= avx128fma_invsqrt_f(rsq33
);
629 rinvsq00
= avx128fma_inv_f(rsq00
);
630 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
631 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
632 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
633 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
634 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
635 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
636 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
637 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
638 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
640 fjx0
= _mm_setzero_ps();
641 fjy0
= _mm_setzero_ps();
642 fjz0
= _mm_setzero_ps();
643 fjx1
= _mm_setzero_ps();
644 fjy1
= _mm_setzero_ps();
645 fjz1
= _mm_setzero_ps();
646 fjx2
= _mm_setzero_ps();
647 fjy2
= _mm_setzero_ps();
648 fjz2
= _mm_setzero_ps();
649 fjx3
= _mm_setzero_ps();
650 fjy3
= _mm_setzero_ps();
651 fjz3
= _mm_setzero_ps();
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 /* LENNARD-JONES DISPERSION/REPULSION */
659 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
660 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
661 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
662 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
663 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
665 /* Update potential sum for this i atom from the interaction with this j atom. */
666 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
667 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
671 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
673 /* Update vectorial force */
674 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
675 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
676 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
678 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
679 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
680 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 /* REACTION-FIELD ELECTROSTATICS */
687 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_macc_ps(krf
,rsq11
,rinv11
),crf
));
688 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec
= _mm_andnot_ps(dummy_mask
,velec
);
692 velecsum
= _mm_add_ps(velecsum
,velec
);
696 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
698 /* Update vectorial force */
699 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
700 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
701 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
703 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
704 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
705 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
707 /**************************
708 * CALCULATE INTERACTIONS *
709 **************************/
711 /* REACTION-FIELD ELECTROSTATICS */
712 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_macc_ps(krf
,rsq12
,rinv12
),crf
));
713 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
715 /* Update potential sum for this i atom from the interaction with this j atom. */
716 velec
= _mm_andnot_ps(dummy_mask
,velec
);
717 velecsum
= _mm_add_ps(velecsum
,velec
);
721 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
723 /* Update vectorial force */
724 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
725 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
726 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
728 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
729 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
730 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
732 /**************************
733 * CALCULATE INTERACTIONS *
734 **************************/
736 /* REACTION-FIELD ELECTROSTATICS */
737 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_macc_ps(krf
,rsq13
,rinv13
),crf
));
738 felec
= _mm_mul_ps(qq13
,_mm_msub_ps(rinv13
,rinvsq13
,krf2
));
740 /* Update potential sum for this i atom from the interaction with this j atom. */
741 velec
= _mm_andnot_ps(dummy_mask
,velec
);
742 velecsum
= _mm_add_ps(velecsum
,velec
);
746 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
748 /* Update vectorial force */
749 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
750 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
751 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
753 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
754 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
755 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
757 /**************************
758 * CALCULATE INTERACTIONS *
759 **************************/
761 /* REACTION-FIELD ELECTROSTATICS */
762 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_macc_ps(krf
,rsq21
,rinv21
),crf
));
763 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec
= _mm_andnot_ps(dummy_mask
,velec
);
767 velecsum
= _mm_add_ps(velecsum
,velec
);
771 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
773 /* Update vectorial force */
774 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
775 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
776 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
778 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
779 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
780 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
782 /**************************
783 * CALCULATE INTERACTIONS *
784 **************************/
786 /* REACTION-FIELD ELECTROSTATICS */
787 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_macc_ps(krf
,rsq22
,rinv22
),crf
));
788 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
790 /* Update potential sum for this i atom from the interaction with this j atom. */
791 velec
= _mm_andnot_ps(dummy_mask
,velec
);
792 velecsum
= _mm_add_ps(velecsum
,velec
);
796 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
798 /* Update vectorial force */
799 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
800 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
801 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
803 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
804 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
805 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
807 /**************************
808 * CALCULATE INTERACTIONS *
809 **************************/
811 /* REACTION-FIELD ELECTROSTATICS */
812 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_macc_ps(krf
,rsq23
,rinv23
),crf
));
813 felec
= _mm_mul_ps(qq23
,_mm_msub_ps(rinv23
,rinvsq23
,krf2
));
815 /* Update potential sum for this i atom from the interaction with this j atom. */
816 velec
= _mm_andnot_ps(dummy_mask
,velec
);
817 velecsum
= _mm_add_ps(velecsum
,velec
);
821 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
823 /* Update vectorial force */
824 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
825 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
826 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
828 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
829 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
830 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* REACTION-FIELD ELECTROSTATICS */
837 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_macc_ps(krf
,rsq31
,rinv31
),crf
));
838 felec
= _mm_mul_ps(qq31
,_mm_msub_ps(rinv31
,rinvsq31
,krf2
));
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec
= _mm_andnot_ps(dummy_mask
,velec
);
842 velecsum
= _mm_add_ps(velecsum
,velec
);
846 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
848 /* Update vectorial force */
849 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
850 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
851 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
853 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
854 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
855 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
857 /**************************
858 * CALCULATE INTERACTIONS *
859 **************************/
861 /* REACTION-FIELD ELECTROSTATICS */
862 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_macc_ps(krf
,rsq32
,rinv32
),crf
));
863 felec
= _mm_mul_ps(qq32
,_mm_msub_ps(rinv32
,rinvsq32
,krf2
));
865 /* Update potential sum for this i atom from the interaction with this j atom. */
866 velec
= _mm_andnot_ps(dummy_mask
,velec
);
867 velecsum
= _mm_add_ps(velecsum
,velec
);
871 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
873 /* Update vectorial force */
874 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
875 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
876 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
878 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
879 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
880 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
882 /**************************
883 * CALCULATE INTERACTIONS *
884 **************************/
886 /* REACTION-FIELD ELECTROSTATICS */
887 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_macc_ps(krf
,rsq33
,rinv33
),crf
));
888 felec
= _mm_mul_ps(qq33
,_mm_msub_ps(rinv33
,rinvsq33
,krf2
));
890 /* Update potential sum for this i atom from the interaction with this j atom. */
891 velec
= _mm_andnot_ps(dummy_mask
,velec
);
892 velecsum
= _mm_add_ps(velecsum
,velec
);
896 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
898 /* Update vectorial force */
899 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
900 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
901 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
903 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
904 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
905 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
907 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
908 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
909 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
910 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
912 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
913 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
914 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
916 /* Inner loop uses 353 flops */
919 /* End of innermost loop */
921 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
922 f
+i_coord_offset
,fshift
+i_shift_offset
);
925 /* Update potential energies */
926 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
927 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
929 /* Increment number of inner iterations */
930 inneriter
+= j_index_end
- j_index_start
;
932 /* Outer loop uses 26 flops */
935 /* Increment number of outer iterations */
938 /* Update outer/inner flops */
940 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*353);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_single
 * Electrostatics interaction: ReactionField
 * VdW interaction:            LennardJones
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        Force
 */
950 nb_kernel_ElecRF_VdwLJ_GeomW4W4_F_avx_128_fma_single
951 (t_nblist
* gmx_restrict nlist
,
952 rvec
* gmx_restrict xx
,
953 rvec
* gmx_restrict ff
,
954 struct t_forcerec
* gmx_restrict fr
,
955 t_mdatoms
* gmx_restrict mdatoms
,
956 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
957 t_nrnb
* gmx_restrict nrnb
)
959 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
960 * just 0 for non-waters.
961 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
962 * jnr indices corresponding to data put in the four positions in the SIMD register.
964 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
965 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
966 int jnrA
,jnrB
,jnrC
,jnrD
;
967 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
968 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
969 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
971 real
*shiftvec
,*fshift
,*x
,*f
;
972 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
974 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
976 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
978 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
980 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
982 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
983 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
984 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
985 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
986 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
987 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
988 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
989 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
990 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
991 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
992 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
993 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
994 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
995 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
996 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
997 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
998 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
999 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1000 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1001 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1004 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1007 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1008 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1009 __m128 dummy_mask
,cutoff_mask
;
1010 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1011 __m128 one
= _mm_set1_ps(1.0);
1012 __m128 two
= _mm_set1_ps(2.0);
1018 jindex
= nlist
->jindex
;
1020 shiftidx
= nlist
->shift
;
1022 shiftvec
= fr
->shift_vec
[0];
1023 fshift
= fr
->fshift
[0];
1024 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1025 charge
= mdatoms
->chargeA
;
1026 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
1027 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
1028 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
1029 nvdwtype
= fr
->ntype
;
1030 vdwparam
= fr
->nbfp
;
1031 vdwtype
= mdatoms
->typeA
;
1033 /* Setup water-specific parameters */
1034 inr
= nlist
->iinr
[0];
1035 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1036 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1037 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1038 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1040 jq1
= _mm_set1_ps(charge
[inr
+1]);
1041 jq2
= _mm_set1_ps(charge
[inr
+2]);
1042 jq3
= _mm_set1_ps(charge
[inr
+3]);
1043 vdwjidx0A
= 2*vdwtype
[inr
+0];
1044 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1045 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1046 qq11
= _mm_mul_ps(iq1
,jq1
);
1047 qq12
= _mm_mul_ps(iq1
,jq2
);
1048 qq13
= _mm_mul_ps(iq1
,jq3
);
1049 qq21
= _mm_mul_ps(iq2
,jq1
);
1050 qq22
= _mm_mul_ps(iq2
,jq2
);
1051 qq23
= _mm_mul_ps(iq2
,jq3
);
1052 qq31
= _mm_mul_ps(iq3
,jq1
);
1053 qq32
= _mm_mul_ps(iq3
,jq2
);
1054 qq33
= _mm_mul_ps(iq3
,jq3
);
1056 /* Avoid stupid compiler warnings */
1057 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1058 j_coord_offsetA
= 0;
1059 j_coord_offsetB
= 0;
1060 j_coord_offsetC
= 0;
1061 j_coord_offsetD
= 0;
1066 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1068 scratch
[iidx
] = 0.0;
1071 /* Start outer loop over neighborlists */
1072 for(iidx
=0; iidx
<nri
; iidx
++)
1074 /* Load shift vector for this list */
1075 i_shift_offset
= DIM
*shiftidx
[iidx
];
1077 /* Load limits for loop over neighbors */
1078 j_index_start
= jindex
[iidx
];
1079 j_index_end
= jindex
[iidx
+1];
1081 /* Get outer coordinate index */
1083 i_coord_offset
= DIM
*inr
;
1085 /* Load i particle coords and add shift vector */
1086 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1087 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1089 fix0
= _mm_setzero_ps();
1090 fiy0
= _mm_setzero_ps();
1091 fiz0
= _mm_setzero_ps();
1092 fix1
= _mm_setzero_ps();
1093 fiy1
= _mm_setzero_ps();
1094 fiz1
= _mm_setzero_ps();
1095 fix2
= _mm_setzero_ps();
1096 fiy2
= _mm_setzero_ps();
1097 fiz2
= _mm_setzero_ps();
1098 fix3
= _mm_setzero_ps();
1099 fiy3
= _mm_setzero_ps();
1100 fiz3
= _mm_setzero_ps();
1102 /* Start inner kernel loop */
1103 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1106 /* Get j neighbor index, and coordinate index */
1108 jnrB
= jjnr
[jidx
+1];
1109 jnrC
= jjnr
[jidx
+2];
1110 jnrD
= jjnr
[jidx
+3];
1111 j_coord_offsetA
= DIM
*jnrA
;
1112 j_coord_offsetB
= DIM
*jnrB
;
1113 j_coord_offsetC
= DIM
*jnrC
;
1114 j_coord_offsetD
= DIM
*jnrD
;
1116 /* load j atom coordinates */
1117 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1118 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1119 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1120 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1122 /* Calculate displacement vector */
1123 dx00
= _mm_sub_ps(ix0
,jx0
);
1124 dy00
= _mm_sub_ps(iy0
,jy0
);
1125 dz00
= _mm_sub_ps(iz0
,jz0
);
1126 dx11
= _mm_sub_ps(ix1
,jx1
);
1127 dy11
= _mm_sub_ps(iy1
,jy1
);
1128 dz11
= _mm_sub_ps(iz1
,jz1
);
1129 dx12
= _mm_sub_ps(ix1
,jx2
);
1130 dy12
= _mm_sub_ps(iy1
,jy2
);
1131 dz12
= _mm_sub_ps(iz1
,jz2
);
1132 dx13
= _mm_sub_ps(ix1
,jx3
);
1133 dy13
= _mm_sub_ps(iy1
,jy3
);
1134 dz13
= _mm_sub_ps(iz1
,jz3
);
1135 dx21
= _mm_sub_ps(ix2
,jx1
);
1136 dy21
= _mm_sub_ps(iy2
,jy1
);
1137 dz21
= _mm_sub_ps(iz2
,jz1
);
1138 dx22
= _mm_sub_ps(ix2
,jx2
);
1139 dy22
= _mm_sub_ps(iy2
,jy2
);
1140 dz22
= _mm_sub_ps(iz2
,jz2
);
1141 dx23
= _mm_sub_ps(ix2
,jx3
);
1142 dy23
= _mm_sub_ps(iy2
,jy3
);
1143 dz23
= _mm_sub_ps(iz2
,jz3
);
1144 dx31
= _mm_sub_ps(ix3
,jx1
);
1145 dy31
= _mm_sub_ps(iy3
,jy1
);
1146 dz31
= _mm_sub_ps(iz3
,jz1
);
1147 dx32
= _mm_sub_ps(ix3
,jx2
);
1148 dy32
= _mm_sub_ps(iy3
,jy2
);
1149 dz32
= _mm_sub_ps(iz3
,jz2
);
1150 dx33
= _mm_sub_ps(ix3
,jx3
);
1151 dy33
= _mm_sub_ps(iy3
,jy3
);
1152 dz33
= _mm_sub_ps(iz3
,jz3
);
1154 /* Calculate squared distance and things based on it */
1155 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1156 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1157 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1158 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1159 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1160 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1161 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1162 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1163 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1164 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1166 rinv11
= avx128fma_invsqrt_f(rsq11
);
1167 rinv12
= avx128fma_invsqrt_f(rsq12
);
1168 rinv13
= avx128fma_invsqrt_f(rsq13
);
1169 rinv21
= avx128fma_invsqrt_f(rsq21
);
1170 rinv22
= avx128fma_invsqrt_f(rsq22
);
1171 rinv23
= avx128fma_invsqrt_f(rsq23
);
1172 rinv31
= avx128fma_invsqrt_f(rsq31
);
1173 rinv32
= avx128fma_invsqrt_f(rsq32
);
1174 rinv33
= avx128fma_invsqrt_f(rsq33
);
1176 rinvsq00
= avx128fma_inv_f(rsq00
);
1177 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1178 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1179 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1180 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1181 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1182 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1183 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1184 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1185 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1187 fjx0
= _mm_setzero_ps();
1188 fjy0
= _mm_setzero_ps();
1189 fjz0
= _mm_setzero_ps();
1190 fjx1
= _mm_setzero_ps();
1191 fjy1
= _mm_setzero_ps();
1192 fjz1
= _mm_setzero_ps();
1193 fjx2
= _mm_setzero_ps();
1194 fjy2
= _mm_setzero_ps();
1195 fjz2
= _mm_setzero_ps();
1196 fjx3
= _mm_setzero_ps();
1197 fjy3
= _mm_setzero_ps();
1198 fjz3
= _mm_setzero_ps();
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 /* LENNARD-JONES DISPERSION/REPULSION */
1206 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1207 fvdw
= _mm_mul_ps(_mm_msub_ps(c12_00
,rinvsix
,c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1211 /* Update vectorial force */
1212 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1213 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1214 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1216 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1217 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1218 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 /* REACTION-FIELD ELECTROSTATICS */
1225 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
1229 /* Update vectorial force */
1230 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1231 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1232 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1234 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1235 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1236 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1238 /**************************
1239 * CALCULATE INTERACTIONS *
1240 **************************/
1242 /* REACTION-FIELD ELECTROSTATICS */
1243 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
1247 /* Update vectorial force */
1248 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1249 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1250 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1252 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1253 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1254 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1256 /**************************
1257 * CALCULATE INTERACTIONS *
1258 **************************/
1260 /* REACTION-FIELD ELECTROSTATICS */
1261 felec
= _mm_mul_ps(qq13
,_mm_msub_ps(rinv13
,rinvsq13
,krf2
));
1265 /* Update vectorial force */
1266 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
1267 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
1268 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
1270 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
1271 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
1272 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 /* REACTION-FIELD ELECTROSTATICS */
1279 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
1283 /* Update vectorial force */
1284 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1285 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1286 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1288 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1289 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1290 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1292 /**************************
1293 * CALCULATE INTERACTIONS *
1294 **************************/
1296 /* REACTION-FIELD ELECTROSTATICS */
1297 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
1301 /* Update vectorial force */
1302 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1303 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1304 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1306 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1307 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1308 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1310 /**************************
1311 * CALCULATE INTERACTIONS *
1312 **************************/
1314 /* REACTION-FIELD ELECTROSTATICS */
1315 felec
= _mm_mul_ps(qq23
,_mm_msub_ps(rinv23
,rinvsq23
,krf2
));
1319 /* Update vectorial force */
1320 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
1321 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
1322 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
1324 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
1325 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
1326 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 /* REACTION-FIELD ELECTROSTATICS */
1333 felec
= _mm_mul_ps(qq31
,_mm_msub_ps(rinv31
,rinvsq31
,krf2
));
1337 /* Update vectorial force */
1338 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
1339 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
1340 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
1342 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
1343 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
1344 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
1346 /**************************
1347 * CALCULATE INTERACTIONS *
1348 **************************/
1350 /* REACTION-FIELD ELECTROSTATICS */
1351 felec
= _mm_mul_ps(qq32
,_mm_msub_ps(rinv32
,rinvsq32
,krf2
));
1355 /* Update vectorial force */
1356 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
1357 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
1358 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
1360 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
1361 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
1362 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
1364 /**************************
1365 * CALCULATE INTERACTIONS *
1366 **************************/
1368 /* REACTION-FIELD ELECTROSTATICS */
1369 felec
= _mm_mul_ps(qq33
,_mm_msub_ps(rinv33
,rinvsq33
,krf2
));
1373 /* Update vectorial force */
1374 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
1375 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
1376 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
1378 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
1379 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
1380 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
1382 fjptrA
= f
+j_coord_offsetA
;
1383 fjptrB
= f
+j_coord_offsetB
;
1384 fjptrC
= f
+j_coord_offsetC
;
1385 fjptrD
= f
+j_coord_offsetD
;
1387 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1388 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1389 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1391 /* Inner loop uses 303 flops */
1394 if(jidx
<j_index_end
)
1397 /* Get j neighbor index, and coordinate index */
1398 jnrlistA
= jjnr
[jidx
];
1399 jnrlistB
= jjnr
[jidx
+1];
1400 jnrlistC
= jjnr
[jidx
+2];
1401 jnrlistD
= jjnr
[jidx
+3];
1402 /* Sign of each element will be negative for non-real atoms.
1403 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1404 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1406 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1407 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1408 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1409 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1410 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1411 j_coord_offsetA
= DIM
*jnrA
;
1412 j_coord_offsetB
= DIM
*jnrB
;
1413 j_coord_offsetC
= DIM
*jnrC
;
1414 j_coord_offsetD
= DIM
*jnrD
;
1416 /* load j atom coordinates */
1417 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1418 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1419 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1420 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1422 /* Calculate displacement vector */
1423 dx00
= _mm_sub_ps(ix0
,jx0
);
1424 dy00
= _mm_sub_ps(iy0
,jy0
);
1425 dz00
= _mm_sub_ps(iz0
,jz0
);
1426 dx11
= _mm_sub_ps(ix1
,jx1
);
1427 dy11
= _mm_sub_ps(iy1
,jy1
);
1428 dz11
= _mm_sub_ps(iz1
,jz1
);
1429 dx12
= _mm_sub_ps(ix1
,jx2
);
1430 dy12
= _mm_sub_ps(iy1
,jy2
);
1431 dz12
= _mm_sub_ps(iz1
,jz2
);
1432 dx13
= _mm_sub_ps(ix1
,jx3
);
1433 dy13
= _mm_sub_ps(iy1
,jy3
);
1434 dz13
= _mm_sub_ps(iz1
,jz3
);
1435 dx21
= _mm_sub_ps(ix2
,jx1
);
1436 dy21
= _mm_sub_ps(iy2
,jy1
);
1437 dz21
= _mm_sub_ps(iz2
,jz1
);
1438 dx22
= _mm_sub_ps(ix2
,jx2
);
1439 dy22
= _mm_sub_ps(iy2
,jy2
);
1440 dz22
= _mm_sub_ps(iz2
,jz2
);
1441 dx23
= _mm_sub_ps(ix2
,jx3
);
1442 dy23
= _mm_sub_ps(iy2
,jy3
);
1443 dz23
= _mm_sub_ps(iz2
,jz3
);
1444 dx31
= _mm_sub_ps(ix3
,jx1
);
1445 dy31
= _mm_sub_ps(iy3
,jy1
);
1446 dz31
= _mm_sub_ps(iz3
,jz1
);
1447 dx32
= _mm_sub_ps(ix3
,jx2
);
1448 dy32
= _mm_sub_ps(iy3
,jy2
);
1449 dz32
= _mm_sub_ps(iz3
,jz2
);
1450 dx33
= _mm_sub_ps(ix3
,jx3
);
1451 dy33
= _mm_sub_ps(iy3
,jy3
);
1452 dz33
= _mm_sub_ps(iz3
,jz3
);
1454 /* Calculate squared distance and things based on it */
1455 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1456 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1457 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1458 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1459 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1460 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1461 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1462 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1463 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1464 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1466 rinv11
= avx128fma_invsqrt_f(rsq11
);
1467 rinv12
= avx128fma_invsqrt_f(rsq12
);
1468 rinv13
= avx128fma_invsqrt_f(rsq13
);
1469 rinv21
= avx128fma_invsqrt_f(rsq21
);
1470 rinv22
= avx128fma_invsqrt_f(rsq22
);
1471 rinv23
= avx128fma_invsqrt_f(rsq23
);
1472 rinv31
= avx128fma_invsqrt_f(rsq31
);
1473 rinv32
= avx128fma_invsqrt_f(rsq32
);
1474 rinv33
= avx128fma_invsqrt_f(rsq33
);
1476 rinvsq00
= avx128fma_inv_f(rsq00
);
1477 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1478 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1479 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1480 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1481 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1482 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1483 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1484 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1485 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1487 fjx0
= _mm_setzero_ps();
1488 fjy0
= _mm_setzero_ps();
1489 fjz0
= _mm_setzero_ps();
1490 fjx1
= _mm_setzero_ps();
1491 fjy1
= _mm_setzero_ps();
1492 fjz1
= _mm_setzero_ps();
1493 fjx2
= _mm_setzero_ps();
1494 fjy2
= _mm_setzero_ps();
1495 fjz2
= _mm_setzero_ps();
1496 fjx3
= _mm_setzero_ps();
1497 fjy3
= _mm_setzero_ps();
1498 fjz3
= _mm_setzero_ps();
1500 /**************************
1501 * CALCULATE INTERACTIONS *
1502 **************************/
1504 /* LENNARD-JONES DISPERSION/REPULSION */
1506 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1507 fvdw
= _mm_mul_ps(_mm_msub_ps(c12_00
,rinvsix
,c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1511 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1513 /* Update vectorial force */
1514 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1515 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1516 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1518 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1519 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1520 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1522 /**************************
1523 * CALCULATE INTERACTIONS *
1524 **************************/
1526 /* REACTION-FIELD ELECTROSTATICS */
1527 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
1531 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1533 /* Update vectorial force */
1534 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1535 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1536 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1538 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1539 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1540 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1542 /**************************
1543 * CALCULATE INTERACTIONS *
1544 **************************/
1546 /* REACTION-FIELD ELECTROSTATICS */
1547 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
1551 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1553 /* Update vectorial force */
1554 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1555 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1556 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1558 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1559 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1560 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1562 /**************************
1563 * CALCULATE INTERACTIONS *
1564 **************************/
1566 /* REACTION-FIELD ELECTROSTATICS */
1567 felec
= _mm_mul_ps(qq13
,_mm_msub_ps(rinv13
,rinvsq13
,krf2
));
1571 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1573 /* Update vectorial force */
1574 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
1575 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
1576 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
1578 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
1579 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
1580 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 /* REACTION-FIELD ELECTROSTATICS */
1587 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
1591 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1593 /* Update vectorial force */
1594 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1595 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1596 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1598 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1599 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1600 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 /* REACTION-FIELD ELECTROSTATICS */
1607 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
1611 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1613 /* Update vectorial force */
1614 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1615 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1616 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1618 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1619 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1620 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1622 /**************************
1623 * CALCULATE INTERACTIONS *
1624 **************************/
1626 /* REACTION-FIELD ELECTROSTATICS */
1627 felec
= _mm_mul_ps(qq23
,_mm_msub_ps(rinv23
,rinvsq23
,krf2
));
1631 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1633 /* Update vectorial force */
1634 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
1635 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
1636 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
1638 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
1639 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
1640 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 /* REACTION-FIELD ELECTROSTATICS */
1647 felec
= _mm_mul_ps(qq31
,_mm_msub_ps(rinv31
,rinvsq31
,krf2
));
1651 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1653 /* Update vectorial force */
1654 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
1655 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
1656 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
1658 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
1659 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
1660 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
1662 /**************************
1663 * CALCULATE INTERACTIONS *
1664 **************************/
1666 /* REACTION-FIELD ELECTROSTATICS */
1667 felec
= _mm_mul_ps(qq32
,_mm_msub_ps(rinv32
,rinvsq32
,krf2
));
1671 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1673 /* Update vectorial force */
1674 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
1675 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
1676 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
1678 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
1679 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
1680 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 /* REACTION-FIELD ELECTROSTATICS */
1687 felec
= _mm_mul_ps(qq33
,_mm_msub_ps(rinv33
,rinvsq33
,krf2
));
1691 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1693 /* Update vectorial force */
1694 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
1695 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
1696 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
1698 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
1699 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
1700 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
1702 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1703 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1704 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1705 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1707 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1708 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1709 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1711 /* Inner loop uses 303 flops */
1714 /* End of innermost loop */
1716 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1717 f
+i_coord_offset
,fshift
+i_shift_offset
);
1719 /* Increment number of inner iterations */
1720 inneriter
+= j_index_end
- j_index_start
;
1722 /* Outer loop uses 24 flops */
1725 /* Increment number of outer iterations */
1728 /* Update outer/inner flops */
1730 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*303);