/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
 */
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
 * Electrostatics interaction: ReactionField
 * VdW interaction:            LennardJones
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
58 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
84 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
86 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
88 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
90 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
92 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
94 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
96 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
97 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
98 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
107 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
110 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
112 __m128 rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
113 real rswitch_scalar
,d_scalar
;
114 __m128 dummy_mask
,cutoff_mask
;
115 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116 __m128 one
= _mm_set1_ps(1.0);
117 __m128 two
= _mm_set1_ps(2.0);
123 jindex
= nlist
->jindex
;
125 shiftidx
= nlist
->shift
;
127 shiftvec
= fr
->shift_vec
[0];
128 fshift
= fr
->fshift
[0];
129 facel
= _mm_set1_ps(fr
->epsfac
);
130 charge
= mdatoms
->chargeA
;
131 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
132 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
133 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
134 nvdwtype
= fr
->ntype
;
136 vdwtype
= mdatoms
->typeA
;
138 /* Setup water-specific parameters */
139 inr
= nlist
->iinr
[0];
140 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
141 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
142 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
143 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
145 jq0
= _mm_set1_ps(charge
[inr
+0]);
146 jq1
= _mm_set1_ps(charge
[inr
+1]);
147 jq2
= _mm_set1_ps(charge
[inr
+2]);
148 vdwjidx0A
= 2*vdwtype
[inr
+0];
149 qq00
= _mm_mul_ps(iq0
,jq0
);
150 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
151 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
152 qq01
= _mm_mul_ps(iq0
,jq1
);
153 qq02
= _mm_mul_ps(iq0
,jq2
);
154 qq10
= _mm_mul_ps(iq1
,jq0
);
155 qq11
= _mm_mul_ps(iq1
,jq1
);
156 qq12
= _mm_mul_ps(iq1
,jq2
);
157 qq20
= _mm_mul_ps(iq2
,jq0
);
158 qq21
= _mm_mul_ps(iq2
,jq1
);
159 qq22
= _mm_mul_ps(iq2
,jq2
);
161 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
162 rcutoff_scalar
= fr
->rcoulomb
;
163 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
164 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
166 rswitch_scalar
= fr
->rvdw_switch
;
167 rswitch
= _mm_set1_ps(rswitch_scalar
);
168 /* Setup switch parameters */
169 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
170 d
= _mm_set1_ps(d_scalar
);
171 swV3
= _mm_set1_ps(-10.0/(d_scalar
*d_scalar
*d_scalar
));
172 swV4
= _mm_set1_ps( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
173 swV5
= _mm_set1_ps( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
174 swF2
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
));
175 swF3
= _mm_set1_ps( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
176 swF4
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
178 /* Avoid stupid compiler warnings */
179 jnrA
= jnrB
= jnrC
= jnrD
= 0;
188 for(iidx
=0;iidx
<4*DIM
;iidx
++)
193 /* Start outer loop over neighborlists */
194 for(iidx
=0; iidx
<nri
; iidx
++)
196 /* Load shift vector for this list */
197 i_shift_offset
= DIM
*shiftidx
[iidx
];
199 /* Load limits for loop over neighbors */
200 j_index_start
= jindex
[iidx
];
201 j_index_end
= jindex
[iidx
+1];
203 /* Get outer coordinate index */
205 i_coord_offset
= DIM
*inr
;
207 /* Load i particle coords and add shift vector */
208 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
209 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
211 fix0
= _mm_setzero_ps();
212 fiy0
= _mm_setzero_ps();
213 fiz0
= _mm_setzero_ps();
214 fix1
= _mm_setzero_ps();
215 fiy1
= _mm_setzero_ps();
216 fiz1
= _mm_setzero_ps();
217 fix2
= _mm_setzero_ps();
218 fiy2
= _mm_setzero_ps();
219 fiz2
= _mm_setzero_ps();
221 /* Reset potential sums */
222 velecsum
= _mm_setzero_ps();
223 vvdwsum
= _mm_setzero_ps();
225 /* Start inner kernel loop */
226 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
229 /* Get j neighbor index, and coordinate index */
234 j_coord_offsetA
= DIM
*jnrA
;
235 j_coord_offsetB
= DIM
*jnrB
;
236 j_coord_offsetC
= DIM
*jnrC
;
237 j_coord_offsetD
= DIM
*jnrD
;
239 /* load j atom coordinates */
240 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
241 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
242 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
244 /* Calculate displacement vector */
245 dx00
= _mm_sub_ps(ix0
,jx0
);
246 dy00
= _mm_sub_ps(iy0
,jy0
);
247 dz00
= _mm_sub_ps(iz0
,jz0
);
248 dx01
= _mm_sub_ps(ix0
,jx1
);
249 dy01
= _mm_sub_ps(iy0
,jy1
);
250 dz01
= _mm_sub_ps(iz0
,jz1
);
251 dx02
= _mm_sub_ps(ix0
,jx2
);
252 dy02
= _mm_sub_ps(iy0
,jy2
);
253 dz02
= _mm_sub_ps(iz0
,jz2
);
254 dx10
= _mm_sub_ps(ix1
,jx0
);
255 dy10
= _mm_sub_ps(iy1
,jy0
);
256 dz10
= _mm_sub_ps(iz1
,jz0
);
257 dx11
= _mm_sub_ps(ix1
,jx1
);
258 dy11
= _mm_sub_ps(iy1
,jy1
);
259 dz11
= _mm_sub_ps(iz1
,jz1
);
260 dx12
= _mm_sub_ps(ix1
,jx2
);
261 dy12
= _mm_sub_ps(iy1
,jy2
);
262 dz12
= _mm_sub_ps(iz1
,jz2
);
263 dx20
= _mm_sub_ps(ix2
,jx0
);
264 dy20
= _mm_sub_ps(iy2
,jy0
);
265 dz20
= _mm_sub_ps(iz2
,jz0
);
266 dx21
= _mm_sub_ps(ix2
,jx1
);
267 dy21
= _mm_sub_ps(iy2
,jy1
);
268 dz21
= _mm_sub_ps(iz2
,jz1
);
269 dx22
= _mm_sub_ps(ix2
,jx2
);
270 dy22
= _mm_sub_ps(iy2
,jy2
);
271 dz22
= _mm_sub_ps(iz2
,jz2
);
273 /* Calculate squared distance and things based on it */
274 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
275 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
276 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
277 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
278 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
279 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
280 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
281 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
282 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
284 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
285 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
286 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
287 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
288 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
289 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
290 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
291 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
292 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
294 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
295 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
296 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
297 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
298 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
299 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
300 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
301 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
302 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
304 fjx0
= _mm_setzero_ps();
305 fjy0
= _mm_setzero_ps();
306 fjz0
= _mm_setzero_ps();
307 fjx1
= _mm_setzero_ps();
308 fjy1
= _mm_setzero_ps();
309 fjz1
= _mm_setzero_ps();
310 fjx2
= _mm_setzero_ps();
311 fjy2
= _mm_setzero_ps();
312 fjz2
= _mm_setzero_ps();
314 /**************************
315 * CALCULATE INTERACTIONS *
316 **************************/
318 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
321 r00
= _mm_mul_ps(rsq00
,rinv00
);
323 /* REACTION-FIELD ELECTROSTATICS */
324 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_macc_ps(krf
,rsq00
,rinv00
),crf
));
325 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
330 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
331 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
332 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
333 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
335 d
= _mm_sub_ps(r00
,rswitch
);
336 d
= _mm_max_ps(d
,_mm_setzero_ps());
337 d2
= _mm_mul_ps(d
,d
);
338 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_macc_ps(d
,_mm_macc_ps(d
,swV5
,swV4
),swV3
))));
340 dsw
= _mm_mul_ps(d2
,_mm_macc_ps(d
,_mm_macc_ps(d
,swF4
,swF3
),swF2
));
342 /* Evaluate switch function */
343 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
344 fvdw
= _mm_msub_ps( fvdw
,sw
, _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
345 vvdw
= _mm_mul_ps(vvdw
,sw
);
346 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 velec
= _mm_and_ps(velec
,cutoff_mask
);
350 velecsum
= _mm_add_ps(velecsum
,velec
);
351 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
352 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
354 fscal
= _mm_add_ps(felec
,fvdw
);
356 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
358 /* Update vectorial force */
359 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
360 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
361 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
363 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
364 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
365 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
376 /* REACTION-FIELD ELECTROSTATICS */
377 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_macc_ps(krf
,rsq01
,rinv01
),crf
));
378 felec
= _mm_mul_ps(qq01
,_mm_msub_ps(rinv01
,rinvsq01
,krf2
));
380 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
382 /* Update potential sum for this i atom from the interaction with this j atom. */
383 velec
= _mm_and_ps(velec
,cutoff_mask
);
384 velecsum
= _mm_add_ps(velecsum
,velec
);
388 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
390 /* Update vectorial force */
391 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
392 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
393 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
395 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
396 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
397 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
401 /**************************
402 * CALCULATE INTERACTIONS *
403 **************************/
405 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
408 /* REACTION-FIELD ELECTROSTATICS */
409 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_macc_ps(krf
,rsq02
,rinv02
),crf
));
410 felec
= _mm_mul_ps(qq02
,_mm_msub_ps(rinv02
,rinvsq02
,krf2
));
412 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velec
= _mm_and_ps(velec
,cutoff_mask
);
416 velecsum
= _mm_add_ps(velecsum
,velec
);
420 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
422 /* Update vectorial force */
423 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
424 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
425 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
427 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
428 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
429 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
440 /* REACTION-FIELD ELECTROSTATICS */
441 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_macc_ps(krf
,rsq10
,rinv10
),crf
));
442 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
444 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velec
= _mm_and_ps(velec
,cutoff_mask
);
448 velecsum
= _mm_add_ps(velecsum
,velec
);
452 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
454 /* Update vectorial force */
455 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
456 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
457 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
459 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
460 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
461 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
465 /**************************
466 * CALCULATE INTERACTIONS *
467 **************************/
469 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
472 /* REACTION-FIELD ELECTROSTATICS */
473 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_macc_ps(krf
,rsq11
,rinv11
),crf
));
474 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
476 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velec
= _mm_and_ps(velec
,cutoff_mask
);
480 velecsum
= _mm_add_ps(velecsum
,velec
);
484 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
486 /* Update vectorial force */
487 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
488 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
489 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
491 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
492 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
493 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
504 /* REACTION-FIELD ELECTROSTATICS */
505 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_macc_ps(krf
,rsq12
,rinv12
),crf
));
506 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
508 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velec
= _mm_and_ps(velec
,cutoff_mask
);
512 velecsum
= _mm_add_ps(velecsum
,velec
);
516 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
518 /* Update vectorial force */
519 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
520 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
521 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
523 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
524 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
525 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
536 /* REACTION-FIELD ELECTROSTATICS */
537 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_macc_ps(krf
,rsq20
,rinv20
),crf
));
538 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
540 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velec
= _mm_and_ps(velec
,cutoff_mask
);
544 velecsum
= _mm_add_ps(velecsum
,velec
);
548 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
550 /* Update vectorial force */
551 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
552 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
553 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
555 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
556 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
557 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
568 /* REACTION-FIELD ELECTROSTATICS */
569 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_macc_ps(krf
,rsq21
,rinv21
),crf
));
570 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
572 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velec
= _mm_and_ps(velec
,cutoff_mask
);
576 velecsum
= _mm_add_ps(velecsum
,velec
);
580 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
582 /* Update vectorial force */
583 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
584 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
585 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
587 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
588 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
589 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
600 /* REACTION-FIELD ELECTROSTATICS */
601 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_macc_ps(krf
,rsq22
,rinv22
),crf
));
602 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
604 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
606 /* Update potential sum for this i atom from the interaction with this j atom. */
607 velec
= _mm_and_ps(velec
,cutoff_mask
);
608 velecsum
= _mm_add_ps(velecsum
,velec
);
612 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
614 /* Update vectorial force */
615 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
616 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
617 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
619 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
620 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
621 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
625 fjptrA
= f
+j_coord_offsetA
;
626 fjptrB
= f
+j_coord_offsetB
;
627 fjptrC
= f
+j_coord_offsetC
;
628 fjptrD
= f
+j_coord_offsetD
;
630 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
631 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
633 /* Inner loop uses 385 flops */
639 /* Get j neighbor index, and coordinate index */
640 jnrlistA
= jjnr
[jidx
];
641 jnrlistB
= jjnr
[jidx
+1];
642 jnrlistC
= jjnr
[jidx
+2];
643 jnrlistD
= jjnr
[jidx
+3];
644 /* Sign of each element will be negative for non-real atoms.
645 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
646 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
648 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
649 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
650 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
651 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
652 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
653 j_coord_offsetA
= DIM
*jnrA
;
654 j_coord_offsetB
= DIM
*jnrB
;
655 j_coord_offsetC
= DIM
*jnrC
;
656 j_coord_offsetD
= DIM
*jnrD
;
658 /* load j atom coordinates */
659 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
660 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
661 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
663 /* Calculate displacement vector */
664 dx00
= _mm_sub_ps(ix0
,jx0
);
665 dy00
= _mm_sub_ps(iy0
,jy0
);
666 dz00
= _mm_sub_ps(iz0
,jz0
);
667 dx01
= _mm_sub_ps(ix0
,jx1
);
668 dy01
= _mm_sub_ps(iy0
,jy1
);
669 dz01
= _mm_sub_ps(iz0
,jz1
);
670 dx02
= _mm_sub_ps(ix0
,jx2
);
671 dy02
= _mm_sub_ps(iy0
,jy2
);
672 dz02
= _mm_sub_ps(iz0
,jz2
);
673 dx10
= _mm_sub_ps(ix1
,jx0
);
674 dy10
= _mm_sub_ps(iy1
,jy0
);
675 dz10
= _mm_sub_ps(iz1
,jz0
);
676 dx11
= _mm_sub_ps(ix1
,jx1
);
677 dy11
= _mm_sub_ps(iy1
,jy1
);
678 dz11
= _mm_sub_ps(iz1
,jz1
);
679 dx12
= _mm_sub_ps(ix1
,jx2
);
680 dy12
= _mm_sub_ps(iy1
,jy2
);
681 dz12
= _mm_sub_ps(iz1
,jz2
);
682 dx20
= _mm_sub_ps(ix2
,jx0
);
683 dy20
= _mm_sub_ps(iy2
,jy0
);
684 dz20
= _mm_sub_ps(iz2
,jz0
);
685 dx21
= _mm_sub_ps(ix2
,jx1
);
686 dy21
= _mm_sub_ps(iy2
,jy1
);
687 dz21
= _mm_sub_ps(iz2
,jz1
);
688 dx22
= _mm_sub_ps(ix2
,jx2
);
689 dy22
= _mm_sub_ps(iy2
,jy2
);
690 dz22
= _mm_sub_ps(iz2
,jz2
);
692 /* Calculate squared distance and things based on it */
693 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
694 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
695 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
696 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
697 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
698 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
699 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
700 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
701 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
703 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
704 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
705 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
706 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
707 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
708 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
709 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
710 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
711 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
713 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
714 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
715 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
716 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
717 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
718 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
719 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
720 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
721 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
723 fjx0
= _mm_setzero_ps();
724 fjy0
= _mm_setzero_ps();
725 fjz0
= _mm_setzero_ps();
726 fjx1
= _mm_setzero_ps();
727 fjy1
= _mm_setzero_ps();
728 fjz1
= _mm_setzero_ps();
729 fjx2
= _mm_setzero_ps();
730 fjy2
= _mm_setzero_ps();
731 fjz2
= _mm_setzero_ps();
733 /**************************
734 * CALCULATE INTERACTIONS *
735 **************************/
737 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
740 r00
= _mm_mul_ps(rsq00
,rinv00
);
741 r00
= _mm_andnot_ps(dummy_mask
,r00
);
743 /* REACTION-FIELD ELECTROSTATICS */
744 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_macc_ps(krf
,rsq00
,rinv00
),crf
));
745 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
747 /* LENNARD-JONES DISPERSION/REPULSION */
749 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
750 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
751 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
752 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
753 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
755 d
= _mm_sub_ps(r00
,rswitch
);
756 d
= _mm_max_ps(d
,_mm_setzero_ps());
757 d2
= _mm_mul_ps(d
,d
);
758 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_macc_ps(d
,_mm_macc_ps(d
,swV5
,swV4
),swV3
))));
760 dsw
= _mm_mul_ps(d2
,_mm_macc_ps(d
,_mm_macc_ps(d
,swF4
,swF3
),swF2
));
762 /* Evaluate switch function */
763 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
764 fvdw
= _mm_msub_ps( fvdw
,sw
, _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
765 vvdw
= _mm_mul_ps(vvdw
,sw
);
766 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
768 /* Update potential sum for this i atom from the interaction with this j atom. */
769 velec
= _mm_and_ps(velec
,cutoff_mask
);
770 velec
= _mm_andnot_ps(dummy_mask
,velec
);
771 velecsum
= _mm_add_ps(velecsum
,velec
);
772 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
773 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
774 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
776 fscal
= _mm_add_ps(felec
,fvdw
);
778 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
780 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
782 /* Update vectorial force */
783 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
784 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
785 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
787 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
788 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
789 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
800 /* REACTION-FIELD ELECTROSTATICS */
801 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_macc_ps(krf
,rsq01
,rinv01
),crf
));
802 felec
= _mm_mul_ps(qq01
,_mm_msub_ps(rinv01
,rinvsq01
,krf2
));
804 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
806 /* Update potential sum for this i atom from the interaction with this j atom. */
807 velec
= _mm_and_ps(velec
,cutoff_mask
);
808 velec
= _mm_andnot_ps(dummy_mask
,velec
);
809 velecsum
= _mm_add_ps(velecsum
,velec
);
813 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
815 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
817 /* Update vectorial force */
818 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
819 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
820 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
822 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
823 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
824 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
828 /**************************
829 * CALCULATE INTERACTIONS *
830 **************************/
832 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
835 /* REACTION-FIELD ELECTROSTATICS */
836 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_macc_ps(krf
,rsq02
,rinv02
),crf
));
837 felec
= _mm_mul_ps(qq02
,_mm_msub_ps(rinv02
,rinvsq02
,krf2
));
839 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec
= _mm_and_ps(velec
,cutoff_mask
);
843 velec
= _mm_andnot_ps(dummy_mask
,velec
);
844 velecsum
= _mm_add_ps(velecsum
,velec
);
848 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
850 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
852 /* Update vectorial force */
853 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
854 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
855 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
857 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
858 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
859 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
870 /* REACTION-FIELD ELECTROSTATICS */
871 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_macc_ps(krf
,rsq10
,rinv10
),crf
));
872 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
874 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
876 /* Update potential sum for this i atom from the interaction with this j atom. */
877 velec
= _mm_and_ps(velec
,cutoff_mask
);
878 velec
= _mm_andnot_ps(dummy_mask
,velec
);
879 velecsum
= _mm_add_ps(velecsum
,velec
);
883 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
885 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
887 /* Update vectorial force */
888 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
889 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
890 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
892 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
893 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
894 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
898 /**************************
899 * CALCULATE INTERACTIONS *
900 **************************/
902 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
905 /* REACTION-FIELD ELECTROSTATICS */
906 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_macc_ps(krf
,rsq11
,rinv11
),crf
));
907 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
909 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
911 /* Update potential sum for this i atom from the interaction with this j atom. */
912 velec
= _mm_and_ps(velec
,cutoff_mask
);
913 velec
= _mm_andnot_ps(dummy_mask
,velec
);
914 velecsum
= _mm_add_ps(velecsum
,velec
);
918 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
920 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
922 /* Update vectorial force */
923 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
924 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
925 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
927 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
928 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
929 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
933 /**************************
934 * CALCULATE INTERACTIONS *
935 **************************/
937 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
940 /* REACTION-FIELD ELECTROSTATICS */
941 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_macc_ps(krf
,rsq12
,rinv12
),crf
));
942 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
944 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
946 /* Update potential sum for this i atom from the interaction with this j atom. */
947 velec
= _mm_and_ps(velec
,cutoff_mask
);
948 velec
= _mm_andnot_ps(dummy_mask
,velec
);
949 velecsum
= _mm_add_ps(velecsum
,velec
);
953 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
955 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
957 /* Update vectorial force */
958 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
959 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
960 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
962 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
963 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
964 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
968 /**************************
969 * CALCULATE INTERACTIONS *
970 **************************/
972 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
975 /* REACTION-FIELD ELECTROSTATICS */
976 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_macc_ps(krf
,rsq20
,rinv20
),crf
));
977 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
979 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
981 /* Update potential sum for this i atom from the interaction with this j atom. */
982 velec
= _mm_and_ps(velec
,cutoff_mask
);
983 velec
= _mm_andnot_ps(dummy_mask
,velec
);
984 velecsum
= _mm_add_ps(velecsum
,velec
);
988 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
990 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
992 /* Update vectorial force */
993 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
994 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
995 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
997 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
998 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
999 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
1003 /**************************
1004 * CALCULATE INTERACTIONS *
1005 **************************/
1007 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1010 /* REACTION-FIELD ELECTROSTATICS */
1011 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_macc_ps(krf
,rsq21
,rinv21
),crf
));
1012 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
1014 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1016 /* Update potential sum for this i atom from the interaction with this j atom. */
1017 velec
= _mm_and_ps(velec
,cutoff_mask
);
1018 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1019 velecsum
= _mm_add_ps(velecsum
,velec
);
1023 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1025 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1027 /* Update vectorial force */
1028 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1029 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1030 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1032 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1033 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1034 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1045 /* REACTION-FIELD ELECTROSTATICS */
1046 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_macc_ps(krf
,rsq22
,rinv22
),crf
));
1047 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
1049 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec
= _mm_and_ps(velec
,cutoff_mask
);
1053 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1054 velecsum
= _mm_add_ps(velecsum
,velec
);
1058 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1060 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1062 /* Update vectorial force */
1063 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1064 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1065 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1067 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1068 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1069 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1073 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1074 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1075 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1076 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1078 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1079 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1081 /* Inner loop uses 386 flops */
1084 /* End of innermost loop */
1086 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1087 f
+i_coord_offset
,fshift
+i_shift_offset
);
1090 /* Update potential energies */
1091 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1092 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1094 /* Increment number of inner iterations */
1095 inneriter
+= j_index_end
- j_index_start
;
1097 /* Outer loop uses 20 flops */
1100 /* Increment number of outer iterations */
1103 /* Update outer/inner flops */
1105 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*386);
1108 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1109 * Electrostatics interaction: ReactionField
1110 * VdW interaction: LennardJones
1111 * Geometry: Water3-Water3
1112 * Calculate force/pot: Force
1115 nb_kernel_ElecRFCut_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1116 (t_nblist
* gmx_restrict nlist
,
1117 rvec
* gmx_restrict xx
,
1118 rvec
* gmx_restrict ff
,
1119 t_forcerec
* gmx_restrict fr
,
1120 t_mdatoms
* gmx_restrict mdatoms
,
1121 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1122 t_nrnb
* gmx_restrict nrnb
)
1124 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1125 * just 0 for non-waters.
1126 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1127 * jnr indices corresponding to data put in the four positions in the SIMD register.
1129 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1130 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1131 int jnrA
,jnrB
,jnrC
,jnrD
;
1132 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1133 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1134 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1135 real rcutoff_scalar
;
1136 real
*shiftvec
,*fshift
,*x
,*f
;
1137 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1138 real scratch
[4*DIM
];
1139 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
1141 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1143 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1145 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1146 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1147 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1148 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1149 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1150 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1151 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1152 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1153 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1154 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1155 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1156 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1157 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1158 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1159 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1160 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1161 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1164 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1167 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1168 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1169 __m128 rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
1170 real rswitch_scalar
,d_scalar
;
1171 __m128 dummy_mask
,cutoff_mask
;
1172 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1173 __m128 one
= _mm_set1_ps(1.0);
1174 __m128 two
= _mm_set1_ps(2.0);
1180 jindex
= nlist
->jindex
;
1182 shiftidx
= nlist
->shift
;
1184 shiftvec
= fr
->shift_vec
[0];
1185 fshift
= fr
->fshift
[0];
1186 facel
= _mm_set1_ps(fr
->epsfac
);
1187 charge
= mdatoms
->chargeA
;
1188 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
1189 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
1190 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
1191 nvdwtype
= fr
->ntype
;
1192 vdwparam
= fr
->nbfp
;
1193 vdwtype
= mdatoms
->typeA
;
1195 /* Setup water-specific parameters */
1196 inr
= nlist
->iinr
[0];
1197 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1198 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1199 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1200 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1202 jq0
= _mm_set1_ps(charge
[inr
+0]);
1203 jq1
= _mm_set1_ps(charge
[inr
+1]);
1204 jq2
= _mm_set1_ps(charge
[inr
+2]);
1205 vdwjidx0A
= 2*vdwtype
[inr
+0];
1206 qq00
= _mm_mul_ps(iq0
,jq0
);
1207 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1208 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1209 qq01
= _mm_mul_ps(iq0
,jq1
);
1210 qq02
= _mm_mul_ps(iq0
,jq2
);
1211 qq10
= _mm_mul_ps(iq1
,jq0
);
1212 qq11
= _mm_mul_ps(iq1
,jq1
);
1213 qq12
= _mm_mul_ps(iq1
,jq2
);
1214 qq20
= _mm_mul_ps(iq2
,jq0
);
1215 qq21
= _mm_mul_ps(iq2
,jq1
);
1216 qq22
= _mm_mul_ps(iq2
,jq2
);
1218 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1219 rcutoff_scalar
= fr
->rcoulomb
;
1220 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
1221 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
1223 rswitch_scalar
= fr
->rvdw_switch
;
1224 rswitch
= _mm_set1_ps(rswitch_scalar
);
1225 /* Setup switch parameters */
1226 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
1227 d
= _mm_set1_ps(d_scalar
);
1228 swV3
= _mm_set1_ps(-10.0/(d_scalar
*d_scalar
*d_scalar
));
1229 swV4
= _mm_set1_ps( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1230 swV5
= _mm_set1_ps( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1231 swF2
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
));
1232 swF3
= _mm_set1_ps( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1233 swF4
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1235 /* Avoid stupid compiler warnings */
1236 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1237 j_coord_offsetA
= 0;
1238 j_coord_offsetB
= 0;
1239 j_coord_offsetC
= 0;
1240 j_coord_offsetD
= 0;
1245 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1247 scratch
[iidx
] = 0.0;
1250 /* Start outer loop over neighborlists */
1251 for(iidx
=0; iidx
<nri
; iidx
++)
1253 /* Load shift vector for this list */
1254 i_shift_offset
= DIM
*shiftidx
[iidx
];
1256 /* Load limits for loop over neighbors */
1257 j_index_start
= jindex
[iidx
];
1258 j_index_end
= jindex
[iidx
+1];
1260 /* Get outer coordinate index */
1262 i_coord_offset
= DIM
*inr
;
1264 /* Load i particle coords and add shift vector */
1265 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1266 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1268 fix0
= _mm_setzero_ps();
1269 fiy0
= _mm_setzero_ps();
1270 fiz0
= _mm_setzero_ps();
1271 fix1
= _mm_setzero_ps();
1272 fiy1
= _mm_setzero_ps();
1273 fiz1
= _mm_setzero_ps();
1274 fix2
= _mm_setzero_ps();
1275 fiy2
= _mm_setzero_ps();
1276 fiz2
= _mm_setzero_ps();
1278 /* Start inner kernel loop */
1279 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1282 /* Get j neighbor index, and coordinate index */
1284 jnrB
= jjnr
[jidx
+1];
1285 jnrC
= jjnr
[jidx
+2];
1286 jnrD
= jjnr
[jidx
+3];
1287 j_coord_offsetA
= DIM
*jnrA
;
1288 j_coord_offsetB
= DIM
*jnrB
;
1289 j_coord_offsetC
= DIM
*jnrC
;
1290 j_coord_offsetD
= DIM
*jnrD
;
1292 /* load j atom coordinates */
1293 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1294 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1295 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1297 /* Calculate displacement vector */
1298 dx00
= _mm_sub_ps(ix0
,jx0
);
1299 dy00
= _mm_sub_ps(iy0
,jy0
);
1300 dz00
= _mm_sub_ps(iz0
,jz0
);
1301 dx01
= _mm_sub_ps(ix0
,jx1
);
1302 dy01
= _mm_sub_ps(iy0
,jy1
);
1303 dz01
= _mm_sub_ps(iz0
,jz1
);
1304 dx02
= _mm_sub_ps(ix0
,jx2
);
1305 dy02
= _mm_sub_ps(iy0
,jy2
);
1306 dz02
= _mm_sub_ps(iz0
,jz2
);
1307 dx10
= _mm_sub_ps(ix1
,jx0
);
1308 dy10
= _mm_sub_ps(iy1
,jy0
);
1309 dz10
= _mm_sub_ps(iz1
,jz0
);
1310 dx11
= _mm_sub_ps(ix1
,jx1
);
1311 dy11
= _mm_sub_ps(iy1
,jy1
);
1312 dz11
= _mm_sub_ps(iz1
,jz1
);
1313 dx12
= _mm_sub_ps(ix1
,jx2
);
1314 dy12
= _mm_sub_ps(iy1
,jy2
);
1315 dz12
= _mm_sub_ps(iz1
,jz2
);
1316 dx20
= _mm_sub_ps(ix2
,jx0
);
1317 dy20
= _mm_sub_ps(iy2
,jy0
);
1318 dz20
= _mm_sub_ps(iz2
,jz0
);
1319 dx21
= _mm_sub_ps(ix2
,jx1
);
1320 dy21
= _mm_sub_ps(iy2
,jy1
);
1321 dz21
= _mm_sub_ps(iz2
,jz1
);
1322 dx22
= _mm_sub_ps(ix2
,jx2
);
1323 dy22
= _mm_sub_ps(iy2
,jy2
);
1324 dz22
= _mm_sub_ps(iz2
,jz2
);
1326 /* Calculate squared distance and things based on it */
1327 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1328 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1329 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1330 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1331 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1332 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1333 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1334 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1335 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1337 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1338 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1339 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1340 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1341 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1342 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1343 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1344 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1345 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1347 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1348 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1349 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1350 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1351 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1352 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1353 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1354 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1355 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1357 fjx0
= _mm_setzero_ps();
1358 fjy0
= _mm_setzero_ps();
1359 fjz0
= _mm_setzero_ps();
1360 fjx1
= _mm_setzero_ps();
1361 fjy1
= _mm_setzero_ps();
1362 fjz1
= _mm_setzero_ps();
1363 fjx2
= _mm_setzero_ps();
1364 fjy2
= _mm_setzero_ps();
1365 fjz2
= _mm_setzero_ps();
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1374 r00
= _mm_mul_ps(rsq00
,rinv00
);
1376 /* REACTION-FIELD ELECTROSTATICS */
1377 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
1379 /* LENNARD-JONES DISPERSION/REPULSION */
1381 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1382 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
1383 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
1384 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
1385 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
1387 d
= _mm_sub_ps(r00
,rswitch
);
1388 d
= _mm_max_ps(d
,_mm_setzero_ps());
1389 d2
= _mm_mul_ps(d
,d
);
1390 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_macc_ps(d
,_mm_macc_ps(d
,swV5
,swV4
),swV3
))));
1392 dsw
= _mm_mul_ps(d2
,_mm_macc_ps(d
,_mm_macc_ps(d
,swF4
,swF3
),swF2
));
1394 /* Evaluate switch function */
1395 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1396 fvdw
= _mm_msub_ps( fvdw
,sw
, _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
1397 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1399 fscal
= _mm_add_ps(felec
,fvdw
);
1401 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1403 /* Update vectorial force */
1404 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1405 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1406 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1408 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1409 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1410 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1414 /**************************
1415 * CALCULATE INTERACTIONS *
1416 **************************/
1418 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1421 /* REACTION-FIELD ELECTROSTATICS */
1422 felec
= _mm_mul_ps(qq01
,_mm_msub_ps(rinv01
,rinvsq01
,krf2
));
1424 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
1428 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1430 /* Update vectorial force */
1431 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
1432 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
1433 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
1435 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
1436 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
1437 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
1441 /**************************
1442 * CALCULATE INTERACTIONS *
1443 **************************/
1445 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1448 /* REACTION-FIELD ELECTROSTATICS */
1449 felec
= _mm_mul_ps(qq02
,_mm_msub_ps(rinv02
,rinvsq02
,krf2
));
1451 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
1455 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1457 /* Update vectorial force */
1458 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
1459 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
1460 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
1462 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
1463 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
1464 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
1468 /**************************
1469 * CALCULATE INTERACTIONS *
1470 **************************/
1472 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1475 /* REACTION-FIELD ELECTROSTATICS */
1476 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
1478 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1482 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1484 /* Update vectorial force */
1485 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
1486 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
1487 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
1489 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
1490 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
1491 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
1495 /**************************
1496 * CALCULATE INTERACTIONS *
1497 **************************/
1499 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1502 /* REACTION-FIELD ELECTROSTATICS */
1503 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
1505 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1509 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1511 /* Update vectorial force */
1512 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1513 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1514 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1516 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1517 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1518 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1522 /**************************
1523 * CALCULATE INTERACTIONS *
1524 **************************/
1526 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1529 /* REACTION-FIELD ELECTROSTATICS */
1530 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
1532 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1536 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1538 /* Update vectorial force */
1539 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1540 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1541 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1543 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1544 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1545 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1553 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1556 /* REACTION-FIELD ELECTROSTATICS */
1557 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
1559 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1563 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1565 /* Update vectorial force */
1566 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
1567 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
1568 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
1570 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
1571 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
1572 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
1576 /**************************
1577 * CALCULATE INTERACTIONS *
1578 **************************/
1580 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1583 /* REACTION-FIELD ELECTROSTATICS */
1584 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
1586 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1590 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1592 /* Update vectorial force */
1593 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1594 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1595 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1597 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1598 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1599 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1603 /**************************
1604 * CALCULATE INTERACTIONS *
1605 **************************/
1607 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1610 /* REACTION-FIELD ELECTROSTATICS */
1611 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
1613 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1617 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1619 /* Update vectorial force */
1620 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1621 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1622 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1624 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1625 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1626 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1630 fjptrA
= f
+j_coord_offsetA
;
1631 fjptrB
= f
+j_coord_offsetB
;
1632 fjptrC
= f
+j_coord_offsetC
;
1633 fjptrD
= f
+j_coord_offsetD
;
1635 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1636 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1638 /* Inner loop uses 328 flops */
1641 if(jidx
<j_index_end
)
1644 /* Get j neighbor index, and coordinate index */
1645 jnrlistA
= jjnr
[jidx
];
1646 jnrlistB
= jjnr
[jidx
+1];
1647 jnrlistC
= jjnr
[jidx
+2];
1648 jnrlistD
= jjnr
[jidx
+3];
1649 /* Sign of each element will be negative for non-real atoms.
1650 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1651 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1653 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1654 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1655 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1656 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1657 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1658 j_coord_offsetA
= DIM
*jnrA
;
1659 j_coord_offsetB
= DIM
*jnrB
;
1660 j_coord_offsetC
= DIM
*jnrC
;
1661 j_coord_offsetD
= DIM
*jnrD
;
1663 /* load j atom coordinates */
1664 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1665 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1666 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1668 /* Calculate displacement vector */
1669 dx00
= _mm_sub_ps(ix0
,jx0
);
1670 dy00
= _mm_sub_ps(iy0
,jy0
);
1671 dz00
= _mm_sub_ps(iz0
,jz0
);
1672 dx01
= _mm_sub_ps(ix0
,jx1
);
1673 dy01
= _mm_sub_ps(iy0
,jy1
);
1674 dz01
= _mm_sub_ps(iz0
,jz1
);
1675 dx02
= _mm_sub_ps(ix0
,jx2
);
1676 dy02
= _mm_sub_ps(iy0
,jy2
);
1677 dz02
= _mm_sub_ps(iz0
,jz2
);
1678 dx10
= _mm_sub_ps(ix1
,jx0
);
1679 dy10
= _mm_sub_ps(iy1
,jy0
);
1680 dz10
= _mm_sub_ps(iz1
,jz0
);
1681 dx11
= _mm_sub_ps(ix1
,jx1
);
1682 dy11
= _mm_sub_ps(iy1
,jy1
);
1683 dz11
= _mm_sub_ps(iz1
,jz1
);
1684 dx12
= _mm_sub_ps(ix1
,jx2
);
1685 dy12
= _mm_sub_ps(iy1
,jy2
);
1686 dz12
= _mm_sub_ps(iz1
,jz2
);
1687 dx20
= _mm_sub_ps(ix2
,jx0
);
1688 dy20
= _mm_sub_ps(iy2
,jy0
);
1689 dz20
= _mm_sub_ps(iz2
,jz0
);
1690 dx21
= _mm_sub_ps(ix2
,jx1
);
1691 dy21
= _mm_sub_ps(iy2
,jy1
);
1692 dz21
= _mm_sub_ps(iz2
,jz1
);
1693 dx22
= _mm_sub_ps(ix2
,jx2
);
1694 dy22
= _mm_sub_ps(iy2
,jy2
);
1695 dz22
= _mm_sub_ps(iz2
,jz2
);
1697 /* Calculate squared distance and things based on it */
1698 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1699 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1700 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1701 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1702 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1703 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1704 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1705 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1706 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1708 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1709 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1710 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1711 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1712 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1713 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1714 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1715 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1716 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1718 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1719 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1720 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1721 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1722 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1723 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1724 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1725 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1726 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1728 fjx0
= _mm_setzero_ps();
1729 fjy0
= _mm_setzero_ps();
1730 fjz0
= _mm_setzero_ps();
1731 fjx1
= _mm_setzero_ps();
1732 fjy1
= _mm_setzero_ps();
1733 fjz1
= _mm_setzero_ps();
1734 fjx2
= _mm_setzero_ps();
1735 fjy2
= _mm_setzero_ps();
1736 fjz2
= _mm_setzero_ps();
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1745 r00
= _mm_mul_ps(rsq00
,rinv00
);
1746 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1748 /* REACTION-FIELD ELECTROSTATICS */
1749 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
1751 /* LENNARD-JONES DISPERSION/REPULSION */
1753 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1754 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
1755 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
1756 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
1757 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
1759 d
= _mm_sub_ps(r00
,rswitch
);
1760 d
= _mm_max_ps(d
,_mm_setzero_ps());
1761 d2
= _mm_mul_ps(d
,d
);
1762 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_macc_ps(d
,_mm_macc_ps(d
,swV5
,swV4
),swV3
))));
1764 dsw
= _mm_mul_ps(d2
,_mm_macc_ps(d
,_mm_macc_ps(d
,swF4
,swF3
),swF2
));
1766 /* Evaluate switch function */
1767 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1768 fvdw
= _mm_msub_ps( fvdw
,sw
, _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
1769 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1771 fscal
= _mm_add_ps(felec
,fvdw
);
1773 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1775 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1777 /* Update vectorial force */
1778 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1779 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1780 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1782 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1783 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1784 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1788 /**************************
1789 * CALCULATE INTERACTIONS *
1790 **************************/
1792 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1795 /* REACTION-FIELD ELECTROSTATICS */
1796 felec
= _mm_mul_ps(qq01
,_mm_msub_ps(rinv01
,rinvsq01
,krf2
));
1798 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
1802 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1804 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1806 /* Update vectorial force */
1807 fix0
= _mm_macc_ps(dx01
,fscal
,fix0
);
1808 fiy0
= _mm_macc_ps(dy01
,fscal
,fiy0
);
1809 fiz0
= _mm_macc_ps(dz01
,fscal
,fiz0
);
1811 fjx1
= _mm_macc_ps(dx01
,fscal
,fjx1
);
1812 fjy1
= _mm_macc_ps(dy01
,fscal
,fjy1
);
1813 fjz1
= _mm_macc_ps(dz01
,fscal
,fjz1
);
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1824 /* REACTION-FIELD ELECTROSTATICS */
1825 felec
= _mm_mul_ps(qq02
,_mm_msub_ps(rinv02
,rinvsq02
,krf2
));
1827 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
1831 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1833 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1835 /* Update vectorial force */
1836 fix0
= _mm_macc_ps(dx02
,fscal
,fix0
);
1837 fiy0
= _mm_macc_ps(dy02
,fscal
,fiy0
);
1838 fiz0
= _mm_macc_ps(dz02
,fscal
,fiz0
);
1840 fjx2
= _mm_macc_ps(dx02
,fscal
,fjx2
);
1841 fjy2
= _mm_macc_ps(dy02
,fscal
,fjy2
);
1842 fjz2
= _mm_macc_ps(dz02
,fscal
,fjz2
);
1846 /**************************
1847 * CALCULATE INTERACTIONS *
1848 **************************/
1850 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1853 /* REACTION-FIELD ELECTROSTATICS */
1854 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
1856 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1860 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1862 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1864 /* Update vectorial force */
1865 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
1866 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
1867 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
1869 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
1870 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
1871 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
1875 /**************************
1876 * CALCULATE INTERACTIONS *
1877 **************************/
1879 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1882 /* REACTION-FIELD ELECTROSTATICS */
1883 felec
= _mm_mul_ps(qq11
,_mm_msub_ps(rinv11
,rinvsq11
,krf2
));
1885 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1889 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1891 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1893 /* Update vectorial force */
1894 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1895 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1896 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1898 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1899 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1900 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1904 /**************************
1905 * CALCULATE INTERACTIONS *
1906 **************************/
1908 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1911 /* REACTION-FIELD ELECTROSTATICS */
1912 felec
= _mm_mul_ps(qq12
,_mm_msub_ps(rinv12
,rinvsq12
,krf2
));
1914 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1918 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1920 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1922 /* Update vectorial force */
1923 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1924 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1925 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1927 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1928 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1929 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1933 /**************************
1934 * CALCULATE INTERACTIONS *
1935 **************************/
1937 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1940 /* REACTION-FIELD ELECTROSTATICS */
1941 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
1943 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1947 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1949 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1951 /* Update vectorial force */
1952 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
1953 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
1954 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
1956 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
1957 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
1958 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
1962 /**************************
1963 * CALCULATE INTERACTIONS *
1964 **************************/
1966 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1969 /* REACTION-FIELD ELECTROSTATICS */
1970 felec
= _mm_mul_ps(qq21
,_mm_msub_ps(rinv21
,rinvsq21
,krf2
));
1972 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1976 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1978 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1980 /* Update vectorial force */
1981 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1982 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1983 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1985 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1986 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1987 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1998 /* REACTION-FIELD ELECTROSTATICS */
1999 felec
= _mm_mul_ps(qq22
,_mm_msub_ps(rinv22
,rinvsq22
,krf2
));
2001 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2005 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2007 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2009 /* Update vectorial force */
2010 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
2011 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
2012 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
2014 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
2015 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
2016 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
2020 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2021 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2022 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2023 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2025 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2026 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2028 /* Inner loop uses 329 flops */
2031 /* End of innermost loop */
2033 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2034 f
+i_coord_offset
,fshift
+i_shift_offset
);
2036 /* Increment number of inner iterations */
2037 inneriter
+= j_index_end
- j_index_start
;
2039 /* Outer loop uses 18 flops */
2042 /* Increment number of outer iterations */
2045 /* Update outer/inner flops */
2047 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*329);