2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double
53 * Electrostatics interaction: ReactionField
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
100 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
101 real rswitch_scalar
,d_scalar
;
102 __m128d dummy_mask
,cutoff_mask
;
103 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
104 __m128d one
= _mm_set1_pd(1.0);
105 __m128d two
= _mm_set1_pd(2.0);
111 jindex
= nlist
->jindex
;
113 shiftidx
= nlist
->shift
;
115 shiftvec
= fr
->shift_vec
[0];
116 fshift
= fr
->fshift
[0];
117 facel
= _mm_set1_pd(fr
->epsfac
);
118 charge
= mdatoms
->chargeA
;
119 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
120 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
121 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
122 nvdwtype
= fr
->ntype
;
124 vdwtype
= mdatoms
->typeA
;
126 /* Setup water-specific parameters */
127 inr
= nlist
->iinr
[0];
128 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
129 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
130 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
131 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
133 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
134 rcutoff_scalar
= fr
->rcoulomb
;
135 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
136 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
138 rswitch_scalar
= fr
->rvdw_switch
;
139 rswitch
= _mm_set1_pd(rswitch_scalar
);
140 /* Setup switch parameters */
141 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
142 d
= _mm_set1_pd(d_scalar
);
143 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
144 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
145 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
146 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
147 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
148 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
150 /* Avoid stupid compiler warnings */
158 /* Start outer loop over neighborlists */
159 for(iidx
=0; iidx
<nri
; iidx
++)
161 /* Load shift vector for this list */
162 i_shift_offset
= DIM
*shiftidx
[iidx
];
164 /* Load limits for loop over neighbors */
165 j_index_start
= jindex
[iidx
];
166 j_index_end
= jindex
[iidx
+1];
168 /* Get outer coordinate index */
170 i_coord_offset
= DIM
*inr
;
172 /* Load i particle coords and add shift vector */
173 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
174 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
176 fix0
= _mm_setzero_pd();
177 fiy0
= _mm_setzero_pd();
178 fiz0
= _mm_setzero_pd();
179 fix1
= _mm_setzero_pd();
180 fiy1
= _mm_setzero_pd();
181 fiz1
= _mm_setzero_pd();
182 fix2
= _mm_setzero_pd();
183 fiy2
= _mm_setzero_pd();
184 fiz2
= _mm_setzero_pd();
186 /* Reset potential sums */
187 velecsum
= _mm_setzero_pd();
188 vvdwsum
= _mm_setzero_pd();
190 /* Start inner kernel loop */
191 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
194 /* Get j neighbor index, and coordinate index */
197 j_coord_offsetA
= DIM
*jnrA
;
198 j_coord_offsetB
= DIM
*jnrB
;
200 /* load j atom coordinates */
201 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
204 /* Calculate displacement vector */
205 dx00
= _mm_sub_pd(ix0
,jx0
);
206 dy00
= _mm_sub_pd(iy0
,jy0
);
207 dz00
= _mm_sub_pd(iz0
,jz0
);
208 dx10
= _mm_sub_pd(ix1
,jx0
);
209 dy10
= _mm_sub_pd(iy1
,jy0
);
210 dz10
= _mm_sub_pd(iz1
,jz0
);
211 dx20
= _mm_sub_pd(ix2
,jx0
);
212 dy20
= _mm_sub_pd(iy2
,jy0
);
213 dz20
= _mm_sub_pd(iz2
,jz0
);
215 /* Calculate squared distance and things based on it */
216 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
217 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
218 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
220 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
221 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
222 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
224 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
225 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
226 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
228 /* Load parameters for j particles */
229 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
230 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
231 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
233 fjx0
= _mm_setzero_pd();
234 fjy0
= _mm_setzero_pd();
235 fjz0
= _mm_setzero_pd();
237 /**************************
238 * CALCULATE INTERACTIONS *
239 **************************/
241 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
244 r00
= _mm_mul_pd(rsq00
,rinv00
);
246 /* Compute parameters for interactions between i and j atoms */
247 qq00
= _mm_mul_pd(iq0
,jq0
);
248 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
249 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
251 /* REACTION-FIELD ELECTROSTATICS */
252 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
253 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
255 /* LENNARD-JONES DISPERSION/REPULSION */
257 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
258 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
259 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
260 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
261 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
263 d
= _mm_sub_pd(r00
,rswitch
);
264 d
= _mm_max_pd(d
,_mm_setzero_pd());
265 d2
= _mm_mul_pd(d
,d
);
266 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
268 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
270 /* Evaluate switch function */
271 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
272 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
273 vvdw
= _mm_mul_pd(vvdw
,sw
);
274 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
276 /* Update potential sum for this i atom from the interaction with this j atom. */
277 velec
= _mm_and_pd(velec
,cutoff_mask
);
278 velecsum
= _mm_add_pd(velecsum
,velec
);
279 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
280 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
282 fscal
= _mm_add_pd(felec
,fvdw
);
284 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
286 /* Calculate temporary vectorial force */
287 tx
= _mm_mul_pd(fscal
,dx00
);
288 ty
= _mm_mul_pd(fscal
,dy00
);
289 tz
= _mm_mul_pd(fscal
,dz00
);
291 /* Update vectorial force */
292 fix0
= _mm_add_pd(fix0
,tx
);
293 fiy0
= _mm_add_pd(fiy0
,ty
);
294 fiz0
= _mm_add_pd(fiz0
,tz
);
296 fjx0
= _mm_add_pd(fjx0
,tx
);
297 fjy0
= _mm_add_pd(fjy0
,ty
);
298 fjz0
= _mm_add_pd(fjz0
,tz
);
302 /**************************
303 * CALCULATE INTERACTIONS *
304 **************************/
306 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
309 /* Compute parameters for interactions between i and j atoms */
310 qq10
= _mm_mul_pd(iq1
,jq0
);
312 /* REACTION-FIELD ELECTROSTATICS */
313 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
314 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
316 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 velec
= _mm_and_pd(velec
,cutoff_mask
);
320 velecsum
= _mm_add_pd(velecsum
,velec
);
324 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
326 /* Calculate temporary vectorial force */
327 tx
= _mm_mul_pd(fscal
,dx10
);
328 ty
= _mm_mul_pd(fscal
,dy10
);
329 tz
= _mm_mul_pd(fscal
,dz10
);
331 /* Update vectorial force */
332 fix1
= _mm_add_pd(fix1
,tx
);
333 fiy1
= _mm_add_pd(fiy1
,ty
);
334 fiz1
= _mm_add_pd(fiz1
,tz
);
336 fjx0
= _mm_add_pd(fjx0
,tx
);
337 fjy0
= _mm_add_pd(fjy0
,ty
);
338 fjz0
= _mm_add_pd(fjz0
,tz
);
342 /**************************
343 * CALCULATE INTERACTIONS *
344 **************************/
346 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
349 /* Compute parameters for interactions between i and j atoms */
350 qq20
= _mm_mul_pd(iq2
,jq0
);
352 /* REACTION-FIELD ELECTROSTATICS */
353 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
354 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
356 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
358 /* Update potential sum for this i atom from the interaction with this j atom. */
359 velec
= _mm_and_pd(velec
,cutoff_mask
);
360 velecsum
= _mm_add_pd(velecsum
,velec
);
364 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
366 /* Calculate temporary vectorial force */
367 tx
= _mm_mul_pd(fscal
,dx20
);
368 ty
= _mm_mul_pd(fscal
,dy20
);
369 tz
= _mm_mul_pd(fscal
,dz20
);
371 /* Update vectorial force */
372 fix2
= _mm_add_pd(fix2
,tx
);
373 fiy2
= _mm_add_pd(fiy2
,ty
);
374 fiz2
= _mm_add_pd(fiz2
,tz
);
376 fjx0
= _mm_add_pd(fjx0
,tx
);
377 fjy0
= _mm_add_pd(fjy0
,ty
);
378 fjz0
= _mm_add_pd(fjz0
,tz
);
382 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
384 /* Inner loop uses 145 flops */
391 j_coord_offsetA
= DIM
*jnrA
;
393 /* load j atom coordinates */
394 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
397 /* Calculate displacement vector */
398 dx00
= _mm_sub_pd(ix0
,jx0
);
399 dy00
= _mm_sub_pd(iy0
,jy0
);
400 dz00
= _mm_sub_pd(iz0
,jz0
);
401 dx10
= _mm_sub_pd(ix1
,jx0
);
402 dy10
= _mm_sub_pd(iy1
,jy0
);
403 dz10
= _mm_sub_pd(iz1
,jz0
);
404 dx20
= _mm_sub_pd(ix2
,jx0
);
405 dy20
= _mm_sub_pd(iy2
,jy0
);
406 dz20
= _mm_sub_pd(iz2
,jz0
);
408 /* Calculate squared distance and things based on it */
409 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
410 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
411 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
413 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
414 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
415 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
417 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
418 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
419 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
421 /* Load parameters for j particles */
422 jq0
= _mm_load_sd(charge
+jnrA
+0);
423 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
425 fjx0
= _mm_setzero_pd();
426 fjy0
= _mm_setzero_pd();
427 fjz0
= _mm_setzero_pd();
429 /**************************
430 * CALCULATE INTERACTIONS *
431 **************************/
433 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
436 r00
= _mm_mul_pd(rsq00
,rinv00
);
438 /* Compute parameters for interactions between i and j atoms */
439 qq00
= _mm_mul_pd(iq0
,jq0
);
440 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
442 /* REACTION-FIELD ELECTROSTATICS */
443 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
444 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
446 /* LENNARD-JONES DISPERSION/REPULSION */
448 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
449 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
450 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
451 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
452 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
454 d
= _mm_sub_pd(r00
,rswitch
);
455 d
= _mm_max_pd(d
,_mm_setzero_pd());
456 d2
= _mm_mul_pd(d
,d
);
457 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
459 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
461 /* Evaluate switch function */
462 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
463 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
464 vvdw
= _mm_mul_pd(vvdw
,sw
);
465 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
467 /* Update potential sum for this i atom from the interaction with this j atom. */
468 velec
= _mm_and_pd(velec
,cutoff_mask
);
469 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
470 velecsum
= _mm_add_pd(velecsum
,velec
);
471 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
472 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
473 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
475 fscal
= _mm_add_pd(felec
,fvdw
);
477 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
479 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
481 /* Calculate temporary vectorial force */
482 tx
= _mm_mul_pd(fscal
,dx00
);
483 ty
= _mm_mul_pd(fscal
,dy00
);
484 tz
= _mm_mul_pd(fscal
,dz00
);
486 /* Update vectorial force */
487 fix0
= _mm_add_pd(fix0
,tx
);
488 fiy0
= _mm_add_pd(fiy0
,ty
);
489 fiz0
= _mm_add_pd(fiz0
,tz
);
491 fjx0
= _mm_add_pd(fjx0
,tx
);
492 fjy0
= _mm_add_pd(fjy0
,ty
);
493 fjz0
= _mm_add_pd(fjz0
,tz
);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
504 /* Compute parameters for interactions between i and j atoms */
505 qq10
= _mm_mul_pd(iq1
,jq0
);
507 /* REACTION-FIELD ELECTROSTATICS */
508 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
509 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
511 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velec
= _mm_and_pd(velec
,cutoff_mask
);
515 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
516 velecsum
= _mm_add_pd(velecsum
,velec
);
520 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
522 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
524 /* Calculate temporary vectorial force */
525 tx
= _mm_mul_pd(fscal
,dx10
);
526 ty
= _mm_mul_pd(fscal
,dy10
);
527 tz
= _mm_mul_pd(fscal
,dz10
);
529 /* Update vectorial force */
530 fix1
= _mm_add_pd(fix1
,tx
);
531 fiy1
= _mm_add_pd(fiy1
,ty
);
532 fiz1
= _mm_add_pd(fiz1
,tz
);
534 fjx0
= _mm_add_pd(fjx0
,tx
);
535 fjy0
= _mm_add_pd(fjy0
,ty
);
536 fjz0
= _mm_add_pd(fjz0
,tz
);
540 /**************************
541 * CALCULATE INTERACTIONS *
542 **************************/
544 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
547 /* Compute parameters for interactions between i and j atoms */
548 qq20
= _mm_mul_pd(iq2
,jq0
);
550 /* REACTION-FIELD ELECTROSTATICS */
551 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
552 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
554 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
556 /* Update potential sum for this i atom from the interaction with this j atom. */
557 velec
= _mm_and_pd(velec
,cutoff_mask
);
558 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
559 velecsum
= _mm_add_pd(velecsum
,velec
);
563 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
565 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
567 /* Calculate temporary vectorial force */
568 tx
= _mm_mul_pd(fscal
,dx20
);
569 ty
= _mm_mul_pd(fscal
,dy20
);
570 tz
= _mm_mul_pd(fscal
,dz20
);
572 /* Update vectorial force */
573 fix2
= _mm_add_pd(fix2
,tx
);
574 fiy2
= _mm_add_pd(fiy2
,ty
);
575 fiz2
= _mm_add_pd(fiz2
,tz
);
577 fjx0
= _mm_add_pd(fjx0
,tx
);
578 fjy0
= _mm_add_pd(fjy0
,ty
);
579 fjz0
= _mm_add_pd(fjz0
,tz
);
583 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
585 /* Inner loop uses 145 flops */
588 /* End of innermost loop */
590 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
591 f
+i_coord_offset
,fshift
+i_shift_offset
);
594 /* Update potential energies */
595 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
596 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
598 /* Increment number of inner iterations */
599 inneriter
+= j_index_end
- j_index_start
;
601 /* Outer loop uses 20 flops */
604 /* Increment number of outer iterations */
607 /* Update outer/inner flops */
609 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*145);
612 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double
613 * Electrostatics interaction: ReactionField
614 * VdW interaction: LennardJones
615 * Geometry: Water3-Particle
616 * Calculate force/pot: Force
619 nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_F_sse2_double
620 (t_nblist
* gmx_restrict nlist
,
621 rvec
* gmx_restrict xx
,
622 rvec
* gmx_restrict ff
,
623 t_forcerec
* gmx_restrict fr
,
624 t_mdatoms
* gmx_restrict mdatoms
,
625 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
626 t_nrnb
* gmx_restrict nrnb
)
628 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
629 * just 0 for non-waters.
630 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
631 * jnr indices corresponding to data put in the four positions in the SIMD register.
633 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
634 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
636 int j_coord_offsetA
,j_coord_offsetB
;
637 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
639 real
*shiftvec
,*fshift
,*x
,*f
;
640 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
642 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
644 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
646 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
647 int vdwjidx0A
,vdwjidx0B
;
648 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
649 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
650 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
651 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
652 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
655 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
658 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
659 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
660 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
661 real rswitch_scalar
,d_scalar
;
662 __m128d dummy_mask
,cutoff_mask
;
663 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
664 __m128d one
= _mm_set1_pd(1.0);
665 __m128d two
= _mm_set1_pd(2.0);
671 jindex
= nlist
->jindex
;
673 shiftidx
= nlist
->shift
;
675 shiftvec
= fr
->shift_vec
[0];
676 fshift
= fr
->fshift
[0];
677 facel
= _mm_set1_pd(fr
->epsfac
);
678 charge
= mdatoms
->chargeA
;
679 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
680 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
681 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
682 nvdwtype
= fr
->ntype
;
684 vdwtype
= mdatoms
->typeA
;
686 /* Setup water-specific parameters */
687 inr
= nlist
->iinr
[0];
688 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
689 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
690 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
691 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
693 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
694 rcutoff_scalar
= fr
->rcoulomb
;
695 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
696 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
698 rswitch_scalar
= fr
->rvdw_switch
;
699 rswitch
= _mm_set1_pd(rswitch_scalar
);
700 /* Setup switch parameters */
701 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
702 d
= _mm_set1_pd(d_scalar
);
703 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
704 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
705 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
706 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
707 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
708 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
710 /* Avoid stupid compiler warnings */
718 /* Start outer loop over neighborlists */
719 for(iidx
=0; iidx
<nri
; iidx
++)
721 /* Load shift vector for this list */
722 i_shift_offset
= DIM
*shiftidx
[iidx
];
724 /* Load limits for loop over neighbors */
725 j_index_start
= jindex
[iidx
];
726 j_index_end
= jindex
[iidx
+1];
728 /* Get outer coordinate index */
730 i_coord_offset
= DIM
*inr
;
732 /* Load i particle coords and add shift vector */
733 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
734 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
736 fix0
= _mm_setzero_pd();
737 fiy0
= _mm_setzero_pd();
738 fiz0
= _mm_setzero_pd();
739 fix1
= _mm_setzero_pd();
740 fiy1
= _mm_setzero_pd();
741 fiz1
= _mm_setzero_pd();
742 fix2
= _mm_setzero_pd();
743 fiy2
= _mm_setzero_pd();
744 fiz2
= _mm_setzero_pd();
746 /* Start inner kernel loop */
747 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
750 /* Get j neighbor index, and coordinate index */
753 j_coord_offsetA
= DIM
*jnrA
;
754 j_coord_offsetB
= DIM
*jnrB
;
756 /* load j atom coordinates */
757 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
760 /* Calculate displacement vector */
761 dx00
= _mm_sub_pd(ix0
,jx0
);
762 dy00
= _mm_sub_pd(iy0
,jy0
);
763 dz00
= _mm_sub_pd(iz0
,jz0
);
764 dx10
= _mm_sub_pd(ix1
,jx0
);
765 dy10
= _mm_sub_pd(iy1
,jy0
);
766 dz10
= _mm_sub_pd(iz1
,jz0
);
767 dx20
= _mm_sub_pd(ix2
,jx0
);
768 dy20
= _mm_sub_pd(iy2
,jy0
);
769 dz20
= _mm_sub_pd(iz2
,jz0
);
771 /* Calculate squared distance and things based on it */
772 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
773 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
774 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
776 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
777 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
778 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
780 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
781 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
782 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
784 /* Load parameters for j particles */
785 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
786 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
787 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
789 fjx0
= _mm_setzero_pd();
790 fjy0
= _mm_setzero_pd();
791 fjz0
= _mm_setzero_pd();
793 /**************************
794 * CALCULATE INTERACTIONS *
795 **************************/
797 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
800 r00
= _mm_mul_pd(rsq00
,rinv00
);
802 /* Compute parameters for interactions between i and j atoms */
803 qq00
= _mm_mul_pd(iq0
,jq0
);
804 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
805 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
807 /* REACTION-FIELD ELECTROSTATICS */
808 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
810 /* LENNARD-JONES DISPERSION/REPULSION */
812 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
813 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
814 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
815 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
816 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
818 d
= _mm_sub_pd(r00
,rswitch
);
819 d
= _mm_max_pd(d
,_mm_setzero_pd());
820 d2
= _mm_mul_pd(d
,d
);
821 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
823 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
825 /* Evaluate switch function */
826 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
827 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
828 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
830 fscal
= _mm_add_pd(felec
,fvdw
);
832 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
834 /* Calculate temporary vectorial force */
835 tx
= _mm_mul_pd(fscal
,dx00
);
836 ty
= _mm_mul_pd(fscal
,dy00
);
837 tz
= _mm_mul_pd(fscal
,dz00
);
839 /* Update vectorial force */
840 fix0
= _mm_add_pd(fix0
,tx
);
841 fiy0
= _mm_add_pd(fiy0
,ty
);
842 fiz0
= _mm_add_pd(fiz0
,tz
);
844 fjx0
= _mm_add_pd(fjx0
,tx
);
845 fjy0
= _mm_add_pd(fjy0
,ty
);
846 fjz0
= _mm_add_pd(fjz0
,tz
);
850 /**************************
851 * CALCULATE INTERACTIONS *
852 **************************/
854 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
857 /* Compute parameters for interactions between i and j atoms */
858 qq10
= _mm_mul_pd(iq1
,jq0
);
860 /* REACTION-FIELD ELECTROSTATICS */
861 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
863 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
867 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
869 /* Calculate temporary vectorial force */
870 tx
= _mm_mul_pd(fscal
,dx10
);
871 ty
= _mm_mul_pd(fscal
,dy10
);
872 tz
= _mm_mul_pd(fscal
,dz10
);
874 /* Update vectorial force */
875 fix1
= _mm_add_pd(fix1
,tx
);
876 fiy1
= _mm_add_pd(fiy1
,ty
);
877 fiz1
= _mm_add_pd(fiz1
,tz
);
879 fjx0
= _mm_add_pd(fjx0
,tx
);
880 fjy0
= _mm_add_pd(fjy0
,ty
);
881 fjz0
= _mm_add_pd(fjz0
,tz
);
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
889 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
892 /* Compute parameters for interactions between i and j atoms */
893 qq20
= _mm_mul_pd(iq2
,jq0
);
895 /* REACTION-FIELD ELECTROSTATICS */
896 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
898 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
902 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
904 /* Calculate temporary vectorial force */
905 tx
= _mm_mul_pd(fscal
,dx20
);
906 ty
= _mm_mul_pd(fscal
,dy20
);
907 tz
= _mm_mul_pd(fscal
,dz20
);
909 /* Update vectorial force */
910 fix2
= _mm_add_pd(fix2
,tx
);
911 fiy2
= _mm_add_pd(fiy2
,ty
);
912 fiz2
= _mm_add_pd(fiz2
,tz
);
914 fjx0
= _mm_add_pd(fjx0
,tx
);
915 fjy0
= _mm_add_pd(fjy0
,ty
);
916 fjz0
= _mm_add_pd(fjz0
,tz
);
920 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
922 /* Inner loop uses 124 flops */
929 j_coord_offsetA
= DIM
*jnrA
;
931 /* load j atom coordinates */
932 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
935 /* Calculate displacement vector */
936 dx00
= _mm_sub_pd(ix0
,jx0
);
937 dy00
= _mm_sub_pd(iy0
,jy0
);
938 dz00
= _mm_sub_pd(iz0
,jz0
);
939 dx10
= _mm_sub_pd(ix1
,jx0
);
940 dy10
= _mm_sub_pd(iy1
,jy0
);
941 dz10
= _mm_sub_pd(iz1
,jz0
);
942 dx20
= _mm_sub_pd(ix2
,jx0
);
943 dy20
= _mm_sub_pd(iy2
,jy0
);
944 dz20
= _mm_sub_pd(iz2
,jz0
);
946 /* Calculate squared distance and things based on it */
947 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
948 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
949 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
951 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
952 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
953 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
955 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
956 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
957 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
959 /* Load parameters for j particles */
960 jq0
= _mm_load_sd(charge
+jnrA
+0);
961 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
963 fjx0
= _mm_setzero_pd();
964 fjy0
= _mm_setzero_pd();
965 fjz0
= _mm_setzero_pd();
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
974 r00
= _mm_mul_pd(rsq00
,rinv00
);
976 /* Compute parameters for interactions between i and j atoms */
977 qq00
= _mm_mul_pd(iq0
,jq0
);
978 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
980 /* REACTION-FIELD ELECTROSTATICS */
981 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
983 /* LENNARD-JONES DISPERSION/REPULSION */
985 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
986 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
987 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
988 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
989 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
991 d
= _mm_sub_pd(r00
,rswitch
);
992 d
= _mm_max_pd(d
,_mm_setzero_pd());
993 d2
= _mm_mul_pd(d
,d
);
994 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
996 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
998 /* Evaluate switch function */
999 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1000 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1001 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1003 fscal
= _mm_add_pd(felec
,fvdw
);
1005 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1007 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1009 /* Calculate temporary vectorial force */
1010 tx
= _mm_mul_pd(fscal
,dx00
);
1011 ty
= _mm_mul_pd(fscal
,dy00
);
1012 tz
= _mm_mul_pd(fscal
,dz00
);
1014 /* Update vectorial force */
1015 fix0
= _mm_add_pd(fix0
,tx
);
1016 fiy0
= _mm_add_pd(fiy0
,ty
);
1017 fiz0
= _mm_add_pd(fiz0
,tz
);
1019 fjx0
= _mm_add_pd(fjx0
,tx
);
1020 fjy0
= _mm_add_pd(fjy0
,ty
);
1021 fjz0
= _mm_add_pd(fjz0
,tz
);
1025 /**************************
1026 * CALCULATE INTERACTIONS *
1027 **************************/
1029 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1032 /* Compute parameters for interactions between i and j atoms */
1033 qq10
= _mm_mul_pd(iq1
,jq0
);
1035 /* REACTION-FIELD ELECTROSTATICS */
1036 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
1038 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1042 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1044 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1046 /* Calculate temporary vectorial force */
1047 tx
= _mm_mul_pd(fscal
,dx10
);
1048 ty
= _mm_mul_pd(fscal
,dy10
);
1049 tz
= _mm_mul_pd(fscal
,dz10
);
1051 /* Update vectorial force */
1052 fix1
= _mm_add_pd(fix1
,tx
);
1053 fiy1
= _mm_add_pd(fiy1
,ty
);
1054 fiz1
= _mm_add_pd(fiz1
,tz
);
1056 fjx0
= _mm_add_pd(fjx0
,tx
);
1057 fjy0
= _mm_add_pd(fjy0
,ty
);
1058 fjz0
= _mm_add_pd(fjz0
,tz
);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1069 /* Compute parameters for interactions between i and j atoms */
1070 qq20
= _mm_mul_pd(iq2
,jq0
);
1072 /* REACTION-FIELD ELECTROSTATICS */
1073 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
1075 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1079 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1081 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1083 /* Calculate temporary vectorial force */
1084 tx
= _mm_mul_pd(fscal
,dx20
);
1085 ty
= _mm_mul_pd(fscal
,dy20
);
1086 tz
= _mm_mul_pd(fscal
,dz20
);
1088 /* Update vectorial force */
1089 fix2
= _mm_add_pd(fix2
,tx
);
1090 fiy2
= _mm_add_pd(fiy2
,ty
);
1091 fiz2
= _mm_add_pd(fiz2
,tz
);
1093 fjx0
= _mm_add_pd(fjx0
,tx
);
1094 fjy0
= _mm_add_pd(fjy0
,ty
);
1095 fjz0
= _mm_add_pd(fjz0
,tz
);
1099 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1101 /* Inner loop uses 124 flops */
1104 /* End of innermost loop */
1106 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1107 f
+i_coord_offset
,fshift
+i_shift_offset
);
1109 /* Increment number of inner iterations */
1110 inneriter
+= j_index_end
- j_index_start
;
1112 /* Outer loop uses 18 flops */
1115 /* Increment number of outer iterations */
1118 /* Update outer/inner flops */
1120 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*124);