2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: ReactionField
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
115 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
116 real rswitch_scalar
,d_scalar
;
117 __m128d dummy_mask
,cutoff_mask
;
118 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119 __m128d one
= _mm_set1_pd(1.0);
120 __m128d two
= _mm_set1_pd(2.0);
126 jindex
= nlist
->jindex
;
128 shiftidx
= nlist
->shift
;
130 shiftvec
= fr
->shift_vec
[0];
131 fshift
= fr
->fshift
[0];
132 facel
= _mm_set1_pd(fr
->epsfac
);
133 charge
= mdatoms
->chargeA
;
134 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
135 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
136 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
137 nvdwtype
= fr
->ntype
;
139 vdwtype
= mdatoms
->typeA
;
141 /* Setup water-specific parameters */
142 inr
= nlist
->iinr
[0];
143 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
144 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
145 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
146 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
148 jq1
= _mm_set1_pd(charge
[inr
+1]);
149 jq2
= _mm_set1_pd(charge
[inr
+2]);
150 jq3
= _mm_set1_pd(charge
[inr
+3]);
151 vdwjidx0A
= 2*vdwtype
[inr
+0];
152 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
153 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
154 qq11
= _mm_mul_pd(iq1
,jq1
);
155 qq12
= _mm_mul_pd(iq1
,jq2
);
156 qq13
= _mm_mul_pd(iq1
,jq3
);
157 qq21
= _mm_mul_pd(iq2
,jq1
);
158 qq22
= _mm_mul_pd(iq2
,jq2
);
159 qq23
= _mm_mul_pd(iq2
,jq3
);
160 qq31
= _mm_mul_pd(iq3
,jq1
);
161 qq32
= _mm_mul_pd(iq3
,jq2
);
162 qq33
= _mm_mul_pd(iq3
,jq3
);
164 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
165 rcutoff_scalar
= fr
->rcoulomb
;
166 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
167 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
169 rswitch_scalar
= fr
->rvdw_switch
;
170 rswitch
= _mm_set1_pd(rswitch_scalar
);
171 /* Setup switch parameters */
172 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
173 d
= _mm_set1_pd(d_scalar
);
174 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
175 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
176 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
177 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
178 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
179 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
181 /* Avoid stupid compiler warnings */
189 /* Start outer loop over neighborlists */
190 for(iidx
=0; iidx
<nri
; iidx
++)
192 /* Load shift vector for this list */
193 i_shift_offset
= DIM
*shiftidx
[iidx
];
195 /* Load limits for loop over neighbors */
196 j_index_start
= jindex
[iidx
];
197 j_index_end
= jindex
[iidx
+1];
199 /* Get outer coordinate index */
201 i_coord_offset
= DIM
*inr
;
203 /* Load i particle coords and add shift vector */
204 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
205 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
207 fix0
= _mm_setzero_pd();
208 fiy0
= _mm_setzero_pd();
209 fiz0
= _mm_setzero_pd();
210 fix1
= _mm_setzero_pd();
211 fiy1
= _mm_setzero_pd();
212 fiz1
= _mm_setzero_pd();
213 fix2
= _mm_setzero_pd();
214 fiy2
= _mm_setzero_pd();
215 fiz2
= _mm_setzero_pd();
216 fix3
= _mm_setzero_pd();
217 fiy3
= _mm_setzero_pd();
218 fiz3
= _mm_setzero_pd();
220 /* Reset potential sums */
221 velecsum
= _mm_setzero_pd();
222 vvdwsum
= _mm_setzero_pd();
224 /* Start inner kernel loop */
225 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
228 /* Get j neighbor index, and coordinate index */
231 j_coord_offsetA
= DIM
*jnrA
;
232 j_coord_offsetB
= DIM
*jnrB
;
234 /* load j atom coordinates */
235 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
236 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
237 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
239 /* Calculate displacement vector */
240 dx00
= _mm_sub_pd(ix0
,jx0
);
241 dy00
= _mm_sub_pd(iy0
,jy0
);
242 dz00
= _mm_sub_pd(iz0
,jz0
);
243 dx11
= _mm_sub_pd(ix1
,jx1
);
244 dy11
= _mm_sub_pd(iy1
,jy1
);
245 dz11
= _mm_sub_pd(iz1
,jz1
);
246 dx12
= _mm_sub_pd(ix1
,jx2
);
247 dy12
= _mm_sub_pd(iy1
,jy2
);
248 dz12
= _mm_sub_pd(iz1
,jz2
);
249 dx13
= _mm_sub_pd(ix1
,jx3
);
250 dy13
= _mm_sub_pd(iy1
,jy3
);
251 dz13
= _mm_sub_pd(iz1
,jz3
);
252 dx21
= _mm_sub_pd(ix2
,jx1
);
253 dy21
= _mm_sub_pd(iy2
,jy1
);
254 dz21
= _mm_sub_pd(iz2
,jz1
);
255 dx22
= _mm_sub_pd(ix2
,jx2
);
256 dy22
= _mm_sub_pd(iy2
,jy2
);
257 dz22
= _mm_sub_pd(iz2
,jz2
);
258 dx23
= _mm_sub_pd(ix2
,jx3
);
259 dy23
= _mm_sub_pd(iy2
,jy3
);
260 dz23
= _mm_sub_pd(iz2
,jz3
);
261 dx31
= _mm_sub_pd(ix3
,jx1
);
262 dy31
= _mm_sub_pd(iy3
,jy1
);
263 dz31
= _mm_sub_pd(iz3
,jz1
);
264 dx32
= _mm_sub_pd(ix3
,jx2
);
265 dy32
= _mm_sub_pd(iy3
,jy2
);
266 dz32
= _mm_sub_pd(iz3
,jz2
);
267 dx33
= _mm_sub_pd(ix3
,jx3
);
268 dy33
= _mm_sub_pd(iy3
,jy3
);
269 dz33
= _mm_sub_pd(iz3
,jz3
);
271 /* Calculate squared distance and things based on it */
272 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
273 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
274 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
275 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
276 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
277 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
278 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
279 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
280 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
281 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
283 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
284 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
285 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
286 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
287 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
288 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
289 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
290 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
291 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
292 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
294 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
295 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
296 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
297 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
298 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
299 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
300 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
301 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
302 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
303 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
305 fjx0
= _mm_setzero_pd();
306 fjy0
= _mm_setzero_pd();
307 fjz0
= _mm_setzero_pd();
308 fjx1
= _mm_setzero_pd();
309 fjy1
= _mm_setzero_pd();
310 fjz1
= _mm_setzero_pd();
311 fjx2
= _mm_setzero_pd();
312 fjy2
= _mm_setzero_pd();
313 fjz2
= _mm_setzero_pd();
314 fjx3
= _mm_setzero_pd();
315 fjy3
= _mm_setzero_pd();
316 fjz3
= _mm_setzero_pd();
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
325 r00
= _mm_mul_pd(rsq00
,rinv00
);
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
330 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
331 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
332 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
333 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
335 d
= _mm_sub_pd(r00
,rswitch
);
336 d
= _mm_max_pd(d
,_mm_setzero_pd());
337 d2
= _mm_mul_pd(d
,d
);
338 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
340 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
342 /* Evaluate switch function */
343 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
344 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
345 vvdw
= _mm_mul_pd(vvdw
,sw
);
346 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
350 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
354 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
356 /* Calculate temporary vectorial force */
357 tx
= _mm_mul_pd(fscal
,dx00
);
358 ty
= _mm_mul_pd(fscal
,dy00
);
359 tz
= _mm_mul_pd(fscal
,dz00
);
361 /* Update vectorial force */
362 fix0
= _mm_add_pd(fix0
,tx
);
363 fiy0
= _mm_add_pd(fiy0
,ty
);
364 fiz0
= _mm_add_pd(fiz0
,tz
);
366 fjx0
= _mm_add_pd(fjx0
,tx
);
367 fjy0
= _mm_add_pd(fjy0
,ty
);
368 fjz0
= _mm_add_pd(fjz0
,tz
);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
379 /* REACTION-FIELD ELECTROSTATICS */
380 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
381 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
383 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec
= _mm_and_pd(velec
,cutoff_mask
);
387 velecsum
= _mm_add_pd(velecsum
,velec
);
391 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
393 /* Calculate temporary vectorial force */
394 tx
= _mm_mul_pd(fscal
,dx11
);
395 ty
= _mm_mul_pd(fscal
,dy11
);
396 tz
= _mm_mul_pd(fscal
,dz11
);
398 /* Update vectorial force */
399 fix1
= _mm_add_pd(fix1
,tx
);
400 fiy1
= _mm_add_pd(fiy1
,ty
);
401 fiz1
= _mm_add_pd(fiz1
,tz
);
403 fjx1
= _mm_add_pd(fjx1
,tx
);
404 fjy1
= _mm_add_pd(fjy1
,ty
);
405 fjz1
= _mm_add_pd(fjz1
,tz
);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
416 /* REACTION-FIELD ELECTROSTATICS */
417 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
418 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
420 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velec
= _mm_and_pd(velec
,cutoff_mask
);
424 velecsum
= _mm_add_pd(velecsum
,velec
);
428 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
430 /* Calculate temporary vectorial force */
431 tx
= _mm_mul_pd(fscal
,dx12
);
432 ty
= _mm_mul_pd(fscal
,dy12
);
433 tz
= _mm_mul_pd(fscal
,dz12
);
435 /* Update vectorial force */
436 fix1
= _mm_add_pd(fix1
,tx
);
437 fiy1
= _mm_add_pd(fiy1
,ty
);
438 fiz1
= _mm_add_pd(fiz1
,tz
);
440 fjx2
= _mm_add_pd(fjx2
,tx
);
441 fjy2
= _mm_add_pd(fjy2
,ty
);
442 fjz2
= _mm_add_pd(fjz2
,tz
);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
453 /* REACTION-FIELD ELECTROSTATICS */
454 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_add_pd(rinv13
,_mm_mul_pd(krf
,rsq13
)),crf
));
455 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
457 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velec
= _mm_and_pd(velec
,cutoff_mask
);
461 velecsum
= _mm_add_pd(velecsum
,velec
);
465 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
467 /* Calculate temporary vectorial force */
468 tx
= _mm_mul_pd(fscal
,dx13
);
469 ty
= _mm_mul_pd(fscal
,dy13
);
470 tz
= _mm_mul_pd(fscal
,dz13
);
472 /* Update vectorial force */
473 fix1
= _mm_add_pd(fix1
,tx
);
474 fiy1
= _mm_add_pd(fiy1
,ty
);
475 fiz1
= _mm_add_pd(fiz1
,tz
);
477 fjx3
= _mm_add_pd(fjx3
,tx
);
478 fjy3
= _mm_add_pd(fjy3
,ty
);
479 fjz3
= _mm_add_pd(fjz3
,tz
);
483 /**************************
484 * CALCULATE INTERACTIONS *
485 **************************/
487 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
490 /* REACTION-FIELD ELECTROSTATICS */
491 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
492 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
494 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec
= _mm_and_pd(velec
,cutoff_mask
);
498 velecsum
= _mm_add_pd(velecsum
,velec
);
502 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
504 /* Calculate temporary vectorial force */
505 tx
= _mm_mul_pd(fscal
,dx21
);
506 ty
= _mm_mul_pd(fscal
,dy21
);
507 tz
= _mm_mul_pd(fscal
,dz21
);
509 /* Update vectorial force */
510 fix2
= _mm_add_pd(fix2
,tx
);
511 fiy2
= _mm_add_pd(fiy2
,ty
);
512 fiz2
= _mm_add_pd(fiz2
,tz
);
514 fjx1
= _mm_add_pd(fjx1
,tx
);
515 fjy1
= _mm_add_pd(fjy1
,ty
);
516 fjz1
= _mm_add_pd(fjz1
,tz
);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
527 /* REACTION-FIELD ELECTROSTATICS */
528 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
529 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
531 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
533 /* Update potential sum for this i atom from the interaction with this j atom. */
534 velec
= _mm_and_pd(velec
,cutoff_mask
);
535 velecsum
= _mm_add_pd(velecsum
,velec
);
539 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
541 /* Calculate temporary vectorial force */
542 tx
= _mm_mul_pd(fscal
,dx22
);
543 ty
= _mm_mul_pd(fscal
,dy22
);
544 tz
= _mm_mul_pd(fscal
,dz22
);
546 /* Update vectorial force */
547 fix2
= _mm_add_pd(fix2
,tx
);
548 fiy2
= _mm_add_pd(fiy2
,ty
);
549 fiz2
= _mm_add_pd(fiz2
,tz
);
551 fjx2
= _mm_add_pd(fjx2
,tx
);
552 fjy2
= _mm_add_pd(fjy2
,ty
);
553 fjz2
= _mm_add_pd(fjz2
,tz
);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
564 /* REACTION-FIELD ELECTROSTATICS */
565 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_add_pd(rinv23
,_mm_mul_pd(krf
,rsq23
)),crf
));
566 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
568 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
570 /* Update potential sum for this i atom from the interaction with this j atom. */
571 velec
= _mm_and_pd(velec
,cutoff_mask
);
572 velecsum
= _mm_add_pd(velecsum
,velec
);
576 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
578 /* Calculate temporary vectorial force */
579 tx
= _mm_mul_pd(fscal
,dx23
);
580 ty
= _mm_mul_pd(fscal
,dy23
);
581 tz
= _mm_mul_pd(fscal
,dz23
);
583 /* Update vectorial force */
584 fix2
= _mm_add_pd(fix2
,tx
);
585 fiy2
= _mm_add_pd(fiy2
,ty
);
586 fiz2
= _mm_add_pd(fiz2
,tz
);
588 fjx3
= _mm_add_pd(fjx3
,tx
);
589 fjy3
= _mm_add_pd(fjy3
,ty
);
590 fjz3
= _mm_add_pd(fjz3
,tz
);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
601 /* REACTION-FIELD ELECTROSTATICS */
602 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_add_pd(rinv31
,_mm_mul_pd(krf
,rsq31
)),crf
));
603 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
605 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
607 /* Update potential sum for this i atom from the interaction with this j atom. */
608 velec
= _mm_and_pd(velec
,cutoff_mask
);
609 velecsum
= _mm_add_pd(velecsum
,velec
);
613 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
615 /* Calculate temporary vectorial force */
616 tx
= _mm_mul_pd(fscal
,dx31
);
617 ty
= _mm_mul_pd(fscal
,dy31
);
618 tz
= _mm_mul_pd(fscal
,dz31
);
620 /* Update vectorial force */
621 fix3
= _mm_add_pd(fix3
,tx
);
622 fiy3
= _mm_add_pd(fiy3
,ty
);
623 fiz3
= _mm_add_pd(fiz3
,tz
);
625 fjx1
= _mm_add_pd(fjx1
,tx
);
626 fjy1
= _mm_add_pd(fjy1
,ty
);
627 fjz1
= _mm_add_pd(fjz1
,tz
);
631 /**************************
632 * CALCULATE INTERACTIONS *
633 **************************/
635 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
638 /* REACTION-FIELD ELECTROSTATICS */
639 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_add_pd(rinv32
,_mm_mul_pd(krf
,rsq32
)),crf
));
640 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
642 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velec
= _mm_and_pd(velec
,cutoff_mask
);
646 velecsum
= _mm_add_pd(velecsum
,velec
);
650 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
652 /* Calculate temporary vectorial force */
653 tx
= _mm_mul_pd(fscal
,dx32
);
654 ty
= _mm_mul_pd(fscal
,dy32
);
655 tz
= _mm_mul_pd(fscal
,dz32
);
657 /* Update vectorial force */
658 fix3
= _mm_add_pd(fix3
,tx
);
659 fiy3
= _mm_add_pd(fiy3
,ty
);
660 fiz3
= _mm_add_pd(fiz3
,tz
);
662 fjx2
= _mm_add_pd(fjx2
,tx
);
663 fjy2
= _mm_add_pd(fjy2
,ty
);
664 fjz2
= _mm_add_pd(fjz2
,tz
);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
675 /* REACTION-FIELD ELECTROSTATICS */
676 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_add_pd(rinv33
,_mm_mul_pd(krf
,rsq33
)),crf
));
677 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
679 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velec
= _mm_and_pd(velec
,cutoff_mask
);
683 velecsum
= _mm_add_pd(velecsum
,velec
);
687 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
689 /* Calculate temporary vectorial force */
690 tx
= _mm_mul_pd(fscal
,dx33
);
691 ty
= _mm_mul_pd(fscal
,dy33
);
692 tz
= _mm_mul_pd(fscal
,dz33
);
694 /* Update vectorial force */
695 fix3
= _mm_add_pd(fix3
,tx
);
696 fiy3
= _mm_add_pd(fiy3
,ty
);
697 fiz3
= _mm_add_pd(fiz3
,tz
);
699 fjx3
= _mm_add_pd(fjx3
,tx
);
700 fjy3
= _mm_add_pd(fjy3
,ty
);
701 fjz3
= _mm_add_pd(fjz3
,tz
);
705 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
707 /* Inner loop uses 386 flops */
714 j_coord_offsetA
= DIM
*jnrA
;
716 /* load j atom coordinates */
717 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
718 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
719 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
721 /* Calculate displacement vector */
722 dx00
= _mm_sub_pd(ix0
,jx0
);
723 dy00
= _mm_sub_pd(iy0
,jy0
);
724 dz00
= _mm_sub_pd(iz0
,jz0
);
725 dx11
= _mm_sub_pd(ix1
,jx1
);
726 dy11
= _mm_sub_pd(iy1
,jy1
);
727 dz11
= _mm_sub_pd(iz1
,jz1
);
728 dx12
= _mm_sub_pd(ix1
,jx2
);
729 dy12
= _mm_sub_pd(iy1
,jy2
);
730 dz12
= _mm_sub_pd(iz1
,jz2
);
731 dx13
= _mm_sub_pd(ix1
,jx3
);
732 dy13
= _mm_sub_pd(iy1
,jy3
);
733 dz13
= _mm_sub_pd(iz1
,jz3
);
734 dx21
= _mm_sub_pd(ix2
,jx1
);
735 dy21
= _mm_sub_pd(iy2
,jy1
);
736 dz21
= _mm_sub_pd(iz2
,jz1
);
737 dx22
= _mm_sub_pd(ix2
,jx2
);
738 dy22
= _mm_sub_pd(iy2
,jy2
);
739 dz22
= _mm_sub_pd(iz2
,jz2
);
740 dx23
= _mm_sub_pd(ix2
,jx3
);
741 dy23
= _mm_sub_pd(iy2
,jy3
);
742 dz23
= _mm_sub_pd(iz2
,jz3
);
743 dx31
= _mm_sub_pd(ix3
,jx1
);
744 dy31
= _mm_sub_pd(iy3
,jy1
);
745 dz31
= _mm_sub_pd(iz3
,jz1
);
746 dx32
= _mm_sub_pd(ix3
,jx2
);
747 dy32
= _mm_sub_pd(iy3
,jy2
);
748 dz32
= _mm_sub_pd(iz3
,jz2
);
749 dx33
= _mm_sub_pd(ix3
,jx3
);
750 dy33
= _mm_sub_pd(iy3
,jy3
);
751 dz33
= _mm_sub_pd(iz3
,jz3
);
753 /* Calculate squared distance and things based on it */
754 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
755 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
756 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
757 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
758 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
759 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
760 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
761 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
762 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
763 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
765 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
766 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
767 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
768 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
769 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
770 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
771 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
772 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
773 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
774 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
776 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
777 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
778 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
779 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
780 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
781 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
782 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
783 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
784 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
785 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
787 fjx0
= _mm_setzero_pd();
788 fjy0
= _mm_setzero_pd();
789 fjz0
= _mm_setzero_pd();
790 fjx1
= _mm_setzero_pd();
791 fjy1
= _mm_setzero_pd();
792 fjz1
= _mm_setzero_pd();
793 fjx2
= _mm_setzero_pd();
794 fjy2
= _mm_setzero_pd();
795 fjz2
= _mm_setzero_pd();
796 fjx3
= _mm_setzero_pd();
797 fjy3
= _mm_setzero_pd();
798 fjz3
= _mm_setzero_pd();
800 /**************************
801 * CALCULATE INTERACTIONS *
802 **************************/
804 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
807 r00
= _mm_mul_pd(rsq00
,rinv00
);
809 /* LENNARD-JONES DISPERSION/REPULSION */
811 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
812 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
813 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
814 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
815 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
817 d
= _mm_sub_pd(r00
,rswitch
);
818 d
= _mm_max_pd(d
,_mm_setzero_pd());
819 d2
= _mm_mul_pd(d
,d
);
820 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
822 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
824 /* Evaluate switch function */
825 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
826 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
827 vvdw
= _mm_mul_pd(vvdw
,sw
);
828 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
832 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
833 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
837 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
839 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
841 /* Calculate temporary vectorial force */
842 tx
= _mm_mul_pd(fscal
,dx00
);
843 ty
= _mm_mul_pd(fscal
,dy00
);
844 tz
= _mm_mul_pd(fscal
,dz00
);
846 /* Update vectorial force */
847 fix0
= _mm_add_pd(fix0
,tx
);
848 fiy0
= _mm_add_pd(fiy0
,ty
);
849 fiz0
= _mm_add_pd(fiz0
,tz
);
851 fjx0
= _mm_add_pd(fjx0
,tx
);
852 fjy0
= _mm_add_pd(fjy0
,ty
);
853 fjz0
= _mm_add_pd(fjz0
,tz
);
857 /**************************
858 * CALCULATE INTERACTIONS *
859 **************************/
861 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
864 /* REACTION-FIELD ELECTROSTATICS */
865 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
866 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
868 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec
= _mm_and_pd(velec
,cutoff_mask
);
872 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
873 velecsum
= _mm_add_pd(velecsum
,velec
);
877 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
879 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
881 /* Calculate temporary vectorial force */
882 tx
= _mm_mul_pd(fscal
,dx11
);
883 ty
= _mm_mul_pd(fscal
,dy11
);
884 tz
= _mm_mul_pd(fscal
,dz11
);
886 /* Update vectorial force */
887 fix1
= _mm_add_pd(fix1
,tx
);
888 fiy1
= _mm_add_pd(fiy1
,ty
);
889 fiz1
= _mm_add_pd(fiz1
,tz
);
891 fjx1
= _mm_add_pd(fjx1
,tx
);
892 fjy1
= _mm_add_pd(fjy1
,ty
);
893 fjz1
= _mm_add_pd(fjz1
,tz
);
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
904 /* REACTION-FIELD ELECTROSTATICS */
905 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
906 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
908 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 velec
= _mm_and_pd(velec
,cutoff_mask
);
912 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
913 velecsum
= _mm_add_pd(velecsum
,velec
);
917 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
919 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
921 /* Calculate temporary vectorial force */
922 tx
= _mm_mul_pd(fscal
,dx12
);
923 ty
= _mm_mul_pd(fscal
,dy12
);
924 tz
= _mm_mul_pd(fscal
,dz12
);
926 /* Update vectorial force */
927 fix1
= _mm_add_pd(fix1
,tx
);
928 fiy1
= _mm_add_pd(fiy1
,ty
);
929 fiz1
= _mm_add_pd(fiz1
,tz
);
931 fjx2
= _mm_add_pd(fjx2
,tx
);
932 fjy2
= _mm_add_pd(fjy2
,ty
);
933 fjz2
= _mm_add_pd(fjz2
,tz
);
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
944 /* REACTION-FIELD ELECTROSTATICS */
945 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_add_pd(rinv13
,_mm_mul_pd(krf
,rsq13
)),crf
));
946 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
948 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
950 /* Update potential sum for this i atom from the interaction with this j atom. */
951 velec
= _mm_and_pd(velec
,cutoff_mask
);
952 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
953 velecsum
= _mm_add_pd(velecsum
,velec
);
957 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
959 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
961 /* Calculate temporary vectorial force */
962 tx
= _mm_mul_pd(fscal
,dx13
);
963 ty
= _mm_mul_pd(fscal
,dy13
);
964 tz
= _mm_mul_pd(fscal
,dz13
);
966 /* Update vectorial force */
967 fix1
= _mm_add_pd(fix1
,tx
);
968 fiy1
= _mm_add_pd(fiy1
,ty
);
969 fiz1
= _mm_add_pd(fiz1
,tz
);
971 fjx3
= _mm_add_pd(fjx3
,tx
);
972 fjy3
= _mm_add_pd(fjy3
,ty
);
973 fjz3
= _mm_add_pd(fjz3
,tz
);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
984 /* REACTION-FIELD ELECTROSTATICS */
985 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
986 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
988 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec
= _mm_and_pd(velec
,cutoff_mask
);
992 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
993 velecsum
= _mm_add_pd(velecsum
,velec
);
997 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
999 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1001 /* Calculate temporary vectorial force */
1002 tx
= _mm_mul_pd(fscal
,dx21
);
1003 ty
= _mm_mul_pd(fscal
,dy21
);
1004 tz
= _mm_mul_pd(fscal
,dz21
);
1006 /* Update vectorial force */
1007 fix2
= _mm_add_pd(fix2
,tx
);
1008 fiy2
= _mm_add_pd(fiy2
,ty
);
1009 fiz2
= _mm_add_pd(fiz2
,tz
);
1011 fjx1
= _mm_add_pd(fjx1
,tx
);
1012 fjy1
= _mm_add_pd(fjy1
,ty
);
1013 fjz1
= _mm_add_pd(fjz1
,tz
);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1024 /* REACTION-FIELD ELECTROSTATICS */
1025 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
1026 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1028 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1030 /* Update potential sum for this i atom from the interaction with this j atom. */
1031 velec
= _mm_and_pd(velec
,cutoff_mask
);
1032 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1033 velecsum
= _mm_add_pd(velecsum
,velec
);
1037 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1039 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1041 /* Calculate temporary vectorial force */
1042 tx
= _mm_mul_pd(fscal
,dx22
);
1043 ty
= _mm_mul_pd(fscal
,dy22
);
1044 tz
= _mm_mul_pd(fscal
,dz22
);
1046 /* Update vectorial force */
1047 fix2
= _mm_add_pd(fix2
,tx
);
1048 fiy2
= _mm_add_pd(fiy2
,ty
);
1049 fiz2
= _mm_add_pd(fiz2
,tz
);
1051 fjx2
= _mm_add_pd(fjx2
,tx
);
1052 fjy2
= _mm_add_pd(fjy2
,ty
);
1053 fjz2
= _mm_add_pd(fjz2
,tz
);
1057 /**************************
1058 * CALCULATE INTERACTIONS *
1059 **************************/
1061 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1064 /* REACTION-FIELD ELECTROSTATICS */
1065 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_add_pd(rinv23
,_mm_mul_pd(krf
,rsq23
)),crf
));
1066 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
1068 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1070 /* Update potential sum for this i atom from the interaction with this j atom. */
1071 velec
= _mm_and_pd(velec
,cutoff_mask
);
1072 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1073 velecsum
= _mm_add_pd(velecsum
,velec
);
1077 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1079 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1081 /* Calculate temporary vectorial force */
1082 tx
= _mm_mul_pd(fscal
,dx23
);
1083 ty
= _mm_mul_pd(fscal
,dy23
);
1084 tz
= _mm_mul_pd(fscal
,dz23
);
1086 /* Update vectorial force */
1087 fix2
= _mm_add_pd(fix2
,tx
);
1088 fiy2
= _mm_add_pd(fiy2
,ty
);
1089 fiz2
= _mm_add_pd(fiz2
,tz
);
1091 fjx3
= _mm_add_pd(fjx3
,tx
);
1092 fjy3
= _mm_add_pd(fjy3
,ty
);
1093 fjz3
= _mm_add_pd(fjz3
,tz
);
1097 /**************************
1098 * CALCULATE INTERACTIONS *
1099 **************************/
1101 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1104 /* REACTION-FIELD ELECTROSTATICS */
1105 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_add_pd(rinv31
,_mm_mul_pd(krf
,rsq31
)),crf
));
1106 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
1108 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1110 /* Update potential sum for this i atom from the interaction with this j atom. */
1111 velec
= _mm_and_pd(velec
,cutoff_mask
);
1112 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1113 velecsum
= _mm_add_pd(velecsum
,velec
);
1117 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1119 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1121 /* Calculate temporary vectorial force */
1122 tx
= _mm_mul_pd(fscal
,dx31
);
1123 ty
= _mm_mul_pd(fscal
,dy31
);
1124 tz
= _mm_mul_pd(fscal
,dz31
);
1126 /* Update vectorial force */
1127 fix3
= _mm_add_pd(fix3
,tx
);
1128 fiy3
= _mm_add_pd(fiy3
,ty
);
1129 fiz3
= _mm_add_pd(fiz3
,tz
);
1131 fjx1
= _mm_add_pd(fjx1
,tx
);
1132 fjy1
= _mm_add_pd(fjy1
,ty
);
1133 fjz1
= _mm_add_pd(fjz1
,tz
);
1137 /**************************
1138 * CALCULATE INTERACTIONS *
1139 **************************/
1141 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1144 /* REACTION-FIELD ELECTROSTATICS */
1145 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_add_pd(rinv32
,_mm_mul_pd(krf
,rsq32
)),crf
));
1146 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
1148 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec
= _mm_and_pd(velec
,cutoff_mask
);
1152 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1153 velecsum
= _mm_add_pd(velecsum
,velec
);
1157 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1159 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1161 /* Calculate temporary vectorial force */
1162 tx
= _mm_mul_pd(fscal
,dx32
);
1163 ty
= _mm_mul_pd(fscal
,dy32
);
1164 tz
= _mm_mul_pd(fscal
,dz32
);
1166 /* Update vectorial force */
1167 fix3
= _mm_add_pd(fix3
,tx
);
1168 fiy3
= _mm_add_pd(fiy3
,ty
);
1169 fiz3
= _mm_add_pd(fiz3
,tz
);
1171 fjx2
= _mm_add_pd(fjx2
,tx
);
1172 fjy2
= _mm_add_pd(fjy2
,ty
);
1173 fjz2
= _mm_add_pd(fjz2
,tz
);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1184 /* REACTION-FIELD ELECTROSTATICS */
1185 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_add_pd(rinv33
,_mm_mul_pd(krf
,rsq33
)),crf
));
1186 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
1188 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec
= _mm_and_pd(velec
,cutoff_mask
);
1192 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1193 velecsum
= _mm_add_pd(velecsum
,velec
);
1197 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1199 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1201 /* Calculate temporary vectorial force */
1202 tx
= _mm_mul_pd(fscal
,dx33
);
1203 ty
= _mm_mul_pd(fscal
,dy33
);
1204 tz
= _mm_mul_pd(fscal
,dz33
);
1206 /* Update vectorial force */
1207 fix3
= _mm_add_pd(fix3
,tx
);
1208 fiy3
= _mm_add_pd(fiy3
,ty
);
1209 fiz3
= _mm_add_pd(fiz3
,tz
);
1211 fjx3
= _mm_add_pd(fjx3
,tx
);
1212 fjy3
= _mm_add_pd(fjy3
,ty
);
1213 fjz3
= _mm_add_pd(fjz3
,tz
);
1217 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1219 /* Inner loop uses 386 flops */
1222 /* End of innermost loop */
1224 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1225 f
+i_coord_offset
,fshift
+i_shift_offset
);
1228 /* Update potential energies */
1229 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1230 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1232 /* Increment number of inner iterations */
1233 inneriter
+= j_index_end
- j_index_start
;
1235 /* Outer loop uses 26 flops */
1238 /* Increment number of outer iterations */
1241 /* Update outer/inner flops */
1243 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*386);
1246 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1247 * Electrostatics interaction: ReactionField
1248 * VdW interaction: LennardJones
1249 * Geometry: Water4-Water4
1250 * Calculate force/pot: Force
1253 nb_kernel_ElecRFCut_VdwLJSw_GeomW4W4_F_sse2_double
1254 (t_nblist
* gmx_restrict nlist
,
1255 rvec
* gmx_restrict xx
,
1256 rvec
* gmx_restrict ff
,
1257 t_forcerec
* gmx_restrict fr
,
1258 t_mdatoms
* gmx_restrict mdatoms
,
1259 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1260 t_nrnb
* gmx_restrict nrnb
)
1262 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1263 * just 0 for non-waters.
1264 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1265 * jnr indices corresponding to data put in the four positions in the SIMD register.
1267 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1268 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1270 int j_coord_offsetA
,j_coord_offsetB
;
1271 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1272 real rcutoff_scalar
;
1273 real
*shiftvec
,*fshift
,*x
,*f
;
1274 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1276 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1278 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1280 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1282 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1283 int vdwjidx0A
,vdwjidx0B
;
1284 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1285 int vdwjidx1A
,vdwjidx1B
;
1286 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1287 int vdwjidx2A
,vdwjidx2B
;
1288 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1289 int vdwjidx3A
,vdwjidx3B
;
1290 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1291 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1292 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1293 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1294 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1295 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1296 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1297 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1298 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1299 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1300 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1301 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1304 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1307 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1308 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1309 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
1310 real rswitch_scalar
,d_scalar
;
1311 __m128d dummy_mask
,cutoff_mask
;
1312 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1313 __m128d one
= _mm_set1_pd(1.0);
1314 __m128d two
= _mm_set1_pd(2.0);
1320 jindex
= nlist
->jindex
;
1322 shiftidx
= nlist
->shift
;
1324 shiftvec
= fr
->shift_vec
[0];
1325 fshift
= fr
->fshift
[0];
1326 facel
= _mm_set1_pd(fr
->epsfac
);
1327 charge
= mdatoms
->chargeA
;
1328 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
1329 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
1330 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
1331 nvdwtype
= fr
->ntype
;
1332 vdwparam
= fr
->nbfp
;
1333 vdwtype
= mdatoms
->typeA
;
1335 /* Setup water-specific parameters */
1336 inr
= nlist
->iinr
[0];
1337 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1338 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1339 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1340 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1342 jq1
= _mm_set1_pd(charge
[inr
+1]);
1343 jq2
= _mm_set1_pd(charge
[inr
+2]);
1344 jq3
= _mm_set1_pd(charge
[inr
+3]);
1345 vdwjidx0A
= 2*vdwtype
[inr
+0];
1346 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1347 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1348 qq11
= _mm_mul_pd(iq1
,jq1
);
1349 qq12
= _mm_mul_pd(iq1
,jq2
);
1350 qq13
= _mm_mul_pd(iq1
,jq3
);
1351 qq21
= _mm_mul_pd(iq2
,jq1
);
1352 qq22
= _mm_mul_pd(iq2
,jq2
);
1353 qq23
= _mm_mul_pd(iq2
,jq3
);
1354 qq31
= _mm_mul_pd(iq3
,jq1
);
1355 qq32
= _mm_mul_pd(iq3
,jq2
);
1356 qq33
= _mm_mul_pd(iq3
,jq3
);
1358 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1359 rcutoff_scalar
= fr
->rcoulomb
;
1360 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1361 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1363 rswitch_scalar
= fr
->rvdw_switch
;
1364 rswitch
= _mm_set1_pd(rswitch_scalar
);
1365 /* Setup switch parameters */
1366 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
1367 d
= _mm_set1_pd(d_scalar
);
1368 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
1369 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1370 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1371 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
1372 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1373 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1375 /* Avoid stupid compiler warnings */
1377 j_coord_offsetA
= 0;
1378 j_coord_offsetB
= 0;
1383 /* Start outer loop over neighborlists */
1384 for(iidx
=0; iidx
<nri
; iidx
++)
1386 /* Load shift vector for this list */
1387 i_shift_offset
= DIM
*shiftidx
[iidx
];
1389 /* Load limits for loop over neighbors */
1390 j_index_start
= jindex
[iidx
];
1391 j_index_end
= jindex
[iidx
+1];
1393 /* Get outer coordinate index */
1395 i_coord_offset
= DIM
*inr
;
1397 /* Load i particle coords and add shift vector */
1398 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1399 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1401 fix0
= _mm_setzero_pd();
1402 fiy0
= _mm_setzero_pd();
1403 fiz0
= _mm_setzero_pd();
1404 fix1
= _mm_setzero_pd();
1405 fiy1
= _mm_setzero_pd();
1406 fiz1
= _mm_setzero_pd();
1407 fix2
= _mm_setzero_pd();
1408 fiy2
= _mm_setzero_pd();
1409 fiz2
= _mm_setzero_pd();
1410 fix3
= _mm_setzero_pd();
1411 fiy3
= _mm_setzero_pd();
1412 fiz3
= _mm_setzero_pd();
1414 /* Start inner kernel loop */
1415 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1418 /* Get j neighbor index, and coordinate index */
1420 jnrB
= jjnr
[jidx
+1];
1421 j_coord_offsetA
= DIM
*jnrA
;
1422 j_coord_offsetB
= DIM
*jnrB
;
1424 /* load j atom coordinates */
1425 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1426 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1427 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1429 /* Calculate displacement vector */
1430 dx00
= _mm_sub_pd(ix0
,jx0
);
1431 dy00
= _mm_sub_pd(iy0
,jy0
);
1432 dz00
= _mm_sub_pd(iz0
,jz0
);
1433 dx11
= _mm_sub_pd(ix1
,jx1
);
1434 dy11
= _mm_sub_pd(iy1
,jy1
);
1435 dz11
= _mm_sub_pd(iz1
,jz1
);
1436 dx12
= _mm_sub_pd(ix1
,jx2
);
1437 dy12
= _mm_sub_pd(iy1
,jy2
);
1438 dz12
= _mm_sub_pd(iz1
,jz2
);
1439 dx13
= _mm_sub_pd(ix1
,jx3
);
1440 dy13
= _mm_sub_pd(iy1
,jy3
);
1441 dz13
= _mm_sub_pd(iz1
,jz3
);
1442 dx21
= _mm_sub_pd(ix2
,jx1
);
1443 dy21
= _mm_sub_pd(iy2
,jy1
);
1444 dz21
= _mm_sub_pd(iz2
,jz1
);
1445 dx22
= _mm_sub_pd(ix2
,jx2
);
1446 dy22
= _mm_sub_pd(iy2
,jy2
);
1447 dz22
= _mm_sub_pd(iz2
,jz2
);
1448 dx23
= _mm_sub_pd(ix2
,jx3
);
1449 dy23
= _mm_sub_pd(iy2
,jy3
);
1450 dz23
= _mm_sub_pd(iz2
,jz3
);
1451 dx31
= _mm_sub_pd(ix3
,jx1
);
1452 dy31
= _mm_sub_pd(iy3
,jy1
);
1453 dz31
= _mm_sub_pd(iz3
,jz1
);
1454 dx32
= _mm_sub_pd(ix3
,jx2
);
1455 dy32
= _mm_sub_pd(iy3
,jy2
);
1456 dz32
= _mm_sub_pd(iz3
,jz2
);
1457 dx33
= _mm_sub_pd(ix3
,jx3
);
1458 dy33
= _mm_sub_pd(iy3
,jy3
);
1459 dz33
= _mm_sub_pd(iz3
,jz3
);
1461 /* Calculate squared distance and things based on it */
1462 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1463 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1464 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1465 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1466 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1467 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1468 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1469 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1470 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1471 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1473 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1474 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1475 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1476 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1477 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1478 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1479 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1480 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1481 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1482 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1484 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1485 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1486 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1487 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1488 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1489 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1490 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1491 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1492 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1493 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1495 fjx0
= _mm_setzero_pd();
1496 fjy0
= _mm_setzero_pd();
1497 fjz0
= _mm_setzero_pd();
1498 fjx1
= _mm_setzero_pd();
1499 fjy1
= _mm_setzero_pd();
1500 fjz1
= _mm_setzero_pd();
1501 fjx2
= _mm_setzero_pd();
1502 fjy2
= _mm_setzero_pd();
1503 fjz2
= _mm_setzero_pd();
1504 fjx3
= _mm_setzero_pd();
1505 fjy3
= _mm_setzero_pd();
1506 fjz3
= _mm_setzero_pd();
1508 /**************************
1509 * CALCULATE INTERACTIONS *
1510 **************************/
1512 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1515 r00
= _mm_mul_pd(rsq00
,rinv00
);
1517 /* LENNARD-JONES DISPERSION/REPULSION */
1519 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1520 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1521 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1522 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1523 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1525 d
= _mm_sub_pd(r00
,rswitch
);
1526 d
= _mm_max_pd(d
,_mm_setzero_pd());
1527 d2
= _mm_mul_pd(d
,d
);
1528 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1530 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1532 /* Evaluate switch function */
1533 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1534 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1535 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1539 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1541 /* Calculate temporary vectorial force */
1542 tx
= _mm_mul_pd(fscal
,dx00
);
1543 ty
= _mm_mul_pd(fscal
,dy00
);
1544 tz
= _mm_mul_pd(fscal
,dz00
);
1546 /* Update vectorial force */
1547 fix0
= _mm_add_pd(fix0
,tx
);
1548 fiy0
= _mm_add_pd(fiy0
,ty
);
1549 fiz0
= _mm_add_pd(fiz0
,tz
);
1551 fjx0
= _mm_add_pd(fjx0
,tx
);
1552 fjy0
= _mm_add_pd(fjy0
,ty
);
1553 fjz0
= _mm_add_pd(fjz0
,tz
);
1557 /**************************
1558 * CALCULATE INTERACTIONS *
1559 **************************/
1561 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1564 /* REACTION-FIELD ELECTROSTATICS */
1565 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1567 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1571 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1573 /* Calculate temporary vectorial force */
1574 tx
= _mm_mul_pd(fscal
,dx11
);
1575 ty
= _mm_mul_pd(fscal
,dy11
);
1576 tz
= _mm_mul_pd(fscal
,dz11
);
1578 /* Update vectorial force */
1579 fix1
= _mm_add_pd(fix1
,tx
);
1580 fiy1
= _mm_add_pd(fiy1
,ty
);
1581 fiz1
= _mm_add_pd(fiz1
,tz
);
1583 fjx1
= _mm_add_pd(fjx1
,tx
);
1584 fjy1
= _mm_add_pd(fjy1
,ty
);
1585 fjz1
= _mm_add_pd(fjz1
,tz
);
1589 /**************************
1590 * CALCULATE INTERACTIONS *
1591 **************************/
1593 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1596 /* REACTION-FIELD ELECTROSTATICS */
1597 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1599 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1603 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1605 /* Calculate temporary vectorial force */
1606 tx
= _mm_mul_pd(fscal
,dx12
);
1607 ty
= _mm_mul_pd(fscal
,dy12
);
1608 tz
= _mm_mul_pd(fscal
,dz12
);
1610 /* Update vectorial force */
1611 fix1
= _mm_add_pd(fix1
,tx
);
1612 fiy1
= _mm_add_pd(fiy1
,ty
);
1613 fiz1
= _mm_add_pd(fiz1
,tz
);
1615 fjx2
= _mm_add_pd(fjx2
,tx
);
1616 fjy2
= _mm_add_pd(fjy2
,ty
);
1617 fjz2
= _mm_add_pd(fjz2
,tz
);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1628 /* REACTION-FIELD ELECTROSTATICS */
1629 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
1631 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1635 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1637 /* Calculate temporary vectorial force */
1638 tx
= _mm_mul_pd(fscal
,dx13
);
1639 ty
= _mm_mul_pd(fscal
,dy13
);
1640 tz
= _mm_mul_pd(fscal
,dz13
);
1642 /* Update vectorial force */
1643 fix1
= _mm_add_pd(fix1
,tx
);
1644 fiy1
= _mm_add_pd(fiy1
,ty
);
1645 fiz1
= _mm_add_pd(fiz1
,tz
);
1647 fjx3
= _mm_add_pd(fjx3
,tx
);
1648 fjy3
= _mm_add_pd(fjy3
,ty
);
1649 fjz3
= _mm_add_pd(fjz3
,tz
);
1653 /**************************
1654 * CALCULATE INTERACTIONS *
1655 **************************/
1657 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1660 /* REACTION-FIELD ELECTROSTATICS */
1661 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
1663 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1667 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1669 /* Calculate temporary vectorial force */
1670 tx
= _mm_mul_pd(fscal
,dx21
);
1671 ty
= _mm_mul_pd(fscal
,dy21
);
1672 tz
= _mm_mul_pd(fscal
,dz21
);
1674 /* Update vectorial force */
1675 fix2
= _mm_add_pd(fix2
,tx
);
1676 fiy2
= _mm_add_pd(fiy2
,ty
);
1677 fiz2
= _mm_add_pd(fiz2
,tz
);
1679 fjx1
= _mm_add_pd(fjx1
,tx
);
1680 fjy1
= _mm_add_pd(fjy1
,ty
);
1681 fjz1
= _mm_add_pd(fjz1
,tz
);
1685 /**************************
1686 * CALCULATE INTERACTIONS *
1687 **************************/
1689 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1692 /* REACTION-FIELD ELECTROSTATICS */
1693 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1695 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1699 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1701 /* Calculate temporary vectorial force */
1702 tx
= _mm_mul_pd(fscal
,dx22
);
1703 ty
= _mm_mul_pd(fscal
,dy22
);
1704 tz
= _mm_mul_pd(fscal
,dz22
);
1706 /* Update vectorial force */
1707 fix2
= _mm_add_pd(fix2
,tx
);
1708 fiy2
= _mm_add_pd(fiy2
,ty
);
1709 fiz2
= _mm_add_pd(fiz2
,tz
);
1711 fjx2
= _mm_add_pd(fjx2
,tx
);
1712 fjy2
= _mm_add_pd(fjy2
,ty
);
1713 fjz2
= _mm_add_pd(fjz2
,tz
);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1724 /* REACTION-FIELD ELECTROSTATICS */
1725 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
1727 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1731 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1733 /* Calculate temporary vectorial force */
1734 tx
= _mm_mul_pd(fscal
,dx23
);
1735 ty
= _mm_mul_pd(fscal
,dy23
);
1736 tz
= _mm_mul_pd(fscal
,dz23
);
1738 /* Update vectorial force */
1739 fix2
= _mm_add_pd(fix2
,tx
);
1740 fiy2
= _mm_add_pd(fiy2
,ty
);
1741 fiz2
= _mm_add_pd(fiz2
,tz
);
1743 fjx3
= _mm_add_pd(fjx3
,tx
);
1744 fjy3
= _mm_add_pd(fjy3
,ty
);
1745 fjz3
= _mm_add_pd(fjz3
,tz
);
1749 /**************************
1750 * CALCULATE INTERACTIONS *
1751 **************************/
1753 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1756 /* REACTION-FIELD ELECTROSTATICS */
1757 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
1759 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1763 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1765 /* Calculate temporary vectorial force */
1766 tx
= _mm_mul_pd(fscal
,dx31
);
1767 ty
= _mm_mul_pd(fscal
,dy31
);
1768 tz
= _mm_mul_pd(fscal
,dz31
);
1770 /* Update vectorial force */
1771 fix3
= _mm_add_pd(fix3
,tx
);
1772 fiy3
= _mm_add_pd(fiy3
,ty
);
1773 fiz3
= _mm_add_pd(fiz3
,tz
);
1775 fjx1
= _mm_add_pd(fjx1
,tx
);
1776 fjy1
= _mm_add_pd(fjy1
,ty
);
1777 fjz1
= _mm_add_pd(fjz1
,tz
);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
1785 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1788 /* REACTION-FIELD ELECTROSTATICS */
1789 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
1791 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1795 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1797 /* Calculate temporary vectorial force */
1798 tx
= _mm_mul_pd(fscal
,dx32
);
1799 ty
= _mm_mul_pd(fscal
,dy32
);
1800 tz
= _mm_mul_pd(fscal
,dz32
);
1802 /* Update vectorial force */
1803 fix3
= _mm_add_pd(fix3
,tx
);
1804 fiy3
= _mm_add_pd(fiy3
,ty
);
1805 fiz3
= _mm_add_pd(fiz3
,tz
);
1807 fjx2
= _mm_add_pd(fjx2
,tx
);
1808 fjy2
= _mm_add_pd(fjy2
,ty
);
1809 fjz2
= _mm_add_pd(fjz2
,tz
);
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1820 /* REACTION-FIELD ELECTROSTATICS */
1821 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
1823 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1827 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1829 /* Calculate temporary vectorial force */
1830 tx
= _mm_mul_pd(fscal
,dx33
);
1831 ty
= _mm_mul_pd(fscal
,dy33
);
1832 tz
= _mm_mul_pd(fscal
,dz33
);
1834 /* Update vectorial force */
1835 fix3
= _mm_add_pd(fix3
,tx
);
1836 fiy3
= _mm_add_pd(fiy3
,ty
);
1837 fiz3
= _mm_add_pd(fiz3
,tz
);
1839 fjx3
= _mm_add_pd(fjx3
,tx
);
1840 fjy3
= _mm_add_pd(fjy3
,ty
);
1841 fjz3
= _mm_add_pd(fjz3
,tz
);
1845 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1847 /* Inner loop uses 329 flops */
1850 if(jidx
<j_index_end
)
1854 j_coord_offsetA
= DIM
*jnrA
;
1856 /* load j atom coordinates */
1857 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1858 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1859 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1861 /* Calculate displacement vector */
1862 dx00
= _mm_sub_pd(ix0
,jx0
);
1863 dy00
= _mm_sub_pd(iy0
,jy0
);
1864 dz00
= _mm_sub_pd(iz0
,jz0
);
1865 dx11
= _mm_sub_pd(ix1
,jx1
);
1866 dy11
= _mm_sub_pd(iy1
,jy1
);
1867 dz11
= _mm_sub_pd(iz1
,jz1
);
1868 dx12
= _mm_sub_pd(ix1
,jx2
);
1869 dy12
= _mm_sub_pd(iy1
,jy2
);
1870 dz12
= _mm_sub_pd(iz1
,jz2
);
1871 dx13
= _mm_sub_pd(ix1
,jx3
);
1872 dy13
= _mm_sub_pd(iy1
,jy3
);
1873 dz13
= _mm_sub_pd(iz1
,jz3
);
1874 dx21
= _mm_sub_pd(ix2
,jx1
);
1875 dy21
= _mm_sub_pd(iy2
,jy1
);
1876 dz21
= _mm_sub_pd(iz2
,jz1
);
1877 dx22
= _mm_sub_pd(ix2
,jx2
);
1878 dy22
= _mm_sub_pd(iy2
,jy2
);
1879 dz22
= _mm_sub_pd(iz2
,jz2
);
1880 dx23
= _mm_sub_pd(ix2
,jx3
);
1881 dy23
= _mm_sub_pd(iy2
,jy3
);
1882 dz23
= _mm_sub_pd(iz2
,jz3
);
1883 dx31
= _mm_sub_pd(ix3
,jx1
);
1884 dy31
= _mm_sub_pd(iy3
,jy1
);
1885 dz31
= _mm_sub_pd(iz3
,jz1
);
1886 dx32
= _mm_sub_pd(ix3
,jx2
);
1887 dy32
= _mm_sub_pd(iy3
,jy2
);
1888 dz32
= _mm_sub_pd(iz3
,jz2
);
1889 dx33
= _mm_sub_pd(ix3
,jx3
);
1890 dy33
= _mm_sub_pd(iy3
,jy3
);
1891 dz33
= _mm_sub_pd(iz3
,jz3
);
1893 /* Calculate squared distance and things based on it */
1894 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1895 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1896 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1897 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1898 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1899 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1900 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1901 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1902 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1903 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1905 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1906 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1907 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1908 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1909 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1910 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1911 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1912 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1913 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1914 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1916 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1917 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1918 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1919 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1920 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1921 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1922 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1923 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1924 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1925 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1927 fjx0
= _mm_setzero_pd();
1928 fjy0
= _mm_setzero_pd();
1929 fjz0
= _mm_setzero_pd();
1930 fjx1
= _mm_setzero_pd();
1931 fjy1
= _mm_setzero_pd();
1932 fjz1
= _mm_setzero_pd();
1933 fjx2
= _mm_setzero_pd();
1934 fjy2
= _mm_setzero_pd();
1935 fjz2
= _mm_setzero_pd();
1936 fjx3
= _mm_setzero_pd();
1937 fjy3
= _mm_setzero_pd();
1938 fjz3
= _mm_setzero_pd();
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1947 r00
= _mm_mul_pd(rsq00
,rinv00
);
1949 /* LENNARD-JONES DISPERSION/REPULSION */
1951 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1952 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1953 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1954 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1955 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1957 d
= _mm_sub_pd(r00
,rswitch
);
1958 d
= _mm_max_pd(d
,_mm_setzero_pd());
1959 d2
= _mm_mul_pd(d
,d
);
1960 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1962 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1964 /* Evaluate switch function */
1965 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1967 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1971 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1973 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1975 /* Calculate temporary vectorial force */
1976 tx
= _mm_mul_pd(fscal
,dx00
);
1977 ty
= _mm_mul_pd(fscal
,dy00
);
1978 tz
= _mm_mul_pd(fscal
,dz00
);
1980 /* Update vectorial force */
1981 fix0
= _mm_add_pd(fix0
,tx
);
1982 fiy0
= _mm_add_pd(fiy0
,ty
);
1983 fiz0
= _mm_add_pd(fiz0
,tz
);
1985 fjx0
= _mm_add_pd(fjx0
,tx
);
1986 fjy0
= _mm_add_pd(fjy0
,ty
);
1987 fjz0
= _mm_add_pd(fjz0
,tz
);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1998 /* REACTION-FIELD ELECTROSTATICS */
1999 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
2001 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2005 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2007 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2009 /* Calculate temporary vectorial force */
2010 tx
= _mm_mul_pd(fscal
,dx11
);
2011 ty
= _mm_mul_pd(fscal
,dy11
);
2012 tz
= _mm_mul_pd(fscal
,dz11
);
2014 /* Update vectorial force */
2015 fix1
= _mm_add_pd(fix1
,tx
);
2016 fiy1
= _mm_add_pd(fiy1
,ty
);
2017 fiz1
= _mm_add_pd(fiz1
,tz
);
2019 fjx1
= _mm_add_pd(fjx1
,tx
);
2020 fjy1
= _mm_add_pd(fjy1
,ty
);
2021 fjz1
= _mm_add_pd(fjz1
,tz
);
2025 /**************************
2026 * CALCULATE INTERACTIONS *
2027 **************************/
2029 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2032 /* REACTION-FIELD ELECTROSTATICS */
2033 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
2035 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2039 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2041 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2043 /* Calculate temporary vectorial force */
2044 tx
= _mm_mul_pd(fscal
,dx12
);
2045 ty
= _mm_mul_pd(fscal
,dy12
);
2046 tz
= _mm_mul_pd(fscal
,dz12
);
2048 /* Update vectorial force */
2049 fix1
= _mm_add_pd(fix1
,tx
);
2050 fiy1
= _mm_add_pd(fiy1
,ty
);
2051 fiz1
= _mm_add_pd(fiz1
,tz
);
2053 fjx2
= _mm_add_pd(fjx2
,tx
);
2054 fjy2
= _mm_add_pd(fjy2
,ty
);
2055 fjz2
= _mm_add_pd(fjz2
,tz
);
2059 /**************************
2060 * CALCULATE INTERACTIONS *
2061 **************************/
2063 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2066 /* REACTION-FIELD ELECTROSTATICS */
2067 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
2069 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
2073 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2075 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2077 /* Calculate temporary vectorial force */
2078 tx
= _mm_mul_pd(fscal
,dx13
);
2079 ty
= _mm_mul_pd(fscal
,dy13
);
2080 tz
= _mm_mul_pd(fscal
,dz13
);
2082 /* Update vectorial force */
2083 fix1
= _mm_add_pd(fix1
,tx
);
2084 fiy1
= _mm_add_pd(fiy1
,ty
);
2085 fiz1
= _mm_add_pd(fiz1
,tz
);
2087 fjx3
= _mm_add_pd(fjx3
,tx
);
2088 fjy3
= _mm_add_pd(fjy3
,ty
);
2089 fjz3
= _mm_add_pd(fjz3
,tz
);
2093 /**************************
2094 * CALCULATE INTERACTIONS *
2095 **************************/
2097 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2100 /* REACTION-FIELD ELECTROSTATICS */
2101 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
2103 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2107 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2109 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2111 /* Calculate temporary vectorial force */
2112 tx
= _mm_mul_pd(fscal
,dx21
);
2113 ty
= _mm_mul_pd(fscal
,dy21
);
2114 tz
= _mm_mul_pd(fscal
,dz21
);
2116 /* Update vectorial force */
2117 fix2
= _mm_add_pd(fix2
,tx
);
2118 fiy2
= _mm_add_pd(fiy2
,ty
);
2119 fiz2
= _mm_add_pd(fiz2
,tz
);
2121 fjx1
= _mm_add_pd(fjx1
,tx
);
2122 fjy1
= _mm_add_pd(fjy1
,ty
);
2123 fjz1
= _mm_add_pd(fjz1
,tz
);
2127 /**************************
2128 * CALCULATE INTERACTIONS *
2129 **************************/
2131 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2134 /* REACTION-FIELD ELECTROSTATICS */
2135 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
2137 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2141 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2143 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2145 /* Calculate temporary vectorial force */
2146 tx
= _mm_mul_pd(fscal
,dx22
);
2147 ty
= _mm_mul_pd(fscal
,dy22
);
2148 tz
= _mm_mul_pd(fscal
,dz22
);
2150 /* Update vectorial force */
2151 fix2
= _mm_add_pd(fix2
,tx
);
2152 fiy2
= _mm_add_pd(fiy2
,ty
);
2153 fiz2
= _mm_add_pd(fiz2
,tz
);
2155 fjx2
= _mm_add_pd(fjx2
,tx
);
2156 fjy2
= _mm_add_pd(fjy2
,ty
);
2157 fjz2
= _mm_add_pd(fjz2
,tz
);
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2168 /* REACTION-FIELD ELECTROSTATICS */
2169 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
2171 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2175 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2177 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2179 /* Calculate temporary vectorial force */
2180 tx
= _mm_mul_pd(fscal
,dx23
);
2181 ty
= _mm_mul_pd(fscal
,dy23
);
2182 tz
= _mm_mul_pd(fscal
,dz23
);
2184 /* Update vectorial force */
2185 fix2
= _mm_add_pd(fix2
,tx
);
2186 fiy2
= _mm_add_pd(fiy2
,ty
);
2187 fiz2
= _mm_add_pd(fiz2
,tz
);
2189 fjx3
= _mm_add_pd(fjx3
,tx
);
2190 fjy3
= _mm_add_pd(fjy3
,ty
);
2191 fjz3
= _mm_add_pd(fjz3
,tz
);
2195 /**************************
2196 * CALCULATE INTERACTIONS *
2197 **************************/
2199 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2202 /* REACTION-FIELD ELECTROSTATICS */
2203 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
2205 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2209 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2211 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2213 /* Calculate temporary vectorial force */
2214 tx
= _mm_mul_pd(fscal
,dx31
);
2215 ty
= _mm_mul_pd(fscal
,dy31
);
2216 tz
= _mm_mul_pd(fscal
,dz31
);
2218 /* Update vectorial force */
2219 fix3
= _mm_add_pd(fix3
,tx
);
2220 fiy3
= _mm_add_pd(fiy3
,ty
);
2221 fiz3
= _mm_add_pd(fiz3
,tz
);
2223 fjx1
= _mm_add_pd(fjx1
,tx
);
2224 fjy1
= _mm_add_pd(fjy1
,ty
);
2225 fjz1
= _mm_add_pd(fjz1
,tz
);
2229 /**************************
2230 * CALCULATE INTERACTIONS *
2231 **************************/
2233 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2236 /* REACTION-FIELD ELECTROSTATICS */
2237 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
2239 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2243 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2245 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2247 /* Calculate temporary vectorial force */
2248 tx
= _mm_mul_pd(fscal
,dx32
);
2249 ty
= _mm_mul_pd(fscal
,dy32
);
2250 tz
= _mm_mul_pd(fscal
,dz32
);
2252 /* Update vectorial force */
2253 fix3
= _mm_add_pd(fix3
,tx
);
2254 fiy3
= _mm_add_pd(fiy3
,ty
);
2255 fiz3
= _mm_add_pd(fiz3
,tz
);
2257 fjx2
= _mm_add_pd(fjx2
,tx
);
2258 fjy2
= _mm_add_pd(fjy2
,ty
);
2259 fjz2
= _mm_add_pd(fjz2
,tz
);
2263 /**************************
2264 * CALCULATE INTERACTIONS *
2265 **************************/
2267 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2270 /* REACTION-FIELD ELECTROSTATICS */
2271 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
2273 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2277 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2279 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2281 /* Calculate temporary vectorial force */
2282 tx
= _mm_mul_pd(fscal
,dx33
);
2283 ty
= _mm_mul_pd(fscal
,dy33
);
2284 tz
= _mm_mul_pd(fscal
,dz33
);
2286 /* Update vectorial force */
2287 fix3
= _mm_add_pd(fix3
,tx
);
2288 fiy3
= _mm_add_pd(fiy3
,ty
);
2289 fiz3
= _mm_add_pd(fiz3
,tz
);
2291 fjx3
= _mm_add_pd(fjx3
,tx
);
2292 fjy3
= _mm_add_pd(fjy3
,ty
);
2293 fjz3
= _mm_add_pd(fjz3
,tz
);
2297 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2299 /* Inner loop uses 329 flops */
2302 /* End of innermost loop */
2304 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2305 f
+i_coord_offset
,fshift
+i_shift_offset
);
2307 /* Increment number of inner iterations */
2308 inneriter
+= j_index_end
- j_index_start
;
2310 /* Outer loop uses 24 flops */
2313 /* Increment number of outer iterations */
2316 /* Update outer/inner flops */
2318 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*329);