2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 int vdwjidx1A
,vdwjidx1B
;
90 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
91 int vdwjidx2A
,vdwjidx2B
;
92 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
93 int vdwjidx3A
,vdwjidx3B
;
94 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
95 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
96 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
97 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
98 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
99 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
100 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
101 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
102 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
103 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
104 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
105 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
108 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
111 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
112 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
113 __m128d dummy_mask
,cutoff_mask
;
114 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one
= _mm_set1_pd(1.0);
116 __m128d two
= _mm_set1_pd(2.0);
122 jindex
= nlist
->jindex
;
124 shiftidx
= nlist
->shift
;
126 shiftvec
= fr
->shift_vec
[0];
127 fshift
= fr
->fshift
[0];
128 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
129 charge
= mdatoms
->chargeA
;
130 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
131 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
132 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
133 nvdwtype
= fr
->ntype
;
135 vdwtype
= mdatoms
->typeA
;
137 /* Setup water-specific parameters */
138 inr
= nlist
->iinr
[0];
139 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
140 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
141 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
142 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
144 jq1
= _mm_set1_pd(charge
[inr
+1]);
145 jq2
= _mm_set1_pd(charge
[inr
+2]);
146 jq3
= _mm_set1_pd(charge
[inr
+3]);
147 vdwjidx0A
= 2*vdwtype
[inr
+0];
148 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
149 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
150 qq11
= _mm_mul_pd(iq1
,jq1
);
151 qq12
= _mm_mul_pd(iq1
,jq2
);
152 qq13
= _mm_mul_pd(iq1
,jq3
);
153 qq21
= _mm_mul_pd(iq2
,jq1
);
154 qq22
= _mm_mul_pd(iq2
,jq2
);
155 qq23
= _mm_mul_pd(iq2
,jq3
);
156 qq31
= _mm_mul_pd(iq3
,jq1
);
157 qq32
= _mm_mul_pd(iq3
,jq2
);
158 qq33
= _mm_mul_pd(iq3
,jq3
);
160 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
161 rcutoff_scalar
= fr
->ic
->rcoulomb
;
162 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
163 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
165 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
166 rvdw
= _mm_set1_pd(fr
->ic
->rvdw
);
168 /* Avoid stupid compiler warnings */
176 /* Start outer loop over neighborlists */
177 for(iidx
=0; iidx
<nri
; iidx
++)
179 /* Load shift vector for this list */
180 i_shift_offset
= DIM
*shiftidx
[iidx
];
182 /* Load limits for loop over neighbors */
183 j_index_start
= jindex
[iidx
];
184 j_index_end
= jindex
[iidx
+1];
186 /* Get outer coordinate index */
188 i_coord_offset
= DIM
*inr
;
190 /* Load i particle coords and add shift vector */
191 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
192 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
194 fix0
= _mm_setzero_pd();
195 fiy0
= _mm_setzero_pd();
196 fiz0
= _mm_setzero_pd();
197 fix1
= _mm_setzero_pd();
198 fiy1
= _mm_setzero_pd();
199 fiz1
= _mm_setzero_pd();
200 fix2
= _mm_setzero_pd();
201 fiy2
= _mm_setzero_pd();
202 fiz2
= _mm_setzero_pd();
203 fix3
= _mm_setzero_pd();
204 fiy3
= _mm_setzero_pd();
205 fiz3
= _mm_setzero_pd();
207 /* Reset potential sums */
208 velecsum
= _mm_setzero_pd();
209 vvdwsum
= _mm_setzero_pd();
211 /* Start inner kernel loop */
212 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
215 /* Get j neighbor index, and coordinate index */
218 j_coord_offsetA
= DIM
*jnrA
;
219 j_coord_offsetB
= DIM
*jnrB
;
221 /* load j atom coordinates */
222 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
223 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
224 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
226 /* Calculate displacement vector */
227 dx00
= _mm_sub_pd(ix0
,jx0
);
228 dy00
= _mm_sub_pd(iy0
,jy0
);
229 dz00
= _mm_sub_pd(iz0
,jz0
);
230 dx11
= _mm_sub_pd(ix1
,jx1
);
231 dy11
= _mm_sub_pd(iy1
,jy1
);
232 dz11
= _mm_sub_pd(iz1
,jz1
);
233 dx12
= _mm_sub_pd(ix1
,jx2
);
234 dy12
= _mm_sub_pd(iy1
,jy2
);
235 dz12
= _mm_sub_pd(iz1
,jz2
);
236 dx13
= _mm_sub_pd(ix1
,jx3
);
237 dy13
= _mm_sub_pd(iy1
,jy3
);
238 dz13
= _mm_sub_pd(iz1
,jz3
);
239 dx21
= _mm_sub_pd(ix2
,jx1
);
240 dy21
= _mm_sub_pd(iy2
,jy1
);
241 dz21
= _mm_sub_pd(iz2
,jz1
);
242 dx22
= _mm_sub_pd(ix2
,jx2
);
243 dy22
= _mm_sub_pd(iy2
,jy2
);
244 dz22
= _mm_sub_pd(iz2
,jz2
);
245 dx23
= _mm_sub_pd(ix2
,jx3
);
246 dy23
= _mm_sub_pd(iy2
,jy3
);
247 dz23
= _mm_sub_pd(iz2
,jz3
);
248 dx31
= _mm_sub_pd(ix3
,jx1
);
249 dy31
= _mm_sub_pd(iy3
,jy1
);
250 dz31
= _mm_sub_pd(iz3
,jz1
);
251 dx32
= _mm_sub_pd(ix3
,jx2
);
252 dy32
= _mm_sub_pd(iy3
,jy2
);
253 dz32
= _mm_sub_pd(iz3
,jz2
);
254 dx33
= _mm_sub_pd(ix3
,jx3
);
255 dy33
= _mm_sub_pd(iy3
,jy3
);
256 dz33
= _mm_sub_pd(iz3
,jz3
);
258 /* Calculate squared distance and things based on it */
259 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
260 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
261 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
262 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
263 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
264 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
265 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
266 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
267 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
268 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
270 rinv11
= sse2_invsqrt_d(rsq11
);
271 rinv12
= sse2_invsqrt_d(rsq12
);
272 rinv13
= sse2_invsqrt_d(rsq13
);
273 rinv21
= sse2_invsqrt_d(rsq21
);
274 rinv22
= sse2_invsqrt_d(rsq22
);
275 rinv23
= sse2_invsqrt_d(rsq23
);
276 rinv31
= sse2_invsqrt_d(rsq31
);
277 rinv32
= sse2_invsqrt_d(rsq32
);
278 rinv33
= sse2_invsqrt_d(rsq33
);
280 rinvsq00
= sse2_inv_d(rsq00
);
281 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
282 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
283 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
284 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
285 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
286 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
287 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
288 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
289 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
291 fjx0
= _mm_setzero_pd();
292 fjy0
= _mm_setzero_pd();
293 fjz0
= _mm_setzero_pd();
294 fjx1
= _mm_setzero_pd();
295 fjy1
= _mm_setzero_pd();
296 fjz1
= _mm_setzero_pd();
297 fjx2
= _mm_setzero_pd();
298 fjy2
= _mm_setzero_pd();
299 fjz2
= _mm_setzero_pd();
300 fjx3
= _mm_setzero_pd();
301 fjy3
= _mm_setzero_pd();
302 fjz3
= _mm_setzero_pd();
304 /**************************
305 * CALCULATE INTERACTIONS *
306 **************************/
308 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
311 /* LENNARD-JONES DISPERSION/REPULSION */
313 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
314 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
315 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
316 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
317 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
318 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
320 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
322 /* Update potential sum for this i atom from the interaction with this j atom. */
323 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
324 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
328 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
330 /* Calculate temporary vectorial force */
331 tx
= _mm_mul_pd(fscal
,dx00
);
332 ty
= _mm_mul_pd(fscal
,dy00
);
333 tz
= _mm_mul_pd(fscal
,dz00
);
335 /* Update vectorial force */
336 fix0
= _mm_add_pd(fix0
,tx
);
337 fiy0
= _mm_add_pd(fiy0
,ty
);
338 fiz0
= _mm_add_pd(fiz0
,tz
);
340 fjx0
= _mm_add_pd(fjx0
,tx
);
341 fjy0
= _mm_add_pd(fjy0
,ty
);
342 fjz0
= _mm_add_pd(fjz0
,tz
);
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
353 /* REACTION-FIELD ELECTROSTATICS */
354 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
355 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
357 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 velec
= _mm_and_pd(velec
,cutoff_mask
);
361 velecsum
= _mm_add_pd(velecsum
,velec
);
365 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
367 /* Calculate temporary vectorial force */
368 tx
= _mm_mul_pd(fscal
,dx11
);
369 ty
= _mm_mul_pd(fscal
,dy11
);
370 tz
= _mm_mul_pd(fscal
,dz11
);
372 /* Update vectorial force */
373 fix1
= _mm_add_pd(fix1
,tx
);
374 fiy1
= _mm_add_pd(fiy1
,ty
);
375 fiz1
= _mm_add_pd(fiz1
,tz
);
377 fjx1
= _mm_add_pd(fjx1
,tx
);
378 fjy1
= _mm_add_pd(fjy1
,ty
);
379 fjz1
= _mm_add_pd(fjz1
,tz
);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
390 /* REACTION-FIELD ELECTROSTATICS */
391 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
392 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
394 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
396 /* Update potential sum for this i atom from the interaction with this j atom. */
397 velec
= _mm_and_pd(velec
,cutoff_mask
);
398 velecsum
= _mm_add_pd(velecsum
,velec
);
402 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
404 /* Calculate temporary vectorial force */
405 tx
= _mm_mul_pd(fscal
,dx12
);
406 ty
= _mm_mul_pd(fscal
,dy12
);
407 tz
= _mm_mul_pd(fscal
,dz12
);
409 /* Update vectorial force */
410 fix1
= _mm_add_pd(fix1
,tx
);
411 fiy1
= _mm_add_pd(fiy1
,ty
);
412 fiz1
= _mm_add_pd(fiz1
,tz
);
414 fjx2
= _mm_add_pd(fjx2
,tx
);
415 fjy2
= _mm_add_pd(fjy2
,ty
);
416 fjz2
= _mm_add_pd(fjz2
,tz
);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
427 /* REACTION-FIELD ELECTROSTATICS */
428 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_add_pd(rinv13
,_mm_mul_pd(krf
,rsq13
)),crf
));
429 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
431 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velec
= _mm_and_pd(velec
,cutoff_mask
);
435 velecsum
= _mm_add_pd(velecsum
,velec
);
439 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
441 /* Calculate temporary vectorial force */
442 tx
= _mm_mul_pd(fscal
,dx13
);
443 ty
= _mm_mul_pd(fscal
,dy13
);
444 tz
= _mm_mul_pd(fscal
,dz13
);
446 /* Update vectorial force */
447 fix1
= _mm_add_pd(fix1
,tx
);
448 fiy1
= _mm_add_pd(fiy1
,ty
);
449 fiz1
= _mm_add_pd(fiz1
,tz
);
451 fjx3
= _mm_add_pd(fjx3
,tx
);
452 fjy3
= _mm_add_pd(fjy3
,ty
);
453 fjz3
= _mm_add_pd(fjz3
,tz
);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
464 /* REACTION-FIELD ELECTROSTATICS */
465 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
466 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
468 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velec
= _mm_and_pd(velec
,cutoff_mask
);
472 velecsum
= _mm_add_pd(velecsum
,velec
);
476 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
478 /* Calculate temporary vectorial force */
479 tx
= _mm_mul_pd(fscal
,dx21
);
480 ty
= _mm_mul_pd(fscal
,dy21
);
481 tz
= _mm_mul_pd(fscal
,dz21
);
483 /* Update vectorial force */
484 fix2
= _mm_add_pd(fix2
,tx
);
485 fiy2
= _mm_add_pd(fiy2
,ty
);
486 fiz2
= _mm_add_pd(fiz2
,tz
);
488 fjx1
= _mm_add_pd(fjx1
,tx
);
489 fjy1
= _mm_add_pd(fjy1
,ty
);
490 fjz1
= _mm_add_pd(fjz1
,tz
);
494 /**************************
495 * CALCULATE INTERACTIONS *
496 **************************/
498 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
501 /* REACTION-FIELD ELECTROSTATICS */
502 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
503 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
505 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
507 /* Update potential sum for this i atom from the interaction with this j atom. */
508 velec
= _mm_and_pd(velec
,cutoff_mask
);
509 velecsum
= _mm_add_pd(velecsum
,velec
);
513 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
515 /* Calculate temporary vectorial force */
516 tx
= _mm_mul_pd(fscal
,dx22
);
517 ty
= _mm_mul_pd(fscal
,dy22
);
518 tz
= _mm_mul_pd(fscal
,dz22
);
520 /* Update vectorial force */
521 fix2
= _mm_add_pd(fix2
,tx
);
522 fiy2
= _mm_add_pd(fiy2
,ty
);
523 fiz2
= _mm_add_pd(fiz2
,tz
);
525 fjx2
= _mm_add_pd(fjx2
,tx
);
526 fjy2
= _mm_add_pd(fjy2
,ty
);
527 fjz2
= _mm_add_pd(fjz2
,tz
);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
538 /* REACTION-FIELD ELECTROSTATICS */
539 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_add_pd(rinv23
,_mm_mul_pd(krf
,rsq23
)),crf
));
540 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
542 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velec
= _mm_and_pd(velec
,cutoff_mask
);
546 velecsum
= _mm_add_pd(velecsum
,velec
);
550 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
552 /* Calculate temporary vectorial force */
553 tx
= _mm_mul_pd(fscal
,dx23
);
554 ty
= _mm_mul_pd(fscal
,dy23
);
555 tz
= _mm_mul_pd(fscal
,dz23
);
557 /* Update vectorial force */
558 fix2
= _mm_add_pd(fix2
,tx
);
559 fiy2
= _mm_add_pd(fiy2
,ty
);
560 fiz2
= _mm_add_pd(fiz2
,tz
);
562 fjx3
= _mm_add_pd(fjx3
,tx
);
563 fjy3
= _mm_add_pd(fjy3
,ty
);
564 fjz3
= _mm_add_pd(fjz3
,tz
);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
575 /* REACTION-FIELD ELECTROSTATICS */
576 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_add_pd(rinv31
,_mm_mul_pd(krf
,rsq31
)),crf
));
577 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
579 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
581 /* Update potential sum for this i atom from the interaction with this j atom. */
582 velec
= _mm_and_pd(velec
,cutoff_mask
);
583 velecsum
= _mm_add_pd(velecsum
,velec
);
587 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
589 /* Calculate temporary vectorial force */
590 tx
= _mm_mul_pd(fscal
,dx31
);
591 ty
= _mm_mul_pd(fscal
,dy31
);
592 tz
= _mm_mul_pd(fscal
,dz31
);
594 /* Update vectorial force */
595 fix3
= _mm_add_pd(fix3
,tx
);
596 fiy3
= _mm_add_pd(fiy3
,ty
);
597 fiz3
= _mm_add_pd(fiz3
,tz
);
599 fjx1
= _mm_add_pd(fjx1
,tx
);
600 fjy1
= _mm_add_pd(fjy1
,ty
);
601 fjz1
= _mm_add_pd(fjz1
,tz
);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
612 /* REACTION-FIELD ELECTROSTATICS */
613 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_add_pd(rinv32
,_mm_mul_pd(krf
,rsq32
)),crf
));
614 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
616 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
618 /* Update potential sum for this i atom from the interaction with this j atom. */
619 velec
= _mm_and_pd(velec
,cutoff_mask
);
620 velecsum
= _mm_add_pd(velecsum
,velec
);
624 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
626 /* Calculate temporary vectorial force */
627 tx
= _mm_mul_pd(fscal
,dx32
);
628 ty
= _mm_mul_pd(fscal
,dy32
);
629 tz
= _mm_mul_pd(fscal
,dz32
);
631 /* Update vectorial force */
632 fix3
= _mm_add_pd(fix3
,tx
);
633 fiy3
= _mm_add_pd(fiy3
,ty
);
634 fiz3
= _mm_add_pd(fiz3
,tz
);
636 fjx2
= _mm_add_pd(fjx2
,tx
);
637 fjy2
= _mm_add_pd(fjy2
,ty
);
638 fjz2
= _mm_add_pd(fjz2
,tz
);
642 /**************************
643 * CALCULATE INTERACTIONS *
644 **************************/
646 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
649 /* REACTION-FIELD ELECTROSTATICS */
650 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_add_pd(rinv33
,_mm_mul_pd(krf
,rsq33
)),crf
));
651 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
653 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
655 /* Update potential sum for this i atom from the interaction with this j atom. */
656 velec
= _mm_and_pd(velec
,cutoff_mask
);
657 velecsum
= _mm_add_pd(velecsum
,velec
);
661 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
663 /* Calculate temporary vectorial force */
664 tx
= _mm_mul_pd(fscal
,dx33
);
665 ty
= _mm_mul_pd(fscal
,dy33
);
666 tz
= _mm_mul_pd(fscal
,dz33
);
668 /* Update vectorial force */
669 fix3
= _mm_add_pd(fix3
,tx
);
670 fiy3
= _mm_add_pd(fiy3
,ty
);
671 fiz3
= _mm_add_pd(fiz3
,tz
);
673 fjx3
= _mm_add_pd(fjx3
,tx
);
674 fjy3
= _mm_add_pd(fjy3
,ty
);
675 fjz3
= _mm_add_pd(fjz3
,tz
);
679 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
681 /* Inner loop uses 368 flops */
688 j_coord_offsetA
= DIM
*jnrA
;
690 /* load j atom coordinates */
691 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
692 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
693 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
695 /* Calculate displacement vector */
696 dx00
= _mm_sub_pd(ix0
,jx0
);
697 dy00
= _mm_sub_pd(iy0
,jy0
);
698 dz00
= _mm_sub_pd(iz0
,jz0
);
699 dx11
= _mm_sub_pd(ix1
,jx1
);
700 dy11
= _mm_sub_pd(iy1
,jy1
);
701 dz11
= _mm_sub_pd(iz1
,jz1
);
702 dx12
= _mm_sub_pd(ix1
,jx2
);
703 dy12
= _mm_sub_pd(iy1
,jy2
);
704 dz12
= _mm_sub_pd(iz1
,jz2
);
705 dx13
= _mm_sub_pd(ix1
,jx3
);
706 dy13
= _mm_sub_pd(iy1
,jy3
);
707 dz13
= _mm_sub_pd(iz1
,jz3
);
708 dx21
= _mm_sub_pd(ix2
,jx1
);
709 dy21
= _mm_sub_pd(iy2
,jy1
);
710 dz21
= _mm_sub_pd(iz2
,jz1
);
711 dx22
= _mm_sub_pd(ix2
,jx2
);
712 dy22
= _mm_sub_pd(iy2
,jy2
);
713 dz22
= _mm_sub_pd(iz2
,jz2
);
714 dx23
= _mm_sub_pd(ix2
,jx3
);
715 dy23
= _mm_sub_pd(iy2
,jy3
);
716 dz23
= _mm_sub_pd(iz2
,jz3
);
717 dx31
= _mm_sub_pd(ix3
,jx1
);
718 dy31
= _mm_sub_pd(iy3
,jy1
);
719 dz31
= _mm_sub_pd(iz3
,jz1
);
720 dx32
= _mm_sub_pd(ix3
,jx2
);
721 dy32
= _mm_sub_pd(iy3
,jy2
);
722 dz32
= _mm_sub_pd(iz3
,jz2
);
723 dx33
= _mm_sub_pd(ix3
,jx3
);
724 dy33
= _mm_sub_pd(iy3
,jy3
);
725 dz33
= _mm_sub_pd(iz3
,jz3
);
727 /* Calculate squared distance and things based on it */
728 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
729 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
730 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
731 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
732 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
733 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
734 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
735 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
736 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
737 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
739 rinv11
= sse2_invsqrt_d(rsq11
);
740 rinv12
= sse2_invsqrt_d(rsq12
);
741 rinv13
= sse2_invsqrt_d(rsq13
);
742 rinv21
= sse2_invsqrt_d(rsq21
);
743 rinv22
= sse2_invsqrt_d(rsq22
);
744 rinv23
= sse2_invsqrt_d(rsq23
);
745 rinv31
= sse2_invsqrt_d(rsq31
);
746 rinv32
= sse2_invsqrt_d(rsq32
);
747 rinv33
= sse2_invsqrt_d(rsq33
);
749 rinvsq00
= sse2_inv_d(rsq00
);
750 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
751 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
752 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
753 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
754 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
755 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
756 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
757 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
758 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
760 fjx0
= _mm_setzero_pd();
761 fjy0
= _mm_setzero_pd();
762 fjz0
= _mm_setzero_pd();
763 fjx1
= _mm_setzero_pd();
764 fjy1
= _mm_setzero_pd();
765 fjz1
= _mm_setzero_pd();
766 fjx2
= _mm_setzero_pd();
767 fjy2
= _mm_setzero_pd();
768 fjz2
= _mm_setzero_pd();
769 fjx3
= _mm_setzero_pd();
770 fjy3
= _mm_setzero_pd();
771 fjz3
= _mm_setzero_pd();
773 /**************************
774 * CALCULATE INTERACTIONS *
775 **************************/
777 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
780 /* LENNARD-JONES DISPERSION/REPULSION */
782 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
783 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
784 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
785 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
786 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
787 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
789 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
793 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
794 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
798 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
800 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
802 /* Calculate temporary vectorial force */
803 tx
= _mm_mul_pd(fscal
,dx00
);
804 ty
= _mm_mul_pd(fscal
,dy00
);
805 tz
= _mm_mul_pd(fscal
,dz00
);
807 /* Update vectorial force */
808 fix0
= _mm_add_pd(fix0
,tx
);
809 fiy0
= _mm_add_pd(fiy0
,ty
);
810 fiz0
= _mm_add_pd(fiz0
,tz
);
812 fjx0
= _mm_add_pd(fjx0
,tx
);
813 fjy0
= _mm_add_pd(fjy0
,ty
);
814 fjz0
= _mm_add_pd(fjz0
,tz
);
818 /**************************
819 * CALCULATE INTERACTIONS *
820 **************************/
822 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
825 /* REACTION-FIELD ELECTROSTATICS */
826 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
827 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
829 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
831 /* Update potential sum for this i atom from the interaction with this j atom. */
832 velec
= _mm_and_pd(velec
,cutoff_mask
);
833 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
834 velecsum
= _mm_add_pd(velecsum
,velec
);
838 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
840 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
842 /* Calculate temporary vectorial force */
843 tx
= _mm_mul_pd(fscal
,dx11
);
844 ty
= _mm_mul_pd(fscal
,dy11
);
845 tz
= _mm_mul_pd(fscal
,dz11
);
847 /* Update vectorial force */
848 fix1
= _mm_add_pd(fix1
,tx
);
849 fiy1
= _mm_add_pd(fiy1
,ty
);
850 fiz1
= _mm_add_pd(fiz1
,tz
);
852 fjx1
= _mm_add_pd(fjx1
,tx
);
853 fjy1
= _mm_add_pd(fjy1
,ty
);
854 fjz1
= _mm_add_pd(fjz1
,tz
);
858 /**************************
859 * CALCULATE INTERACTIONS *
860 **************************/
862 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
865 /* REACTION-FIELD ELECTROSTATICS */
866 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
867 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
869 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
871 /* Update potential sum for this i atom from the interaction with this j atom. */
872 velec
= _mm_and_pd(velec
,cutoff_mask
);
873 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
874 velecsum
= _mm_add_pd(velecsum
,velec
);
878 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
880 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
882 /* Calculate temporary vectorial force */
883 tx
= _mm_mul_pd(fscal
,dx12
);
884 ty
= _mm_mul_pd(fscal
,dy12
);
885 tz
= _mm_mul_pd(fscal
,dz12
);
887 /* Update vectorial force */
888 fix1
= _mm_add_pd(fix1
,tx
);
889 fiy1
= _mm_add_pd(fiy1
,ty
);
890 fiz1
= _mm_add_pd(fiz1
,tz
);
892 fjx2
= _mm_add_pd(fjx2
,tx
);
893 fjy2
= _mm_add_pd(fjy2
,ty
);
894 fjz2
= _mm_add_pd(fjz2
,tz
);
898 /**************************
899 * CALCULATE INTERACTIONS *
900 **************************/
902 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
905 /* REACTION-FIELD ELECTROSTATICS */
906 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_add_pd(rinv13
,_mm_mul_pd(krf
,rsq13
)),crf
));
907 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
909 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
911 /* Update potential sum for this i atom from the interaction with this j atom. */
912 velec
= _mm_and_pd(velec
,cutoff_mask
);
913 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
914 velecsum
= _mm_add_pd(velecsum
,velec
);
918 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
920 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
922 /* Calculate temporary vectorial force */
923 tx
= _mm_mul_pd(fscal
,dx13
);
924 ty
= _mm_mul_pd(fscal
,dy13
);
925 tz
= _mm_mul_pd(fscal
,dz13
);
927 /* Update vectorial force */
928 fix1
= _mm_add_pd(fix1
,tx
);
929 fiy1
= _mm_add_pd(fiy1
,ty
);
930 fiz1
= _mm_add_pd(fiz1
,tz
);
932 fjx3
= _mm_add_pd(fjx3
,tx
);
933 fjy3
= _mm_add_pd(fjy3
,ty
);
934 fjz3
= _mm_add_pd(fjz3
,tz
);
938 /**************************
939 * CALCULATE INTERACTIONS *
940 **************************/
942 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
945 /* REACTION-FIELD ELECTROSTATICS */
946 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
947 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
949 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
951 /* Update potential sum for this i atom from the interaction with this j atom. */
952 velec
= _mm_and_pd(velec
,cutoff_mask
);
953 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
954 velecsum
= _mm_add_pd(velecsum
,velec
);
958 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
960 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
962 /* Calculate temporary vectorial force */
963 tx
= _mm_mul_pd(fscal
,dx21
);
964 ty
= _mm_mul_pd(fscal
,dy21
);
965 tz
= _mm_mul_pd(fscal
,dz21
);
967 /* Update vectorial force */
968 fix2
= _mm_add_pd(fix2
,tx
);
969 fiy2
= _mm_add_pd(fiy2
,ty
);
970 fiz2
= _mm_add_pd(fiz2
,tz
);
972 fjx1
= _mm_add_pd(fjx1
,tx
);
973 fjy1
= _mm_add_pd(fjy1
,ty
);
974 fjz1
= _mm_add_pd(fjz1
,tz
);
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
985 /* REACTION-FIELD ELECTROSTATICS */
986 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
987 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
989 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
991 /* Update potential sum for this i atom from the interaction with this j atom. */
992 velec
= _mm_and_pd(velec
,cutoff_mask
);
993 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
994 velecsum
= _mm_add_pd(velecsum
,velec
);
998 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1000 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1002 /* Calculate temporary vectorial force */
1003 tx
= _mm_mul_pd(fscal
,dx22
);
1004 ty
= _mm_mul_pd(fscal
,dy22
);
1005 tz
= _mm_mul_pd(fscal
,dz22
);
1007 /* Update vectorial force */
1008 fix2
= _mm_add_pd(fix2
,tx
);
1009 fiy2
= _mm_add_pd(fiy2
,ty
);
1010 fiz2
= _mm_add_pd(fiz2
,tz
);
1012 fjx2
= _mm_add_pd(fjx2
,tx
);
1013 fjy2
= _mm_add_pd(fjy2
,ty
);
1014 fjz2
= _mm_add_pd(fjz2
,tz
);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1025 /* REACTION-FIELD ELECTROSTATICS */
1026 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_add_pd(rinv23
,_mm_mul_pd(krf
,rsq23
)),crf
));
1027 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
1029 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1031 /* Update potential sum for this i atom from the interaction with this j atom. */
1032 velec
= _mm_and_pd(velec
,cutoff_mask
);
1033 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1034 velecsum
= _mm_add_pd(velecsum
,velec
);
1038 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1040 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1042 /* Calculate temporary vectorial force */
1043 tx
= _mm_mul_pd(fscal
,dx23
);
1044 ty
= _mm_mul_pd(fscal
,dy23
);
1045 tz
= _mm_mul_pd(fscal
,dz23
);
1047 /* Update vectorial force */
1048 fix2
= _mm_add_pd(fix2
,tx
);
1049 fiy2
= _mm_add_pd(fiy2
,ty
);
1050 fiz2
= _mm_add_pd(fiz2
,tz
);
1052 fjx3
= _mm_add_pd(fjx3
,tx
);
1053 fjy3
= _mm_add_pd(fjy3
,ty
);
1054 fjz3
= _mm_add_pd(fjz3
,tz
);
1058 /**************************
1059 * CALCULATE INTERACTIONS *
1060 **************************/
1062 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1065 /* REACTION-FIELD ELECTROSTATICS */
1066 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_add_pd(rinv31
,_mm_mul_pd(krf
,rsq31
)),crf
));
1067 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
1069 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1071 /* Update potential sum for this i atom from the interaction with this j atom. */
1072 velec
= _mm_and_pd(velec
,cutoff_mask
);
1073 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1074 velecsum
= _mm_add_pd(velecsum
,velec
);
1078 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1080 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1082 /* Calculate temporary vectorial force */
1083 tx
= _mm_mul_pd(fscal
,dx31
);
1084 ty
= _mm_mul_pd(fscal
,dy31
);
1085 tz
= _mm_mul_pd(fscal
,dz31
);
1087 /* Update vectorial force */
1088 fix3
= _mm_add_pd(fix3
,tx
);
1089 fiy3
= _mm_add_pd(fiy3
,ty
);
1090 fiz3
= _mm_add_pd(fiz3
,tz
);
1092 fjx1
= _mm_add_pd(fjx1
,tx
);
1093 fjy1
= _mm_add_pd(fjy1
,ty
);
1094 fjz1
= _mm_add_pd(fjz1
,tz
);
1098 /**************************
1099 * CALCULATE INTERACTIONS *
1100 **************************/
1102 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1105 /* REACTION-FIELD ELECTROSTATICS */
1106 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_add_pd(rinv32
,_mm_mul_pd(krf
,rsq32
)),crf
));
1107 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
1109 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1111 /* Update potential sum for this i atom from the interaction with this j atom. */
1112 velec
= _mm_and_pd(velec
,cutoff_mask
);
1113 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1114 velecsum
= _mm_add_pd(velecsum
,velec
);
1118 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1120 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1122 /* Calculate temporary vectorial force */
1123 tx
= _mm_mul_pd(fscal
,dx32
);
1124 ty
= _mm_mul_pd(fscal
,dy32
);
1125 tz
= _mm_mul_pd(fscal
,dz32
);
1127 /* Update vectorial force */
1128 fix3
= _mm_add_pd(fix3
,tx
);
1129 fiy3
= _mm_add_pd(fiy3
,ty
);
1130 fiz3
= _mm_add_pd(fiz3
,tz
);
1132 fjx2
= _mm_add_pd(fjx2
,tx
);
1133 fjy2
= _mm_add_pd(fjy2
,ty
);
1134 fjz2
= _mm_add_pd(fjz2
,tz
);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1145 /* REACTION-FIELD ELECTROSTATICS */
1146 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_add_pd(rinv33
,_mm_mul_pd(krf
,rsq33
)),crf
));
1147 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
1149 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1151 /* Update potential sum for this i atom from the interaction with this j atom. */
1152 velec
= _mm_and_pd(velec
,cutoff_mask
);
1153 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1154 velecsum
= _mm_add_pd(velecsum
,velec
);
1158 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1160 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1162 /* Calculate temporary vectorial force */
1163 tx
= _mm_mul_pd(fscal
,dx33
);
1164 ty
= _mm_mul_pd(fscal
,dy33
);
1165 tz
= _mm_mul_pd(fscal
,dz33
);
1167 /* Update vectorial force */
1168 fix3
= _mm_add_pd(fix3
,tx
);
1169 fiy3
= _mm_add_pd(fiy3
,ty
);
1170 fiz3
= _mm_add_pd(fiz3
,tz
);
1172 fjx3
= _mm_add_pd(fjx3
,tx
);
1173 fjy3
= _mm_add_pd(fjy3
,ty
);
1174 fjz3
= _mm_add_pd(fjz3
,tz
);
1178 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1180 /* Inner loop uses 368 flops */
1183 /* End of innermost loop */
1185 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1186 f
+i_coord_offset
,fshift
+i_shift_offset
);
1189 /* Update potential energies */
1190 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1191 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1193 /* Increment number of inner iterations */
1194 inneriter
+= j_index_end
- j_index_start
;
1196 /* Outer loop uses 26 flops */
1199 /* Increment number of outer iterations */
1202 /* Update outer/inner flops */
1204 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*368);
/*
 * Force-only kernel entry point for the Water4-Water4 geometry: reaction-field
 * electrostatics plus Lennard-Jones, as stated by the generated header lines
 * that follow.
 *
 * Parameters (as named in the signature):
 *   nlist       - neighbour list; the body reads its jindex, shift and iinr members.
 *   xx, ff      - rvec arrays. NOTE(review): presumably coordinates in and forces
 *                 out; the pointers x and f used by the body are not visibly
 *                 assigned in this listing - confirm against the generator output.
 *   fr          - force record; supplies shift_vec, fshift, ntype, nbfp and the
 *                 reaction-field and cutoff constants reached through its ic member.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - marked gmx_unused; not referenced by the visible body.
 *   nrnb        - counter object handed to inc_nrnb() as the last visible statement.
 *
 * NOTE(review): the comment delimiters around the generated header, the return
 * type and the function braces are not visible in this listing.
 */
1207 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double
1208 * Electrostatics interaction: ReactionField
1209 * VdW interaction: LennardJones
1210 * Geometry: Water4-Water4
1211 * Calculate force/pot: Force
1214 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_double
1215 (t_nblist
* gmx_restrict nlist
,
1216 rvec
* gmx_restrict xx
,
1217 rvec
* gmx_restrict ff
,
1218 struct t_forcerec
* gmx_restrict fr
,
1219 t_mdatoms
* gmx_restrict mdatoms
,
1220 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1221 t_nrnb
* gmx_restrict nrnb
)
// Local declarations of the force-only kernel, introduced by the generated naming note.
// NOTE(review): the block comment opened on the next line has no visible terminator in
// this listing, so only line comments are added in this region - confirm against the
// generator output.
1223 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1224 * just 0 for non-waters.
1225 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1226 * jnr indices corresponding to data put in the four positions in the SIMD register.
// Offsets into the shift/coordinate arrays and the iteration tallies used for flop accounting.
1228 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
// Loop bounds and indices for the outer (iidx) and inner (jidx) loops.
1229 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
// NOTE(review): jnrA and jnrB are used later but their declaration is not visible here.
1231 int j_coord_offsetA
,j_coord_offsetB
;
1232 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1233 real rcutoff_scalar
;
1234 real
*shiftvec
,*fshift
,*x
,*f
;
// Per-pair scratch registers: temporary force components, scalar force factor, cutoff values.
1235 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
// One group per i atom (suffix 0..3): coordinates, force accumulators, charge.
1237 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1239 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1241 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1243 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
// One group per j atom (suffix 0..3): LJ type indices for lanes A/B, coordinates, force accumulators, charge.
1244 int vdwjidx0A
,vdwjidx0B
;
1245 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1246 int vdwjidx1A
,vdwjidx1B
;
1247 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1248 int vdwjidx2A
,vdwjidx2B
;
1249 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1250 int vdwjidx3A
,vdwjidx3B
;
1251 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
// One group per i/j atom pair (first digit i atom, second digit j atom):
// displacement, squared distance, inverse powers, charge product, LJ parameters.
1252 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1253 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1254 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1255 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1256 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1257 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1258 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1259 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1260 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1261 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
// Electrostatics registers; of these the visible force-only code reads felec inputs facel and krf2.
1262 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
// Lennard-Jones registers; the visible force-only code uses rinvsix and fvdw.
1265 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
// SIMD constants and masks; only cutoff_mask is referenced by the visible force-only code.
1268 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1269 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1270 __m128d dummy_mask
,cutoff_mask
;
1271 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1272 __m128d one
= _mm_set1_pd(1.0);
1273 __m128d two
= _mm_set1_pd(2.0);
// Cache list and parameter pointers, and broadcast scalar constants into SIMD registers.
// NOTE(review): an earlier block comment ("Suffixes 0,1,2,3 ...") has no visible terminator
// in this listing, so line comments are used until the next visible terminator - confirm.
// NOTE(review): nri, jjnr, x and f are read later, but their assignments (and the
// declarations of charge, nvdwtype, vdwparam, vdwtype, vdwioffset0) are not visible here.
1279 jindex
= nlist
->jindex
;
1281 shiftidx
= nlist
->shift
;
1283 shiftvec
= fr
->shift_vec
[0];
1284 fshift
= fr
->fshift
[0];
// facel scales the i-atom charges below.
1285 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
1286 charge
= mdatoms
->chargeA
;
// Reaction-field constants; krf2 holds 2*k_rf and is the term subtracted in every felec expression.
1287 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
1288 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
// crf is set here but not referenced again in the visible force-only code.
1289 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
1290 nvdwtype
= fr
->ntype
;
1291 vdwparam
= fr
->nbfp
;
1292 vdwtype
= mdatoms
->typeA
;
1294 /* Setup water-specific parameters */
/*
 * Charges and the LJ type are read once, from the first i entry (iinr[0]), and
 * the same charge[inr+1..3] values are used for both the i and the j side:
 * atoms 1-3 contribute charge products, atom 0 contributes the LJ parameters.
 */
1295 inr
= nlist
->iinr
[0];
1296 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1297 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1298 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1299 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1301 jq1
= _mm_set1_pd(charge
[inr
+1]);
1302 jq2
= _mm_set1_pd(charge
[inr
+2]);
1303 jq3
= _mm_set1_pd(charge
[inr
+3]);
/* c6 and c12 sit next to each other in vdwparam: index and index+1. */
1304 vdwjidx0A
= 2*vdwtype
[inr
+0];
1305 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1306 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
/* Charge products for the nine charged atom pairs, computed once outside the loops. */
1307 qq11
= _mm_mul_pd(iq1
,jq1
);
1308 qq12
= _mm_mul_pd(iq1
,jq2
);
1309 qq13
= _mm_mul_pd(iq1
,jq3
);
1310 qq21
= _mm_mul_pd(iq2
,jq1
);
1311 qq22
= _mm_mul_pd(iq2
,jq2
);
1312 qq23
= _mm_mul_pd(iq2
,jq3
);
1313 qq31
= _mm_mul_pd(iq3
,jq1
);
1314 qq32
= _mm_mul_pd(iq3
,jq2
);
1315 qq33
= _mm_mul_pd(iq3
,jq3
);
1317 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1318 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1319 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
/* All cutoff tests below compare squared distances against rcutoff2. */
1320 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
/* NOTE(review): sh_vdw_invrcut6 and rvdw are set here but not referenced again in the visible force-only code. */
1322 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
1323 rvdw
= _mm_set1_pd(fr
->ic
->rvdw
);
1325 /* Avoid stupid compiler warnings */
1327 j_coord_offsetA
= 0;
1328 j_coord_offsetB
= 0;
/*
 * Outer loop head: each iteration sets up one i entry (four atoms, suffixes 0..3),
 * loads its shifted coordinates and clears its force accumulators before the
 * j loops run.
 * NOTE(review): the loop braces and the assignment of nri are not visible in this listing.
 */
1333 /* Start outer loop over neighborlists */
1334 for(iidx
=0; iidx
<nri
; iidx
++)
1336 /* Load shift vector for this list */
1337 i_shift_offset
= DIM
*shiftidx
[iidx
];
1339 /* Load limits for loop over neighbors */
1340 j_index_start
= jindex
[iidx
];
1341 j_index_end
= jindex
[iidx
+1];
1343 /* Get outer coordinate index */
/*
 * NOTE(review): no per-iteration update of inr is visible here; as listed, inr
 * still holds the value read from nlist->iinr[0] during setup - confirm against
 * the generator output.
 */
1345 i_coord_offset
= DIM
*inr
;
1347 /* Load i particle coords and add shift vector */
1348 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1349 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
/* Start every i-atom force accumulator at zero for this outer iteration. */
1351 fix0
= _mm_setzero_pd();
1352 fiy0
= _mm_setzero_pd();
1353 fiz0
= _mm_setzero_pd();
1354 fix1
= _mm_setzero_pd();
1355 fiy1
= _mm_setzero_pd();
1356 fiz1
= _mm_setzero_pd();
1357 fix2
= _mm_setzero_pd();
1358 fiy2
= _mm_setzero_pd();
1359 fiz2
= _mm_setzero_pd();
1360 fix3
= _mm_setzero_pd();
1361 fiy3
= _mm_setzero_pd();
1362 fiz3
= _mm_setzero_pd();
/*
 * Main j loop head: neighbours are consumed two at a time (jidx += 2) and the
 * loop only runs while at least two remain (jidx < j_index_end-1). The two
 * neighbours are addressed through the offsets with suffixes A and B.
 */
1364 /* Start inner kernel loop */
1365 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1368 /* Get j neighbor index, and coordinate index */
/* NOTE(review): only the jnrB load is visible; the jnrA load is missing from this listing - confirm. */
1370 jnrB
= jjnr
[jidx
+1];
1371 j_coord_offsetA
= DIM
*jnrA
;
1372 j_coord_offsetB
= DIM
*jnrB
;
1374 /* load j atom coordinates */
1375 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1376 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1377 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
/*
 * Ten pairs are evaluated: 0-0 (Lennard-Jones) and the nine combinations of
 * atoms 1..3 (reaction-field). Displacements are i minus j.
 */
1379 /* Calculate displacement vector */
1380 dx00
= _mm_sub_pd(ix0
,jx0
);
1381 dy00
= _mm_sub_pd(iy0
,jy0
);
1382 dz00
= _mm_sub_pd(iz0
,jz0
);
1383 dx11
= _mm_sub_pd(ix1
,jx1
);
1384 dy11
= _mm_sub_pd(iy1
,jy1
);
1385 dz11
= _mm_sub_pd(iz1
,jz1
);
1386 dx12
= _mm_sub_pd(ix1
,jx2
);
1387 dy12
= _mm_sub_pd(iy1
,jy2
);
1388 dz12
= _mm_sub_pd(iz1
,jz2
);
1389 dx13
= _mm_sub_pd(ix1
,jx3
);
1390 dy13
= _mm_sub_pd(iy1
,jy3
);
1391 dz13
= _mm_sub_pd(iz1
,jz3
);
1392 dx21
= _mm_sub_pd(ix2
,jx1
);
1393 dy21
= _mm_sub_pd(iy2
,jy1
);
1394 dz21
= _mm_sub_pd(iz2
,jz1
);
1395 dx22
= _mm_sub_pd(ix2
,jx2
);
1396 dy22
= _mm_sub_pd(iy2
,jy2
);
1397 dz22
= _mm_sub_pd(iz2
,jz2
);
1398 dx23
= _mm_sub_pd(ix2
,jx3
);
1399 dy23
= _mm_sub_pd(iy2
,jy3
);
1400 dz23
= _mm_sub_pd(iz2
,jz3
);
1401 dx31
= _mm_sub_pd(ix3
,jx1
);
1402 dy31
= _mm_sub_pd(iy3
,jy1
);
1403 dz31
= _mm_sub_pd(iz3
,jz1
);
1404 dx32
= _mm_sub_pd(ix3
,jx2
);
1405 dy32
= _mm_sub_pd(iy3
,jy2
);
1406 dz32
= _mm_sub_pd(iz3
,jz2
);
1407 dx33
= _mm_sub_pd(ix3
,jx3
);
1408 dy33
= _mm_sub_pd(iy3
,jy3
);
1409 dz33
= _mm_sub_pd(iz3
,jz3
);
1411 /* Calculate squared distance and things based on it */
1412 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1413 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1414 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1415 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1416 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1417 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1418 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1419 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1420 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1421 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
/* The inverse distance is only computed for the nine charged pairs; the 0-0 pair gets 1/r^2 directly below. */
1423 rinv11
= sse2_invsqrt_d(rsq11
);
1424 rinv12
= sse2_invsqrt_d(rsq12
);
1425 rinv13
= sse2_invsqrt_d(rsq13
);
1426 rinv21
= sse2_invsqrt_d(rsq21
);
1427 rinv22
= sse2_invsqrt_d(rsq22
);
1428 rinv23
= sse2_invsqrt_d(rsq23
);
1429 rinv31
= sse2_invsqrt_d(rsq31
);
1430 rinv32
= sse2_invsqrt_d(rsq32
);
1431 rinv33
= sse2_invsqrt_d(rsq33
);
1433 rinvsq00
= sse2_inv_d(rsq00
);
1434 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1435 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1436 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1437 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1438 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1439 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1440 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1441 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1442 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
/* Clear the j-force accumulators for this pair of neighbours. */
1444 fjx0
= _mm_setzero_pd();
1445 fjy0
= _mm_setzero_pd();
1446 fjz0
= _mm_setzero_pd();
1447 fjx1
= _mm_setzero_pd();
1448 fjy1
= _mm_setzero_pd();
1449 fjz1
= _mm_setzero_pd();
1450 fjx2
= _mm_setzero_pd();
1451 fjy2
= _mm_setzero_pd();
1452 fjz2
= _mm_setzero_pd();
1453 fjx3
= _mm_setzero_pd();
1454 fjy3
= _mm_setzero_pd();
1455 fjz3
= _mm_setzero_pd();
/*
 * Pair i0-j0 in the two-neighbour loop: Lennard-Jones only.
 * The force factor is fvdw = (c12_00*rinvsix - c6_00) * rinvsix * rinvsq00.
 * NOTE(review): the statement that loads fscal from fvdw is not visible in this
 * listing (fscal is masked below without a visible assignment), and the braces
 * of the guarding if are missing - confirm against the generator output.
 */
1457 /**************************
1458 * CALCULATE INTERACTIONS *
1459 **************************/
/* Guard on the squared distance; presumably true when any lane is inside the cutoff (helper body not shown). */
1461 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1464 /* LENNARD-JONES DISPERSION/REPULSION */
1466 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1467 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
/* Per-lane mask: all ones where rsq00 < rcutoff2, so lanes outside the cutoff get a zero force factor. */
1469 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1473 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1475 /* Calculate temporary vectorial force */
1476 tx
= _mm_mul_pd(fscal
,dx00
);
1477 ty
= _mm_mul_pd(fscal
,dy00
);
1478 tz
= _mm_mul_pd(fscal
,dz00
);
1480 /* Update vectorial force */
1481 fix0
= _mm_add_pd(fix0
,tx
);
1482 fiy0
= _mm_add_pd(fiy0
,ty
);
1483 fiz0
= _mm_add_pd(fiz0
,tz
);
/* The same increments are accumulated for the j atom; they are passed to the gmx_mm_decrement_4rvec_2ptr_swizzle_pd call after the last pair. */
1485 fjx0
= _mm_add_pd(fjx0
,tx
);
1486 fjy0
= _mm_add_pd(fjy0
,ty
);
1487 fjz0
= _mm_add_pd(fjz0
,tz
);
/*
 * Reaction-field pairs of i atom 1 with j atoms 1, 2 and 3 (two-neighbour loop).
 * Each pair computes felec = qq * (rinv*rinvsq - krf2), masks the force factor
 * by the cutoff, and adds fscal*d to both the i-atom and the j-atom accumulators.
 * NOTE(review): the statement that loads fscal from felec is not visible in this
 * listing, and the braces of the guarding ifs are missing - confirm against the
 * generator output.
 */
/* Pair i1-j1. */
1491 /**************************
1492 * CALCULATE INTERACTIONS *
1493 **************************/
1495 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1498 /* REACTION-FIELD ELECTROSTATICS */
1499 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1501 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1505 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1507 /* Calculate temporary vectorial force */
1508 tx
= _mm_mul_pd(fscal
,dx11
);
1509 ty
= _mm_mul_pd(fscal
,dy11
);
1510 tz
= _mm_mul_pd(fscal
,dz11
);
1512 /* Update vectorial force */
1513 fix1
= _mm_add_pd(fix1
,tx
);
1514 fiy1
= _mm_add_pd(fiy1
,ty
);
1515 fiz1
= _mm_add_pd(fiz1
,tz
);
1517 fjx1
= _mm_add_pd(fjx1
,tx
);
1518 fjy1
= _mm_add_pd(fjy1
,ty
);
1519 fjz1
= _mm_add_pd(fjz1
,tz
);
/* Pair i1-j2. */
1523 /**************************
1524 * CALCULATE INTERACTIONS *
1525 **************************/
1527 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1530 /* REACTION-FIELD ELECTROSTATICS */
1531 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1533 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1537 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1539 /* Calculate temporary vectorial force */
1540 tx
= _mm_mul_pd(fscal
,dx12
);
1541 ty
= _mm_mul_pd(fscal
,dy12
);
1542 tz
= _mm_mul_pd(fscal
,dz12
);
1544 /* Update vectorial force */
1545 fix1
= _mm_add_pd(fix1
,tx
);
1546 fiy1
= _mm_add_pd(fiy1
,ty
);
1547 fiz1
= _mm_add_pd(fiz1
,tz
);
1549 fjx2
= _mm_add_pd(fjx2
,tx
);
1550 fjy2
= _mm_add_pd(fjy2
,ty
);
1551 fjz2
= _mm_add_pd(fjz2
,tz
);
/* Pair i1-j3. */
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1559 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1562 /* REACTION-FIELD ELECTROSTATICS */
1563 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
1565 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1569 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1571 /* Calculate temporary vectorial force */
1572 tx
= _mm_mul_pd(fscal
,dx13
);
1573 ty
= _mm_mul_pd(fscal
,dy13
);
1574 tz
= _mm_mul_pd(fscal
,dz13
);
1576 /* Update vectorial force */
1577 fix1
= _mm_add_pd(fix1
,tx
);
1578 fiy1
= _mm_add_pd(fiy1
,ty
);
1579 fiz1
= _mm_add_pd(fiz1
,tz
);
1581 fjx3
= _mm_add_pd(fjx3
,tx
);
1582 fjy3
= _mm_add_pd(fjy3
,ty
);
1583 fjz3
= _mm_add_pd(fjz3
,tz
);
/*
 * Reaction-field pairs of i atom 2 with j atoms 1, 2 and 3 (two-neighbour loop).
 * Each pair computes felec = qq * (rinv*rinvsq - krf2), masks the force factor
 * by the cutoff, and adds fscal*d to both the i-atom and the j-atom accumulators.
 * NOTE(review): the statement that loads fscal from felec is not visible in this
 * listing, and the braces of the guarding ifs are missing - confirm against the
 * generator output.
 */
/* Pair i2-j1. */
1587 /**************************
1588 * CALCULATE INTERACTIONS *
1589 **************************/
1591 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1594 /* REACTION-FIELD ELECTROSTATICS */
1595 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
1597 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1601 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1603 /* Calculate temporary vectorial force */
1604 tx
= _mm_mul_pd(fscal
,dx21
);
1605 ty
= _mm_mul_pd(fscal
,dy21
);
1606 tz
= _mm_mul_pd(fscal
,dz21
);
1608 /* Update vectorial force */
1609 fix2
= _mm_add_pd(fix2
,tx
);
1610 fiy2
= _mm_add_pd(fiy2
,ty
);
1611 fiz2
= _mm_add_pd(fiz2
,tz
);
1613 fjx1
= _mm_add_pd(fjx1
,tx
);
1614 fjy1
= _mm_add_pd(fjy1
,ty
);
1615 fjz1
= _mm_add_pd(fjz1
,tz
);
/* Pair i2-j2. */
1619 /**************************
1620 * CALCULATE INTERACTIONS *
1621 **************************/
1623 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1626 /* REACTION-FIELD ELECTROSTATICS */
1627 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1629 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1633 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1635 /* Calculate temporary vectorial force */
1636 tx
= _mm_mul_pd(fscal
,dx22
);
1637 ty
= _mm_mul_pd(fscal
,dy22
);
1638 tz
= _mm_mul_pd(fscal
,dz22
);
1640 /* Update vectorial force */
1641 fix2
= _mm_add_pd(fix2
,tx
);
1642 fiy2
= _mm_add_pd(fiy2
,ty
);
1643 fiz2
= _mm_add_pd(fiz2
,tz
);
1645 fjx2
= _mm_add_pd(fjx2
,tx
);
1646 fjy2
= _mm_add_pd(fjy2
,ty
);
1647 fjz2
= _mm_add_pd(fjz2
,tz
);
/* Pair i2-j3. */
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1658 /* REACTION-FIELD ELECTROSTATICS */
1659 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
1661 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1665 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1667 /* Calculate temporary vectorial force */
1668 tx
= _mm_mul_pd(fscal
,dx23
);
1669 ty
= _mm_mul_pd(fscal
,dy23
);
1670 tz
= _mm_mul_pd(fscal
,dz23
);
1672 /* Update vectorial force */
1673 fix2
= _mm_add_pd(fix2
,tx
);
1674 fiy2
= _mm_add_pd(fiy2
,ty
);
1675 fiz2
= _mm_add_pd(fiz2
,tz
);
1677 fjx3
= _mm_add_pd(fjx3
,tx
);
1678 fjy3
= _mm_add_pd(fjy3
,ty
);
1679 fjz3
= _mm_add_pd(fjz3
,tz
);
/*
 * Reaction-field pairs of i atom 3 with j atoms 1, 2 and 3 (two-neighbour loop),
 * followed by the write-back of the accumulated j forces.
 * Each pair computes felec = qq * (rinv*rinvsq - krf2), masks the force factor
 * by the cutoff, and adds fscal*d to both the i-atom and the j-atom accumulators.
 * NOTE(review): the statement that loads fscal from felec is not visible in this
 * listing, and the braces of the guarding ifs are missing - confirm against the
 * generator output.
 */
/* Pair i3-j1. */
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1690 /* REACTION-FIELD ELECTROSTATICS */
1691 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
1693 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1697 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1699 /* Calculate temporary vectorial force */
1700 tx
= _mm_mul_pd(fscal
,dx31
);
1701 ty
= _mm_mul_pd(fscal
,dy31
);
1702 tz
= _mm_mul_pd(fscal
,dz31
);
1704 /* Update vectorial force */
1705 fix3
= _mm_add_pd(fix3
,tx
);
1706 fiy3
= _mm_add_pd(fiy3
,ty
);
1707 fiz3
= _mm_add_pd(fiz3
,tz
);
1709 fjx1
= _mm_add_pd(fjx1
,tx
);
1710 fjy1
= _mm_add_pd(fjy1
,ty
);
1711 fjz1
= _mm_add_pd(fjz1
,tz
);
/* Pair i3-j2. */
1715 /**************************
1716 * CALCULATE INTERACTIONS *
1717 **************************/
1719 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1722 /* REACTION-FIELD ELECTROSTATICS */
1723 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
1725 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1729 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1731 /* Calculate temporary vectorial force */
1732 tx
= _mm_mul_pd(fscal
,dx32
);
1733 ty
= _mm_mul_pd(fscal
,dy32
);
1734 tz
= _mm_mul_pd(fscal
,dz32
);
1736 /* Update vectorial force */
1737 fix3
= _mm_add_pd(fix3
,tx
);
1738 fiy3
= _mm_add_pd(fiy3
,ty
);
1739 fiz3
= _mm_add_pd(fiz3
,tz
);
1741 fjx2
= _mm_add_pd(fjx2
,tx
);
1742 fjy2
= _mm_add_pd(fjy2
,ty
);
1743 fjz2
= _mm_add_pd(fjz2
,tz
);
/* Pair i3-j3. */
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1754 /* REACTION-FIELD ELECTROSTATICS */
1755 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
1757 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1761 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1763 /* Calculate temporary vectorial force */
1764 tx
= _mm_mul_pd(fscal
,dx33
);
1765 ty
= _mm_mul_pd(fscal
,dy33
);
1766 tz
= _mm_mul_pd(fscal
,dz33
);
1768 /* Update vectorial force */
1769 fix3
= _mm_add_pd(fix3
,tx
);
1770 fiy3
= _mm_add_pd(fiy3
,ty
);
1771 fiz3
= _mm_add_pd(fiz3
,tz
);
1773 fjx3
= _mm_add_pd(fjx3
,tx
);
1774 fjy3
= _mm_add_pd(fjy3
,ty
);
1775 fjz3
= _mm_add_pd(fjz3
,tz
);
/*
 * Pass the twelve j-force accumulators to the two-pointer helper, one
 * destination per neighbour (offsets A and B into f).
 * NOTE(review): the helper name suggests it subtracts them from f - confirm in
 * kernelutil_x86_sse2_double.h.
 */
1779 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1781 /* Inner loop uses 303 flops */
/*
 * Remainder head: the two-at-a-time loop stops at jidx < j_index_end-1, so one
 * neighbour can be left over; it is handled here with the single-pointer load
 * helper and the same distance set-up as the main loop.
 * NOTE(review): the braces of this if and the load of jnrA are not visible in
 * this listing - confirm against the generator output.
 */
1784 if(jidx
<j_index_end
)
1788 j_coord_offsetA
= DIM
*jnrA
;
1790 /* load j atom coordinates */
1791 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1792 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1793 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1795 /* Calculate displacement vector */
1796 dx00
= _mm_sub_pd(ix0
,jx0
);
1797 dy00
= _mm_sub_pd(iy0
,jy0
);
1798 dz00
= _mm_sub_pd(iz0
,jz0
);
1799 dx11
= _mm_sub_pd(ix1
,jx1
);
1800 dy11
= _mm_sub_pd(iy1
,jy1
);
1801 dz11
= _mm_sub_pd(iz1
,jz1
);
1802 dx12
= _mm_sub_pd(ix1
,jx2
);
1803 dy12
= _mm_sub_pd(iy1
,jy2
);
1804 dz12
= _mm_sub_pd(iz1
,jz2
);
1805 dx13
= _mm_sub_pd(ix1
,jx3
);
1806 dy13
= _mm_sub_pd(iy1
,jy3
);
1807 dz13
= _mm_sub_pd(iz1
,jz3
);
1808 dx21
= _mm_sub_pd(ix2
,jx1
);
1809 dy21
= _mm_sub_pd(iy2
,jy1
);
1810 dz21
= _mm_sub_pd(iz2
,jz1
);
1811 dx22
= _mm_sub_pd(ix2
,jx2
);
1812 dy22
= _mm_sub_pd(iy2
,jy2
);
1813 dz22
= _mm_sub_pd(iz2
,jz2
);
1814 dx23
= _mm_sub_pd(ix2
,jx3
);
1815 dy23
= _mm_sub_pd(iy2
,jy3
);
1816 dz23
= _mm_sub_pd(iz2
,jz3
);
1817 dx31
= _mm_sub_pd(ix3
,jx1
);
1818 dy31
= _mm_sub_pd(iy3
,jy1
);
1819 dz31
= _mm_sub_pd(iz3
,jz1
);
1820 dx32
= _mm_sub_pd(ix3
,jx2
);
1821 dy32
= _mm_sub_pd(iy3
,jy2
);
1822 dz32
= _mm_sub_pd(iz3
,jz2
);
1823 dx33
= _mm_sub_pd(ix3
,jx3
);
1824 dy33
= _mm_sub_pd(iy3
,jy3
);
1825 dz33
= _mm_sub_pd(iz3
,jz3
);
1827 /* Calculate squared distance and things based on it */
1828 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1829 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1830 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1831 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1832 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1833 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1834 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1835 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1836 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1837 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
/* The inverse distance is only computed for the nine charged pairs; the 0-0 pair gets 1/r^2 directly below. */
1839 rinv11
= sse2_invsqrt_d(rsq11
);
1840 rinv12
= sse2_invsqrt_d(rsq12
);
1841 rinv13
= sse2_invsqrt_d(rsq13
);
1842 rinv21
= sse2_invsqrt_d(rsq21
);
1843 rinv22
= sse2_invsqrt_d(rsq22
);
1844 rinv23
= sse2_invsqrt_d(rsq23
);
1845 rinv31
= sse2_invsqrt_d(rsq31
);
1846 rinv32
= sse2_invsqrt_d(rsq32
);
1847 rinv33
= sse2_invsqrt_d(rsq33
);
1849 rinvsq00
= sse2_inv_d(rsq00
);
1850 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1851 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1852 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1853 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1854 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1855 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1856 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1857 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1858 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
/* Clear the j-force accumulators for the remaining neighbour. */
1860 fjx0
= _mm_setzero_pd();
1861 fjy0
= _mm_setzero_pd();
1862 fjz0
= _mm_setzero_pd();
1863 fjx1
= _mm_setzero_pd();
1864 fjy1
= _mm_setzero_pd();
1865 fjz1
= _mm_setzero_pd();
1866 fjx2
= _mm_setzero_pd();
1867 fjy2
= _mm_setzero_pd();
1868 fjz2
= _mm_setzero_pd();
1869 fjx3
= _mm_setzero_pd();
1870 fjy3
= _mm_setzero_pd();
1871 fjz3
= _mm_setzero_pd();
/*
 * Pair i0-j0 for the single remaining neighbour: Lennard-Jones only.
 * The force factor is fvdw = (c12_00*rinvsix - c6_00) * rinvsix * rinvsq00.
 * NOTE(review): the statement that loads fscal from fvdw is not visible in this
 * listing (fscal is masked below without a visible assignment), and the braces
 * of the guarding if are missing - confirm against the generator output.
 */
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1880 /* LENNARD-JONES DISPERSION/REPULSION */
1882 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1883 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
/* Per-lane mask: all ones where rsq00 < rcutoff2. */
1885 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1889 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
/* Unpacking with a zero vector keeps the low lane of fscal and zeroes the high lane, so the lane without a neighbour adds no force. */
1891 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1893 /* Calculate temporary vectorial force */
1894 tx
= _mm_mul_pd(fscal
,dx00
);
1895 ty
= _mm_mul_pd(fscal
,dy00
);
1896 tz
= _mm_mul_pd(fscal
,dz00
);
1898 /* Update vectorial force */
1899 fix0
= _mm_add_pd(fix0
,tx
);
1900 fiy0
= _mm_add_pd(fiy0
,ty
);
1901 fiz0
= _mm_add_pd(fiz0
,tz
);
1903 fjx0
= _mm_add_pd(fjx0
,tx
);
1904 fjy0
= _mm_add_pd(fjy0
,ty
);
1905 fjz0
= _mm_add_pd(fjz0
,tz
);
/*
 * Reaction-field pairs of i atom 1 with j atoms 1, 2 and 3 for the single
 * remaining neighbour. Same arithmetic as the two-neighbour loop, plus an
 * unpack with a zero vector that keeps only the low lane of fscal.
 * NOTE(review): the statement that loads fscal from felec is not visible in this
 * listing, and the braces of the guarding ifs are missing - confirm against the
 * generator output.
 */
/* Pair i1-j1. */
1909 /**************************
1910 * CALCULATE INTERACTIONS *
1911 **************************/
1913 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1916 /* REACTION-FIELD ELECTROSTATICS */
1917 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1919 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1923 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1925 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1927 /* Calculate temporary vectorial force */
1928 tx
= _mm_mul_pd(fscal
,dx11
);
1929 ty
= _mm_mul_pd(fscal
,dy11
);
1930 tz
= _mm_mul_pd(fscal
,dz11
);
1932 /* Update vectorial force */
1933 fix1
= _mm_add_pd(fix1
,tx
);
1934 fiy1
= _mm_add_pd(fiy1
,ty
);
1935 fiz1
= _mm_add_pd(fiz1
,tz
);
1937 fjx1
= _mm_add_pd(fjx1
,tx
);
1938 fjy1
= _mm_add_pd(fjy1
,ty
);
1939 fjz1
= _mm_add_pd(fjz1
,tz
);
/* Pair i1-j2. */
1943 /**************************
1944 * CALCULATE INTERACTIONS *
1945 **************************/
1947 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1950 /* REACTION-FIELD ELECTROSTATICS */
1951 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1953 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1957 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1959 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1961 /* Calculate temporary vectorial force */
1962 tx
= _mm_mul_pd(fscal
,dx12
);
1963 ty
= _mm_mul_pd(fscal
,dy12
);
1964 tz
= _mm_mul_pd(fscal
,dz12
);
1966 /* Update vectorial force */
1967 fix1
= _mm_add_pd(fix1
,tx
);
1968 fiy1
= _mm_add_pd(fiy1
,ty
);
1969 fiz1
= _mm_add_pd(fiz1
,tz
);
1971 fjx2
= _mm_add_pd(fjx2
,tx
);
1972 fjy2
= _mm_add_pd(fjy2
,ty
);
1973 fjz2
= _mm_add_pd(fjz2
,tz
);
/* Pair i1-j3. */
1977 /**************************
1978 * CALCULATE INTERACTIONS *
1979 **************************/
1981 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1984 /* REACTION-FIELD ELECTROSTATICS */
1985 felec
= _mm_mul_pd(qq13
,_mm_sub_pd(_mm_mul_pd(rinv13
,rinvsq13
),krf2
));
1987 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1991 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1993 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1995 /* Calculate temporary vectorial force */
1996 tx
= _mm_mul_pd(fscal
,dx13
);
1997 ty
= _mm_mul_pd(fscal
,dy13
);
1998 tz
= _mm_mul_pd(fscal
,dz13
);
2000 /* Update vectorial force */
2001 fix1
= _mm_add_pd(fix1
,tx
);
2002 fiy1
= _mm_add_pd(fiy1
,ty
);
2003 fiz1
= _mm_add_pd(fiz1
,tz
);
2005 fjx3
= _mm_add_pd(fjx3
,tx
);
2006 fjy3
= _mm_add_pd(fjy3
,ty
);
2007 fjz3
= _mm_add_pd(fjz3
,tz
);
/*
 * Reaction-field pairs of i atom 2 with j atoms 1, 2 and 3 for the single
 * remaining neighbour. Same arithmetic as the two-neighbour loop, plus an
 * unpack with a zero vector that keeps only the low lane of fscal.
 * NOTE(review): the statement that loads fscal from felec is not visible in this
 * listing, and the braces of the guarding ifs are missing - confirm against the
 * generator output.
 */
/* Pair i2-j1. */
2011 /**************************
2012 * CALCULATE INTERACTIONS *
2013 **************************/
2015 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2018 /* REACTION-FIELD ELECTROSTATICS */
2019 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
2021 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2025 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2027 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2029 /* Calculate temporary vectorial force */
2030 tx
= _mm_mul_pd(fscal
,dx21
);
2031 ty
= _mm_mul_pd(fscal
,dy21
);
2032 tz
= _mm_mul_pd(fscal
,dz21
);
2034 /* Update vectorial force */
2035 fix2
= _mm_add_pd(fix2
,tx
);
2036 fiy2
= _mm_add_pd(fiy2
,ty
);
2037 fiz2
= _mm_add_pd(fiz2
,tz
);
2039 fjx1
= _mm_add_pd(fjx1
,tx
);
2040 fjy1
= _mm_add_pd(fjy1
,ty
);
2041 fjz1
= _mm_add_pd(fjz1
,tz
);
/* Pair i2-j2. */
2045 /**************************
2046 * CALCULATE INTERACTIONS *
2047 **************************/
2049 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2052 /* REACTION-FIELD ELECTROSTATICS */
2053 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
2055 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2059 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2061 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2063 /* Calculate temporary vectorial force */
2064 tx
= _mm_mul_pd(fscal
,dx22
);
2065 ty
= _mm_mul_pd(fscal
,dy22
);
2066 tz
= _mm_mul_pd(fscal
,dz22
);
2068 /* Update vectorial force */
2069 fix2
= _mm_add_pd(fix2
,tx
);
2070 fiy2
= _mm_add_pd(fiy2
,ty
);
2071 fiz2
= _mm_add_pd(fiz2
,tz
);
2073 fjx2
= _mm_add_pd(fjx2
,tx
);
2074 fjy2
= _mm_add_pd(fjy2
,ty
);
2075 fjz2
= _mm_add_pd(fjz2
,tz
);
/* Pair i2-j3. */
2079 /**************************
2080 * CALCULATE INTERACTIONS *
2081 **************************/
2083 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2086 /* REACTION-FIELD ELECTROSTATICS */
2087 felec
= _mm_mul_pd(qq23
,_mm_sub_pd(_mm_mul_pd(rinv23
,rinvsq23
),krf2
));
2089 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2093 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2095 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2097 /* Calculate temporary vectorial force */
2098 tx
= _mm_mul_pd(fscal
,dx23
);
2099 ty
= _mm_mul_pd(fscal
,dy23
);
2100 tz
= _mm_mul_pd(fscal
,dz23
);
2102 /* Update vectorial force */
2103 fix2
= _mm_add_pd(fix2
,tx
);
2104 fiy2
= _mm_add_pd(fiy2
,ty
);
2105 fiz2
= _mm_add_pd(fiz2
,tz
);
2107 fjx3
= _mm_add_pd(fjx3
,tx
);
2108 fjy3
= _mm_add_pd(fjy3
,ty
);
2109 fjz3
= _mm_add_pd(fjz3
,tz
);
/*
 * Reaction-field pairs of i atom 3 with j atoms 1, 2 and 3 for the single
 * remaining neighbour, followed by the force write-back and the flop accounting
 * that close the kernel.
 * NOTE(review): the statement that loads fscal from felec is not visible in this
 * listing, and the braces of the guarding ifs and loops are missing - confirm
 * against the generator output.
 */
/* Pair i3-j1. */
2113 /**************************
2114 * CALCULATE INTERACTIONS *
2115 **************************/
2117 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2120 /* REACTION-FIELD ELECTROSTATICS */
2121 felec
= _mm_mul_pd(qq31
,_mm_sub_pd(_mm_mul_pd(rinv31
,rinvsq31
),krf2
));
2123 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2127 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2129 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2131 /* Calculate temporary vectorial force */
2132 tx
= _mm_mul_pd(fscal
,dx31
);
2133 ty
= _mm_mul_pd(fscal
,dy31
);
2134 tz
= _mm_mul_pd(fscal
,dz31
);
2136 /* Update vectorial force */
2137 fix3
= _mm_add_pd(fix3
,tx
);
2138 fiy3
= _mm_add_pd(fiy3
,ty
);
2139 fiz3
= _mm_add_pd(fiz3
,tz
);
2141 fjx1
= _mm_add_pd(fjx1
,tx
);
2142 fjy1
= _mm_add_pd(fjy1
,ty
);
2143 fjz1
= _mm_add_pd(fjz1
,tz
);
/* Pair i3-j2. */
2147 /**************************
2148 * CALCULATE INTERACTIONS *
2149 **************************/
2151 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2154 /* REACTION-FIELD ELECTROSTATICS */
2155 felec
= _mm_mul_pd(qq32
,_mm_sub_pd(_mm_mul_pd(rinv32
,rinvsq32
),krf2
));
2157 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2161 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2163 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2165 /* Calculate temporary vectorial force */
2166 tx
= _mm_mul_pd(fscal
,dx32
);
2167 ty
= _mm_mul_pd(fscal
,dy32
);
2168 tz
= _mm_mul_pd(fscal
,dz32
);
2170 /* Update vectorial force */
2171 fix3
= _mm_add_pd(fix3
,tx
);
2172 fiy3
= _mm_add_pd(fiy3
,ty
);
2173 fiz3
= _mm_add_pd(fiz3
,tz
);
2175 fjx2
= _mm_add_pd(fjx2
,tx
);
2176 fjy2
= _mm_add_pd(fjy2
,ty
);
2177 fjz2
= _mm_add_pd(fjz2
,tz
);
/* Pair i3-j3. */
2181 /**************************
2182 * CALCULATE INTERACTIONS *
2183 **************************/
2185 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2188 /* REACTION-FIELD ELECTROSTATICS */
2189 felec
= _mm_mul_pd(qq33
,_mm_sub_pd(_mm_mul_pd(rinv33
,rinvsq33
),krf2
));
2191 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2195 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2197 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2199 /* Calculate temporary vectorial force */
2200 tx
= _mm_mul_pd(fscal
,dx33
);
2201 ty
= _mm_mul_pd(fscal
,dy33
);
2202 tz
= _mm_mul_pd(fscal
,dz33
);
2204 /* Update vectorial force */
2205 fix3
= _mm_add_pd(fix3
,tx
);
2206 fiy3
= _mm_add_pd(fiy3
,ty
);
2207 fiz3
= _mm_add_pd(fiz3
,tz
);
2209 fjx3
= _mm_add_pd(fjx3
,tx
);
2210 fjy3
= _mm_add_pd(fjy3
,ty
);
2211 fjz3
= _mm_add_pd(fjz3
,tz
);
/*
 * Pass the twelve j-force accumulators for the single remaining neighbour to
 * the one-pointer helper.
 * NOTE(review): the helper name suggests it subtracts them from f - confirm in
 * kernelutil_x86_sse2_double.h.
 */
2215 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2217 /* Inner loop uses 303 flops */
2220 /* End of innermost loop */
/* Pass the twelve i-force accumulators to the helper together with the f and fshift destinations of this i entry. */
2222 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2223 f
+i_coord_offset
,fshift
+i_shift_offset
);
2225 /* Increment number of inner iterations */
/* Counts every neighbour of this i entry, including one handled by the remainder path. */
2226 inneriter
+= j_index_end
- j_index_start
;
2228 /* Outer loop uses 24 flops */
/*
 * NOTE(review): no statement follows the next comment in this listing, and
 * neither outeriter nor inneriter is visibly initialised - confirm against the
 * generator output.
 */
2231 /* Increment number of outer iterations */
2234 /* Update outer/inner flops */
/* The weights match the per-iteration costs quoted in the comments above: 24 per outer and 303 per inner iteration. */
2236 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*303);