2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
97 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
98 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
105 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
106 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
107 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
108 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
111 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
114 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
116 __m128 dummy_mask
,cutoff_mask
;
117 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118 __m128 one
= _mm_set1_ps(1.0);
119 __m128 two
= _mm_set1_ps(2.0);
125 jindex
= nlist
->jindex
;
127 shiftidx
= nlist
->shift
;
129 shiftvec
= fr
->shift_vec
[0];
130 fshift
= fr
->fshift
[0];
131 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
132 charge
= mdatoms
->chargeA
;
133 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
134 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
135 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
136 nvdwtype
= fr
->ntype
;
138 vdwtype
= mdatoms
->typeA
;
140 /* Setup water-specific parameters */
141 inr
= nlist
->iinr
[0];
142 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
143 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
144 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
145 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
147 jq1
= _mm_set1_ps(charge
[inr
+1]);
148 jq2
= _mm_set1_ps(charge
[inr
+2]);
149 jq3
= _mm_set1_ps(charge
[inr
+3]);
150 vdwjidx0A
= 2*vdwtype
[inr
+0];
151 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
152 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
153 qq11
= _mm_mul_ps(iq1
,jq1
);
154 qq12
= _mm_mul_ps(iq1
,jq2
);
155 qq13
= _mm_mul_ps(iq1
,jq3
);
156 qq21
= _mm_mul_ps(iq2
,jq1
);
157 qq22
= _mm_mul_ps(iq2
,jq2
);
158 qq23
= _mm_mul_ps(iq2
,jq3
);
159 qq31
= _mm_mul_ps(iq3
,jq1
);
160 qq32
= _mm_mul_ps(iq3
,jq2
);
161 qq33
= _mm_mul_ps(iq3
,jq3
);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar
= fr
->ic
->rcoulomb
;
165 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
166 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
168 sh_vdw_invrcut6
= _mm_set1_ps(fr
->ic
->sh_invrc6
);
169 rvdw
= _mm_set1_ps(fr
->ic
->rvdw
);
171 /* Avoid stupid compiler warnings */
172 jnrA
= jnrB
= jnrC
= jnrD
= 0;
181 for(iidx
=0;iidx
<4*DIM
;iidx
++)
186 /* Start outer loop over neighborlists */
187 for(iidx
=0; iidx
<nri
; iidx
++)
189 /* Load shift vector for this list */
190 i_shift_offset
= DIM
*shiftidx
[iidx
];
192 /* Load limits for loop over neighbors */
193 j_index_start
= jindex
[iidx
];
194 j_index_end
= jindex
[iidx
+1];
196 /* Get outer coordinate index */
198 i_coord_offset
= DIM
*inr
;
200 /* Load i particle coords and add shift vector */
201 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
202 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
204 fix0
= _mm_setzero_ps();
205 fiy0
= _mm_setzero_ps();
206 fiz0
= _mm_setzero_ps();
207 fix1
= _mm_setzero_ps();
208 fiy1
= _mm_setzero_ps();
209 fiz1
= _mm_setzero_ps();
210 fix2
= _mm_setzero_ps();
211 fiy2
= _mm_setzero_ps();
212 fiz2
= _mm_setzero_ps();
213 fix3
= _mm_setzero_ps();
214 fiy3
= _mm_setzero_ps();
215 fiz3
= _mm_setzero_ps();
217 /* Reset potential sums */
218 velecsum
= _mm_setzero_ps();
219 vvdwsum
= _mm_setzero_ps();
221 /* Start inner kernel loop */
222 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
225 /* Get j neighbor index, and coordinate index */
230 j_coord_offsetA
= DIM
*jnrA
;
231 j_coord_offsetB
= DIM
*jnrB
;
232 j_coord_offsetC
= DIM
*jnrC
;
233 j_coord_offsetD
= DIM
*jnrD
;
235 /* load j atom coordinates */
236 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
237 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
238 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
239 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
241 /* Calculate displacement vector */
242 dx00
= _mm_sub_ps(ix0
,jx0
);
243 dy00
= _mm_sub_ps(iy0
,jy0
);
244 dz00
= _mm_sub_ps(iz0
,jz0
);
245 dx11
= _mm_sub_ps(ix1
,jx1
);
246 dy11
= _mm_sub_ps(iy1
,jy1
);
247 dz11
= _mm_sub_ps(iz1
,jz1
);
248 dx12
= _mm_sub_ps(ix1
,jx2
);
249 dy12
= _mm_sub_ps(iy1
,jy2
);
250 dz12
= _mm_sub_ps(iz1
,jz2
);
251 dx13
= _mm_sub_ps(ix1
,jx3
);
252 dy13
= _mm_sub_ps(iy1
,jy3
);
253 dz13
= _mm_sub_ps(iz1
,jz3
);
254 dx21
= _mm_sub_ps(ix2
,jx1
);
255 dy21
= _mm_sub_ps(iy2
,jy1
);
256 dz21
= _mm_sub_ps(iz2
,jz1
);
257 dx22
= _mm_sub_ps(ix2
,jx2
);
258 dy22
= _mm_sub_ps(iy2
,jy2
);
259 dz22
= _mm_sub_ps(iz2
,jz2
);
260 dx23
= _mm_sub_ps(ix2
,jx3
);
261 dy23
= _mm_sub_ps(iy2
,jy3
);
262 dz23
= _mm_sub_ps(iz2
,jz3
);
263 dx31
= _mm_sub_ps(ix3
,jx1
);
264 dy31
= _mm_sub_ps(iy3
,jy1
);
265 dz31
= _mm_sub_ps(iz3
,jz1
);
266 dx32
= _mm_sub_ps(ix3
,jx2
);
267 dy32
= _mm_sub_ps(iy3
,jy2
);
268 dz32
= _mm_sub_ps(iz3
,jz2
);
269 dx33
= _mm_sub_ps(ix3
,jx3
);
270 dy33
= _mm_sub_ps(iy3
,jy3
);
271 dz33
= _mm_sub_ps(iz3
,jz3
);
273 /* Calculate squared distance and things based on it */
274 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
275 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
276 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
277 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
278 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
279 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
280 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
281 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
282 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
283 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
285 rinv11
= sse2_invsqrt_f(rsq11
);
286 rinv12
= sse2_invsqrt_f(rsq12
);
287 rinv13
= sse2_invsqrt_f(rsq13
);
288 rinv21
= sse2_invsqrt_f(rsq21
);
289 rinv22
= sse2_invsqrt_f(rsq22
);
290 rinv23
= sse2_invsqrt_f(rsq23
);
291 rinv31
= sse2_invsqrt_f(rsq31
);
292 rinv32
= sse2_invsqrt_f(rsq32
);
293 rinv33
= sse2_invsqrt_f(rsq33
);
295 rinvsq00
= sse2_inv_f(rsq00
);
296 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
297 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
298 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
299 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
300 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
301 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
302 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
303 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
304 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
306 fjx0
= _mm_setzero_ps();
307 fjy0
= _mm_setzero_ps();
308 fjz0
= _mm_setzero_ps();
309 fjx1
= _mm_setzero_ps();
310 fjy1
= _mm_setzero_ps();
311 fjz1
= _mm_setzero_ps();
312 fjx2
= _mm_setzero_ps();
313 fjy2
= _mm_setzero_ps();
314 fjz2
= _mm_setzero_ps();
315 fjx3
= _mm_setzero_ps();
316 fjy3
= _mm_setzero_ps();
317 fjz3
= _mm_setzero_ps();
319 /**************************
320 * CALCULATE INTERACTIONS *
321 **************************/
323 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
326 /* LENNARD-JONES DISPERSION/REPULSION */
328 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
329 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
330 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
331 vvdw
= _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12
, _mm_mul_ps(c12_00
,_mm_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
332 _mm_mul_ps( _mm_sub_ps(vvdw6
,_mm_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
333 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
335 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
337 /* Update potential sum for this i atom from the interaction with this j atom. */
338 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
339 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
343 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
345 /* Calculate temporary vectorial force */
346 tx
= _mm_mul_ps(fscal
,dx00
);
347 ty
= _mm_mul_ps(fscal
,dy00
);
348 tz
= _mm_mul_ps(fscal
,dz00
);
350 /* Update vectorial force */
351 fix0
= _mm_add_ps(fix0
,tx
);
352 fiy0
= _mm_add_ps(fiy0
,ty
);
353 fiz0
= _mm_add_ps(fiz0
,tz
);
355 fjx0
= _mm_add_ps(fjx0
,tx
);
356 fjy0
= _mm_add_ps(fjy0
,ty
);
357 fjz0
= _mm_add_ps(fjz0
,tz
);
361 /**************************
362 * CALCULATE INTERACTIONS *
363 **************************/
365 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
368 /* REACTION-FIELD ELECTROSTATICS */
369 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
370 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
372 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 velec
= _mm_and_ps(velec
,cutoff_mask
);
376 velecsum
= _mm_add_ps(velecsum
,velec
);
380 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
382 /* Calculate temporary vectorial force */
383 tx
= _mm_mul_ps(fscal
,dx11
);
384 ty
= _mm_mul_ps(fscal
,dy11
);
385 tz
= _mm_mul_ps(fscal
,dz11
);
387 /* Update vectorial force */
388 fix1
= _mm_add_ps(fix1
,tx
);
389 fiy1
= _mm_add_ps(fiy1
,ty
);
390 fiz1
= _mm_add_ps(fiz1
,tz
);
392 fjx1
= _mm_add_ps(fjx1
,tx
);
393 fjy1
= _mm_add_ps(fjy1
,ty
);
394 fjz1
= _mm_add_ps(fjz1
,tz
);
398 /**************************
399 * CALCULATE INTERACTIONS *
400 **************************/
402 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
405 /* REACTION-FIELD ELECTROSTATICS */
406 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
407 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
409 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velec
= _mm_and_ps(velec
,cutoff_mask
);
413 velecsum
= _mm_add_ps(velecsum
,velec
);
417 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
419 /* Calculate temporary vectorial force */
420 tx
= _mm_mul_ps(fscal
,dx12
);
421 ty
= _mm_mul_ps(fscal
,dy12
);
422 tz
= _mm_mul_ps(fscal
,dz12
);
424 /* Update vectorial force */
425 fix1
= _mm_add_ps(fix1
,tx
);
426 fiy1
= _mm_add_ps(fiy1
,ty
);
427 fiz1
= _mm_add_ps(fiz1
,tz
);
429 fjx2
= _mm_add_ps(fjx2
,tx
);
430 fjy2
= _mm_add_ps(fjy2
,ty
);
431 fjz2
= _mm_add_ps(fjz2
,tz
);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
442 /* REACTION-FIELD ELECTROSTATICS */
443 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_add_ps(rinv13
,_mm_mul_ps(krf
,rsq13
)),crf
));
444 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
446 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velec
= _mm_and_ps(velec
,cutoff_mask
);
450 velecsum
= _mm_add_ps(velecsum
,velec
);
454 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
456 /* Calculate temporary vectorial force */
457 tx
= _mm_mul_ps(fscal
,dx13
);
458 ty
= _mm_mul_ps(fscal
,dy13
);
459 tz
= _mm_mul_ps(fscal
,dz13
);
461 /* Update vectorial force */
462 fix1
= _mm_add_ps(fix1
,tx
);
463 fiy1
= _mm_add_ps(fiy1
,ty
);
464 fiz1
= _mm_add_ps(fiz1
,tz
);
466 fjx3
= _mm_add_ps(fjx3
,tx
);
467 fjy3
= _mm_add_ps(fjy3
,ty
);
468 fjz3
= _mm_add_ps(fjz3
,tz
);
472 /**************************
473 * CALCULATE INTERACTIONS *
474 **************************/
476 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
479 /* REACTION-FIELD ELECTROSTATICS */
480 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
481 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
483 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velec
= _mm_and_ps(velec
,cutoff_mask
);
487 velecsum
= _mm_add_ps(velecsum
,velec
);
491 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
493 /* Calculate temporary vectorial force */
494 tx
= _mm_mul_ps(fscal
,dx21
);
495 ty
= _mm_mul_ps(fscal
,dy21
);
496 tz
= _mm_mul_ps(fscal
,dz21
);
498 /* Update vectorial force */
499 fix2
= _mm_add_ps(fix2
,tx
);
500 fiy2
= _mm_add_ps(fiy2
,ty
);
501 fiz2
= _mm_add_ps(fiz2
,tz
);
503 fjx1
= _mm_add_ps(fjx1
,tx
);
504 fjy1
= _mm_add_ps(fjy1
,ty
);
505 fjz1
= _mm_add_ps(fjz1
,tz
);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
516 /* REACTION-FIELD ELECTROSTATICS */
517 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
518 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
520 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velec
= _mm_and_ps(velec
,cutoff_mask
);
524 velecsum
= _mm_add_ps(velecsum
,velec
);
528 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
530 /* Calculate temporary vectorial force */
531 tx
= _mm_mul_ps(fscal
,dx22
);
532 ty
= _mm_mul_ps(fscal
,dy22
);
533 tz
= _mm_mul_ps(fscal
,dz22
);
535 /* Update vectorial force */
536 fix2
= _mm_add_ps(fix2
,tx
);
537 fiy2
= _mm_add_ps(fiy2
,ty
);
538 fiz2
= _mm_add_ps(fiz2
,tz
);
540 fjx2
= _mm_add_ps(fjx2
,tx
);
541 fjy2
= _mm_add_ps(fjy2
,ty
);
542 fjz2
= _mm_add_ps(fjz2
,tz
);
546 /**************************
547 * CALCULATE INTERACTIONS *
548 **************************/
550 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
553 /* REACTION-FIELD ELECTROSTATICS */
554 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_add_ps(rinv23
,_mm_mul_ps(krf
,rsq23
)),crf
));
555 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
557 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velec
= _mm_and_ps(velec
,cutoff_mask
);
561 velecsum
= _mm_add_ps(velecsum
,velec
);
565 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
567 /* Calculate temporary vectorial force */
568 tx
= _mm_mul_ps(fscal
,dx23
);
569 ty
= _mm_mul_ps(fscal
,dy23
);
570 tz
= _mm_mul_ps(fscal
,dz23
);
572 /* Update vectorial force */
573 fix2
= _mm_add_ps(fix2
,tx
);
574 fiy2
= _mm_add_ps(fiy2
,ty
);
575 fiz2
= _mm_add_ps(fiz2
,tz
);
577 fjx3
= _mm_add_ps(fjx3
,tx
);
578 fjy3
= _mm_add_ps(fjy3
,ty
);
579 fjz3
= _mm_add_ps(fjz3
,tz
);
583 /**************************
584 * CALCULATE INTERACTIONS *
585 **************************/
587 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
590 /* REACTION-FIELD ELECTROSTATICS */
591 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_add_ps(rinv31
,_mm_mul_ps(krf
,rsq31
)),crf
));
592 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
594 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
596 /* Update potential sum for this i atom from the interaction with this j atom. */
597 velec
= _mm_and_ps(velec
,cutoff_mask
);
598 velecsum
= _mm_add_ps(velecsum
,velec
);
602 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
604 /* Calculate temporary vectorial force */
605 tx
= _mm_mul_ps(fscal
,dx31
);
606 ty
= _mm_mul_ps(fscal
,dy31
);
607 tz
= _mm_mul_ps(fscal
,dz31
);
609 /* Update vectorial force */
610 fix3
= _mm_add_ps(fix3
,tx
);
611 fiy3
= _mm_add_ps(fiy3
,ty
);
612 fiz3
= _mm_add_ps(fiz3
,tz
);
614 fjx1
= _mm_add_ps(fjx1
,tx
);
615 fjy1
= _mm_add_ps(fjy1
,ty
);
616 fjz1
= _mm_add_ps(fjz1
,tz
);
620 /**************************
621 * CALCULATE INTERACTIONS *
622 **************************/
624 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
627 /* REACTION-FIELD ELECTROSTATICS */
628 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_add_ps(rinv32
,_mm_mul_ps(krf
,rsq32
)),crf
));
629 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
631 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
633 /* Update potential sum for this i atom from the interaction with this j atom. */
634 velec
= _mm_and_ps(velec
,cutoff_mask
);
635 velecsum
= _mm_add_ps(velecsum
,velec
);
639 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
641 /* Calculate temporary vectorial force */
642 tx
= _mm_mul_ps(fscal
,dx32
);
643 ty
= _mm_mul_ps(fscal
,dy32
);
644 tz
= _mm_mul_ps(fscal
,dz32
);
646 /* Update vectorial force */
647 fix3
= _mm_add_ps(fix3
,tx
);
648 fiy3
= _mm_add_ps(fiy3
,ty
);
649 fiz3
= _mm_add_ps(fiz3
,tz
);
651 fjx2
= _mm_add_ps(fjx2
,tx
);
652 fjy2
= _mm_add_ps(fjy2
,ty
);
653 fjz2
= _mm_add_ps(fjz2
,tz
);
657 /**************************
658 * CALCULATE INTERACTIONS *
659 **************************/
661 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
664 /* REACTION-FIELD ELECTROSTATICS */
665 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_add_ps(rinv33
,_mm_mul_ps(krf
,rsq33
)),crf
));
666 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
668 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
670 /* Update potential sum for this i atom from the interaction with this j atom. */
671 velec
= _mm_and_ps(velec
,cutoff_mask
);
672 velecsum
= _mm_add_ps(velecsum
,velec
);
676 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
678 /* Calculate temporary vectorial force */
679 tx
= _mm_mul_ps(fscal
,dx33
);
680 ty
= _mm_mul_ps(fscal
,dy33
);
681 tz
= _mm_mul_ps(fscal
,dz33
);
683 /* Update vectorial force */
684 fix3
= _mm_add_ps(fix3
,tx
);
685 fiy3
= _mm_add_ps(fiy3
,ty
);
686 fiz3
= _mm_add_ps(fiz3
,tz
);
688 fjx3
= _mm_add_ps(fjx3
,tx
);
689 fjy3
= _mm_add_ps(fjy3
,ty
);
690 fjz3
= _mm_add_ps(fjz3
,tz
);
694 fjptrA
= f
+j_coord_offsetA
;
695 fjptrB
= f
+j_coord_offsetB
;
696 fjptrC
= f
+j_coord_offsetC
;
697 fjptrD
= f
+j_coord_offsetD
;
699 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
700 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
701 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
703 /* Inner loop uses 368 flops */
709 /* Get j neighbor index, and coordinate index */
710 jnrlistA
= jjnr
[jidx
];
711 jnrlistB
= jjnr
[jidx
+1];
712 jnrlistC
= jjnr
[jidx
+2];
713 jnrlistD
= jjnr
[jidx
+3];
714 /* Sign of each element will be negative for non-real atoms.
715 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
716 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
718 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
719 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
720 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
721 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
722 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
723 j_coord_offsetA
= DIM
*jnrA
;
724 j_coord_offsetB
= DIM
*jnrB
;
725 j_coord_offsetC
= DIM
*jnrC
;
726 j_coord_offsetD
= DIM
*jnrD
;
728 /* load j atom coordinates */
729 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
730 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
731 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
732 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
734 /* Calculate displacement vector */
735 dx00
= _mm_sub_ps(ix0
,jx0
);
736 dy00
= _mm_sub_ps(iy0
,jy0
);
737 dz00
= _mm_sub_ps(iz0
,jz0
);
738 dx11
= _mm_sub_ps(ix1
,jx1
);
739 dy11
= _mm_sub_ps(iy1
,jy1
);
740 dz11
= _mm_sub_ps(iz1
,jz1
);
741 dx12
= _mm_sub_ps(ix1
,jx2
);
742 dy12
= _mm_sub_ps(iy1
,jy2
);
743 dz12
= _mm_sub_ps(iz1
,jz2
);
744 dx13
= _mm_sub_ps(ix1
,jx3
);
745 dy13
= _mm_sub_ps(iy1
,jy3
);
746 dz13
= _mm_sub_ps(iz1
,jz3
);
747 dx21
= _mm_sub_ps(ix2
,jx1
);
748 dy21
= _mm_sub_ps(iy2
,jy1
);
749 dz21
= _mm_sub_ps(iz2
,jz1
);
750 dx22
= _mm_sub_ps(ix2
,jx2
);
751 dy22
= _mm_sub_ps(iy2
,jy2
);
752 dz22
= _mm_sub_ps(iz2
,jz2
);
753 dx23
= _mm_sub_ps(ix2
,jx3
);
754 dy23
= _mm_sub_ps(iy2
,jy3
);
755 dz23
= _mm_sub_ps(iz2
,jz3
);
756 dx31
= _mm_sub_ps(ix3
,jx1
);
757 dy31
= _mm_sub_ps(iy3
,jy1
);
758 dz31
= _mm_sub_ps(iz3
,jz1
);
759 dx32
= _mm_sub_ps(ix3
,jx2
);
760 dy32
= _mm_sub_ps(iy3
,jy2
);
761 dz32
= _mm_sub_ps(iz3
,jz2
);
762 dx33
= _mm_sub_ps(ix3
,jx3
);
763 dy33
= _mm_sub_ps(iy3
,jy3
);
764 dz33
= _mm_sub_ps(iz3
,jz3
);
766 /* Calculate squared distance and things based on it */
767 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
768 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
769 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
770 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
771 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
772 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
773 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
774 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
775 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
776 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
778 rinv11
= sse2_invsqrt_f(rsq11
);
779 rinv12
= sse2_invsqrt_f(rsq12
);
780 rinv13
= sse2_invsqrt_f(rsq13
);
781 rinv21
= sse2_invsqrt_f(rsq21
);
782 rinv22
= sse2_invsqrt_f(rsq22
);
783 rinv23
= sse2_invsqrt_f(rsq23
);
784 rinv31
= sse2_invsqrt_f(rsq31
);
785 rinv32
= sse2_invsqrt_f(rsq32
);
786 rinv33
= sse2_invsqrt_f(rsq33
);
788 rinvsq00
= sse2_inv_f(rsq00
);
789 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
790 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
791 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
792 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
793 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
794 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
795 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
796 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
797 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
799 fjx0
= _mm_setzero_ps();
800 fjy0
= _mm_setzero_ps();
801 fjz0
= _mm_setzero_ps();
802 fjx1
= _mm_setzero_ps();
803 fjy1
= _mm_setzero_ps();
804 fjz1
= _mm_setzero_ps();
805 fjx2
= _mm_setzero_ps();
806 fjy2
= _mm_setzero_ps();
807 fjz2
= _mm_setzero_ps();
808 fjx3
= _mm_setzero_ps();
809 fjy3
= _mm_setzero_ps();
810 fjz3
= _mm_setzero_ps();
812 /**************************
813 * CALCULATE INTERACTIONS *
814 **************************/
816 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
819 /* LENNARD-JONES DISPERSION/REPULSION */
821 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
822 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
823 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
824 vvdw
= _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12
, _mm_mul_ps(c12_00
,_mm_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
825 _mm_mul_ps( _mm_sub_ps(vvdw6
,_mm_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
826 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
828 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
832 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
833 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
837 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
839 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
841 /* Calculate temporary vectorial force */
842 tx
= _mm_mul_ps(fscal
,dx00
);
843 ty
= _mm_mul_ps(fscal
,dy00
);
844 tz
= _mm_mul_ps(fscal
,dz00
);
846 /* Update vectorial force */
847 fix0
= _mm_add_ps(fix0
,tx
);
848 fiy0
= _mm_add_ps(fiy0
,ty
);
849 fiz0
= _mm_add_ps(fiz0
,tz
);
851 fjx0
= _mm_add_ps(fjx0
,tx
);
852 fjy0
= _mm_add_ps(fjy0
,ty
);
853 fjz0
= _mm_add_ps(fjz0
,tz
);
857 /**************************
858 * CALCULATE INTERACTIONS *
859 **************************/
861 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
864 /* REACTION-FIELD ELECTROSTATICS */
865 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
866 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
868 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec
= _mm_and_ps(velec
,cutoff_mask
);
872 velec
= _mm_andnot_ps(dummy_mask
,velec
);
873 velecsum
= _mm_add_ps(velecsum
,velec
);
877 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
879 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
881 /* Calculate temporary vectorial force */
882 tx
= _mm_mul_ps(fscal
,dx11
);
883 ty
= _mm_mul_ps(fscal
,dy11
);
884 tz
= _mm_mul_ps(fscal
,dz11
);
886 /* Update vectorial force */
887 fix1
= _mm_add_ps(fix1
,tx
);
888 fiy1
= _mm_add_ps(fiy1
,ty
);
889 fiz1
= _mm_add_ps(fiz1
,tz
);
891 fjx1
= _mm_add_ps(fjx1
,tx
);
892 fjy1
= _mm_add_ps(fjy1
,ty
);
893 fjz1
= _mm_add_ps(fjz1
,tz
);
897 /**************************
898 * CALCULATE INTERACTIONS *
899 **************************/
901 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
904 /* REACTION-FIELD ELECTROSTATICS */
905 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
906 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
908 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 velec
= _mm_and_ps(velec
,cutoff_mask
);
912 velec
= _mm_andnot_ps(dummy_mask
,velec
);
913 velecsum
= _mm_add_ps(velecsum
,velec
);
917 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
919 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
921 /* Calculate temporary vectorial force */
922 tx
= _mm_mul_ps(fscal
,dx12
);
923 ty
= _mm_mul_ps(fscal
,dy12
);
924 tz
= _mm_mul_ps(fscal
,dz12
);
926 /* Update vectorial force */
927 fix1
= _mm_add_ps(fix1
,tx
);
928 fiy1
= _mm_add_ps(fiy1
,ty
);
929 fiz1
= _mm_add_ps(fiz1
,tz
);
931 fjx2
= _mm_add_ps(fjx2
,tx
);
932 fjy2
= _mm_add_ps(fjy2
,ty
);
933 fjz2
= _mm_add_ps(fjz2
,tz
);
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
944 /* REACTION-FIELD ELECTROSTATICS */
945 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_add_ps(rinv13
,_mm_mul_ps(krf
,rsq13
)),crf
));
946 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
948 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
950 /* Update potential sum for this i atom from the interaction with this j atom. */
951 velec
= _mm_and_ps(velec
,cutoff_mask
);
952 velec
= _mm_andnot_ps(dummy_mask
,velec
);
953 velecsum
= _mm_add_ps(velecsum
,velec
);
957 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
959 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
961 /* Calculate temporary vectorial force */
962 tx
= _mm_mul_ps(fscal
,dx13
);
963 ty
= _mm_mul_ps(fscal
,dy13
);
964 tz
= _mm_mul_ps(fscal
,dz13
);
966 /* Update vectorial force */
967 fix1
= _mm_add_ps(fix1
,tx
);
968 fiy1
= _mm_add_ps(fiy1
,ty
);
969 fiz1
= _mm_add_ps(fiz1
,tz
);
971 fjx3
= _mm_add_ps(fjx3
,tx
);
972 fjy3
= _mm_add_ps(fjy3
,ty
);
973 fjz3
= _mm_add_ps(fjz3
,tz
);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
984 /* REACTION-FIELD ELECTROSTATICS */
985 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
986 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
988 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec
= _mm_and_ps(velec
,cutoff_mask
);
992 velec
= _mm_andnot_ps(dummy_mask
,velec
);
993 velecsum
= _mm_add_ps(velecsum
,velec
);
997 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
999 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1001 /* Calculate temporary vectorial force */
1002 tx
= _mm_mul_ps(fscal
,dx21
);
1003 ty
= _mm_mul_ps(fscal
,dy21
);
1004 tz
= _mm_mul_ps(fscal
,dz21
);
1006 /* Update vectorial force */
1007 fix2
= _mm_add_ps(fix2
,tx
);
1008 fiy2
= _mm_add_ps(fiy2
,ty
);
1009 fiz2
= _mm_add_ps(fiz2
,tz
);
1011 fjx1
= _mm_add_ps(fjx1
,tx
);
1012 fjy1
= _mm_add_ps(fjy1
,ty
);
1013 fjz1
= _mm_add_ps(fjz1
,tz
);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1024 /* REACTION-FIELD ELECTROSTATICS */
1025 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
1026 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1028 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1030 /* Update potential sum for this i atom from the interaction with this j atom. */
1031 velec
= _mm_and_ps(velec
,cutoff_mask
);
1032 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1033 velecsum
= _mm_add_ps(velecsum
,velec
);
1037 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1039 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1041 /* Calculate temporary vectorial force */
1042 tx
= _mm_mul_ps(fscal
,dx22
);
1043 ty
= _mm_mul_ps(fscal
,dy22
);
1044 tz
= _mm_mul_ps(fscal
,dz22
);
1046 /* Update vectorial force */
1047 fix2
= _mm_add_ps(fix2
,tx
);
1048 fiy2
= _mm_add_ps(fiy2
,ty
);
1049 fiz2
= _mm_add_ps(fiz2
,tz
);
1051 fjx2
= _mm_add_ps(fjx2
,tx
);
1052 fjy2
= _mm_add_ps(fjy2
,ty
);
1053 fjz2
= _mm_add_ps(fjz2
,tz
);
1057 /**************************
1058 * CALCULATE INTERACTIONS *
1059 **************************/
1061 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1064 /* REACTION-FIELD ELECTROSTATICS */
1065 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_add_ps(rinv23
,_mm_mul_ps(krf
,rsq23
)),crf
));
1066 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
1068 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
1070 /* Update potential sum for this i atom from the interaction with this j atom. */
1071 velec
= _mm_and_ps(velec
,cutoff_mask
);
1072 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1073 velecsum
= _mm_add_ps(velecsum
,velec
);
1077 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1079 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1081 /* Calculate temporary vectorial force */
1082 tx
= _mm_mul_ps(fscal
,dx23
);
1083 ty
= _mm_mul_ps(fscal
,dy23
);
1084 tz
= _mm_mul_ps(fscal
,dz23
);
1086 /* Update vectorial force */
1087 fix2
= _mm_add_ps(fix2
,tx
);
1088 fiy2
= _mm_add_ps(fiy2
,ty
);
1089 fiz2
= _mm_add_ps(fiz2
,tz
);
1091 fjx3
= _mm_add_ps(fjx3
,tx
);
1092 fjy3
= _mm_add_ps(fjy3
,ty
);
1093 fjz3
= _mm_add_ps(fjz3
,tz
);
1097 /**************************
1098 * CALCULATE INTERACTIONS *
1099 **************************/
1101 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1104 /* REACTION-FIELD ELECTROSTATICS */
1105 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_add_ps(rinv31
,_mm_mul_ps(krf
,rsq31
)),crf
));
1106 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
1108 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
1110 /* Update potential sum for this i atom from the interaction with this j atom. */
1111 velec
= _mm_and_ps(velec
,cutoff_mask
);
1112 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1113 velecsum
= _mm_add_ps(velecsum
,velec
);
1117 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1119 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1121 /* Calculate temporary vectorial force */
1122 tx
= _mm_mul_ps(fscal
,dx31
);
1123 ty
= _mm_mul_ps(fscal
,dy31
);
1124 tz
= _mm_mul_ps(fscal
,dz31
);
1126 /* Update vectorial force */
1127 fix3
= _mm_add_ps(fix3
,tx
);
1128 fiy3
= _mm_add_ps(fiy3
,ty
);
1129 fiz3
= _mm_add_ps(fiz3
,tz
);
1131 fjx1
= _mm_add_ps(fjx1
,tx
);
1132 fjy1
= _mm_add_ps(fjy1
,ty
);
1133 fjz1
= _mm_add_ps(fjz1
,tz
);
1137 /**************************
1138 * CALCULATE INTERACTIONS *
1139 **************************/
1141 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1144 /* REACTION-FIELD ELECTROSTATICS */
1145 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_add_ps(rinv32
,_mm_mul_ps(krf
,rsq32
)),crf
));
1146 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1148 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec
= _mm_and_ps(velec
,cutoff_mask
);
1152 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1153 velecsum
= _mm_add_ps(velecsum
,velec
);
1157 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1159 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1161 /* Calculate temporary vectorial force */
1162 tx
= _mm_mul_ps(fscal
,dx32
);
1163 ty
= _mm_mul_ps(fscal
,dy32
);
1164 tz
= _mm_mul_ps(fscal
,dz32
);
1166 /* Update vectorial force */
1167 fix3
= _mm_add_ps(fix3
,tx
);
1168 fiy3
= _mm_add_ps(fiy3
,ty
);
1169 fiz3
= _mm_add_ps(fiz3
,tz
);
1171 fjx2
= _mm_add_ps(fjx2
,tx
);
1172 fjy2
= _mm_add_ps(fjy2
,ty
);
1173 fjz2
= _mm_add_ps(fjz2
,tz
);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1184 /* REACTION-FIELD ELECTROSTATICS */
1185 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_add_ps(rinv33
,_mm_mul_ps(krf
,rsq33
)),crf
));
1186 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
1188 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec
= _mm_and_ps(velec
,cutoff_mask
);
1192 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1193 velecsum
= _mm_add_ps(velecsum
,velec
);
1197 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1199 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1201 /* Calculate temporary vectorial force */
1202 tx
= _mm_mul_ps(fscal
,dx33
);
1203 ty
= _mm_mul_ps(fscal
,dy33
);
1204 tz
= _mm_mul_ps(fscal
,dz33
);
1206 /* Update vectorial force */
1207 fix3
= _mm_add_ps(fix3
,tx
);
1208 fiy3
= _mm_add_ps(fiy3
,ty
);
1209 fiz3
= _mm_add_ps(fiz3
,tz
);
1211 fjx3
= _mm_add_ps(fjx3
,tx
);
1212 fjy3
= _mm_add_ps(fjy3
,ty
);
1213 fjz3
= _mm_add_ps(fjz3
,tz
);
1217 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1218 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1219 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1220 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1222 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1223 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1224 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1226 /* Inner loop uses 368 flops */
1229 /* End of innermost loop */
1231 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1232 f
+i_coord_offset
,fshift
+i_shift_offset
);
1235 /* Update potential energies */
1236 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1237 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1239 /* Increment number of inner iterations */
1240 inneriter
+= j_index_end
- j_index_start
;
1242 /* Outer loop uses 26 flops */
1245 /* Increment number of outer iterations */
1248 /* Update outer/inner flops */
1250 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*368);
1253 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
1254 * Electrostatics interaction: ReactionField
1255 * VdW interaction: LennardJones
1256 * Geometry: Water4-Water4
1257 * Calculate force/pot: Force
1260 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
1261 (t_nblist
* gmx_restrict nlist
,
1262 rvec
* gmx_restrict xx
,
1263 rvec
* gmx_restrict ff
,
1264 struct t_forcerec
* gmx_restrict fr
,
1265 t_mdatoms
* gmx_restrict mdatoms
,
1266 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1267 t_nrnb
* gmx_restrict nrnb
)
1269 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1270 * just 0 for non-waters.
1271 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1272 * jnr indices corresponding to data put in the four positions in the SIMD register.
1274 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1275 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1276 int jnrA
,jnrB
,jnrC
,jnrD
;
1277 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1278 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1279 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1280 real rcutoff_scalar
;
1281 real
*shiftvec
,*fshift
,*x
,*f
;
1282 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1283 real scratch
[4*DIM
];
1284 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1286 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1288 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1290 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1292 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1293 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1294 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1295 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1296 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1297 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1298 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1299 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1300 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1301 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1302 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1303 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1304 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1305 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1306 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1307 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1308 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1309 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1310 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1311 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1314 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1317 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1318 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1319 __m128 dummy_mask
,cutoff_mask
;
1320 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1321 __m128 one
= _mm_set1_ps(1.0);
1322 __m128 two
= _mm_set1_ps(2.0);
1328 jindex
= nlist
->jindex
;
1330 shiftidx
= nlist
->shift
;
1332 shiftvec
= fr
->shift_vec
[0];
1333 fshift
= fr
->fshift
[0];
1334 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1335 charge
= mdatoms
->chargeA
;
1336 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
1337 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
1338 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
1339 nvdwtype
= fr
->ntype
;
1340 vdwparam
= fr
->nbfp
;
1341 vdwtype
= mdatoms
->typeA
;
1343 /* Setup water-specific parameters */
1344 inr
= nlist
->iinr
[0];
1345 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1346 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1347 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1348 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1350 jq1
= _mm_set1_ps(charge
[inr
+1]);
1351 jq2
= _mm_set1_ps(charge
[inr
+2]);
1352 jq3
= _mm_set1_ps(charge
[inr
+3]);
1353 vdwjidx0A
= 2*vdwtype
[inr
+0];
1354 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1355 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1356 qq11
= _mm_mul_ps(iq1
,jq1
);
1357 qq12
= _mm_mul_ps(iq1
,jq2
);
1358 qq13
= _mm_mul_ps(iq1
,jq3
);
1359 qq21
= _mm_mul_ps(iq2
,jq1
);
1360 qq22
= _mm_mul_ps(iq2
,jq2
);
1361 qq23
= _mm_mul_ps(iq2
,jq3
);
1362 qq31
= _mm_mul_ps(iq3
,jq1
);
1363 qq32
= _mm_mul_ps(iq3
,jq2
);
1364 qq33
= _mm_mul_ps(iq3
,jq3
);
1366 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1367 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1368 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
1369 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
1371 sh_vdw_invrcut6
= _mm_set1_ps(fr
->ic
->sh_invrc6
);
1372 rvdw
= _mm_set1_ps(fr
->ic
->rvdw
);
1374 /* Avoid stupid compiler warnings */
1375 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1376 j_coord_offsetA
= 0;
1377 j_coord_offsetB
= 0;
1378 j_coord_offsetC
= 0;
1379 j_coord_offsetD
= 0;
1384 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1386 scratch
[iidx
] = 0.0;
1389 /* Start outer loop over neighborlists */
1390 for(iidx
=0; iidx
<nri
; iidx
++)
1392 /* Load shift vector for this list */
1393 i_shift_offset
= DIM
*shiftidx
[iidx
];
1395 /* Load limits for loop over neighbors */
1396 j_index_start
= jindex
[iidx
];
1397 j_index_end
= jindex
[iidx
+1];
1399 /* Get outer coordinate index */
1401 i_coord_offset
= DIM
*inr
;
1403 /* Load i particle coords and add shift vector */
1404 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1405 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1407 fix0
= _mm_setzero_ps();
1408 fiy0
= _mm_setzero_ps();
1409 fiz0
= _mm_setzero_ps();
1410 fix1
= _mm_setzero_ps();
1411 fiy1
= _mm_setzero_ps();
1412 fiz1
= _mm_setzero_ps();
1413 fix2
= _mm_setzero_ps();
1414 fiy2
= _mm_setzero_ps();
1415 fiz2
= _mm_setzero_ps();
1416 fix3
= _mm_setzero_ps();
1417 fiy3
= _mm_setzero_ps();
1418 fiz3
= _mm_setzero_ps();
1420 /* Start inner kernel loop */
1421 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1424 /* Get j neighbor index, and coordinate index */
1426 jnrB
= jjnr
[jidx
+1];
1427 jnrC
= jjnr
[jidx
+2];
1428 jnrD
= jjnr
[jidx
+3];
1429 j_coord_offsetA
= DIM
*jnrA
;
1430 j_coord_offsetB
= DIM
*jnrB
;
1431 j_coord_offsetC
= DIM
*jnrC
;
1432 j_coord_offsetD
= DIM
*jnrD
;
1434 /* load j atom coordinates */
1435 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1436 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1437 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1438 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1440 /* Calculate displacement vector */
1441 dx00
= _mm_sub_ps(ix0
,jx0
);
1442 dy00
= _mm_sub_ps(iy0
,jy0
);
1443 dz00
= _mm_sub_ps(iz0
,jz0
);
1444 dx11
= _mm_sub_ps(ix1
,jx1
);
1445 dy11
= _mm_sub_ps(iy1
,jy1
);
1446 dz11
= _mm_sub_ps(iz1
,jz1
);
1447 dx12
= _mm_sub_ps(ix1
,jx2
);
1448 dy12
= _mm_sub_ps(iy1
,jy2
);
1449 dz12
= _mm_sub_ps(iz1
,jz2
);
1450 dx13
= _mm_sub_ps(ix1
,jx3
);
1451 dy13
= _mm_sub_ps(iy1
,jy3
);
1452 dz13
= _mm_sub_ps(iz1
,jz3
);
1453 dx21
= _mm_sub_ps(ix2
,jx1
);
1454 dy21
= _mm_sub_ps(iy2
,jy1
);
1455 dz21
= _mm_sub_ps(iz2
,jz1
);
1456 dx22
= _mm_sub_ps(ix2
,jx2
);
1457 dy22
= _mm_sub_ps(iy2
,jy2
);
1458 dz22
= _mm_sub_ps(iz2
,jz2
);
1459 dx23
= _mm_sub_ps(ix2
,jx3
);
1460 dy23
= _mm_sub_ps(iy2
,jy3
);
1461 dz23
= _mm_sub_ps(iz2
,jz3
);
1462 dx31
= _mm_sub_ps(ix3
,jx1
);
1463 dy31
= _mm_sub_ps(iy3
,jy1
);
1464 dz31
= _mm_sub_ps(iz3
,jz1
);
1465 dx32
= _mm_sub_ps(ix3
,jx2
);
1466 dy32
= _mm_sub_ps(iy3
,jy2
);
1467 dz32
= _mm_sub_ps(iz3
,jz2
);
1468 dx33
= _mm_sub_ps(ix3
,jx3
);
1469 dy33
= _mm_sub_ps(iy3
,jy3
);
1470 dz33
= _mm_sub_ps(iz3
,jz3
);
1472 /* Calculate squared distance and things based on it */
1473 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1474 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1475 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1476 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1477 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1478 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1479 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1480 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1481 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1482 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1484 rinv11
= sse2_invsqrt_f(rsq11
);
1485 rinv12
= sse2_invsqrt_f(rsq12
);
1486 rinv13
= sse2_invsqrt_f(rsq13
);
1487 rinv21
= sse2_invsqrt_f(rsq21
);
1488 rinv22
= sse2_invsqrt_f(rsq22
);
1489 rinv23
= sse2_invsqrt_f(rsq23
);
1490 rinv31
= sse2_invsqrt_f(rsq31
);
1491 rinv32
= sse2_invsqrt_f(rsq32
);
1492 rinv33
= sse2_invsqrt_f(rsq33
);
1494 rinvsq00
= sse2_inv_f(rsq00
);
1495 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1496 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1497 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1498 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1499 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1500 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1501 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1502 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1503 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1505 fjx0
= _mm_setzero_ps();
1506 fjy0
= _mm_setzero_ps();
1507 fjz0
= _mm_setzero_ps();
1508 fjx1
= _mm_setzero_ps();
1509 fjy1
= _mm_setzero_ps();
1510 fjz1
= _mm_setzero_ps();
1511 fjx2
= _mm_setzero_ps();
1512 fjy2
= _mm_setzero_ps();
1513 fjz2
= _mm_setzero_ps();
1514 fjx3
= _mm_setzero_ps();
1515 fjy3
= _mm_setzero_ps();
1516 fjz3
= _mm_setzero_ps();
1518 /**************************
1519 * CALCULATE INTERACTIONS *
1520 **************************/
1522 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1525 /* LENNARD-JONES DISPERSION/REPULSION */
1527 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1528 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1530 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1534 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1536 /* Calculate temporary vectorial force */
1537 tx
= _mm_mul_ps(fscal
,dx00
);
1538 ty
= _mm_mul_ps(fscal
,dy00
);
1539 tz
= _mm_mul_ps(fscal
,dz00
);
1541 /* Update vectorial force */
1542 fix0
= _mm_add_ps(fix0
,tx
);
1543 fiy0
= _mm_add_ps(fiy0
,ty
);
1544 fiz0
= _mm_add_ps(fiz0
,tz
);
1546 fjx0
= _mm_add_ps(fjx0
,tx
);
1547 fjy0
= _mm_add_ps(fjy0
,ty
);
1548 fjz0
= _mm_add_ps(fjz0
,tz
);
1552 /**************************
1553 * CALCULATE INTERACTIONS *
1554 **************************/
1556 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1559 /* REACTION-FIELD ELECTROSTATICS */
1560 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
1562 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1566 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1568 /* Calculate temporary vectorial force */
1569 tx
= _mm_mul_ps(fscal
,dx11
);
1570 ty
= _mm_mul_ps(fscal
,dy11
);
1571 tz
= _mm_mul_ps(fscal
,dz11
);
1573 /* Update vectorial force */
1574 fix1
= _mm_add_ps(fix1
,tx
);
1575 fiy1
= _mm_add_ps(fiy1
,ty
);
1576 fiz1
= _mm_add_ps(fiz1
,tz
);
1578 fjx1
= _mm_add_ps(fjx1
,tx
);
1579 fjy1
= _mm_add_ps(fjy1
,ty
);
1580 fjz1
= _mm_add_ps(fjz1
,tz
);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1591 /* REACTION-FIELD ELECTROSTATICS */
1592 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
1594 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1598 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1600 /* Calculate temporary vectorial force */
1601 tx
= _mm_mul_ps(fscal
,dx12
);
1602 ty
= _mm_mul_ps(fscal
,dy12
);
1603 tz
= _mm_mul_ps(fscal
,dz12
);
1605 /* Update vectorial force */
1606 fix1
= _mm_add_ps(fix1
,tx
);
1607 fiy1
= _mm_add_ps(fiy1
,ty
);
1608 fiz1
= _mm_add_ps(fiz1
,tz
);
1610 fjx2
= _mm_add_ps(fjx2
,tx
);
1611 fjy2
= _mm_add_ps(fjy2
,ty
);
1612 fjz2
= _mm_add_ps(fjz2
,tz
);
1616 /**************************
1617 * CALCULATE INTERACTIONS *
1618 **************************/
1620 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1623 /* REACTION-FIELD ELECTROSTATICS */
1624 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
1626 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
1630 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1632 /* Calculate temporary vectorial force */
1633 tx
= _mm_mul_ps(fscal
,dx13
);
1634 ty
= _mm_mul_ps(fscal
,dy13
);
1635 tz
= _mm_mul_ps(fscal
,dz13
);
1637 /* Update vectorial force */
1638 fix1
= _mm_add_ps(fix1
,tx
);
1639 fiy1
= _mm_add_ps(fiy1
,ty
);
1640 fiz1
= _mm_add_ps(fiz1
,tz
);
1642 fjx3
= _mm_add_ps(fjx3
,tx
);
1643 fjy3
= _mm_add_ps(fjy3
,ty
);
1644 fjz3
= _mm_add_ps(fjz3
,tz
);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1655 /* REACTION-FIELD ELECTROSTATICS */
1656 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
1658 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1662 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1664 /* Calculate temporary vectorial force */
1665 tx
= _mm_mul_ps(fscal
,dx21
);
1666 ty
= _mm_mul_ps(fscal
,dy21
);
1667 tz
= _mm_mul_ps(fscal
,dz21
);
1669 /* Update vectorial force */
1670 fix2
= _mm_add_ps(fix2
,tx
);
1671 fiy2
= _mm_add_ps(fiy2
,ty
);
1672 fiz2
= _mm_add_ps(fiz2
,tz
);
1674 fjx1
= _mm_add_ps(fjx1
,tx
);
1675 fjy1
= _mm_add_ps(fjy1
,ty
);
1676 fjz1
= _mm_add_ps(fjz1
,tz
);
1680 /**************************
1681 * CALCULATE INTERACTIONS *
1682 **************************/
1684 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1687 /* REACTION-FIELD ELECTROSTATICS */
1688 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1690 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1694 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1696 /* Calculate temporary vectorial force */
1697 tx
= _mm_mul_ps(fscal
,dx22
);
1698 ty
= _mm_mul_ps(fscal
,dy22
);
1699 tz
= _mm_mul_ps(fscal
,dz22
);
1701 /* Update vectorial force */
1702 fix2
= _mm_add_ps(fix2
,tx
);
1703 fiy2
= _mm_add_ps(fiy2
,ty
);
1704 fiz2
= _mm_add_ps(fiz2
,tz
);
1706 fjx2
= _mm_add_ps(fjx2
,tx
);
1707 fjy2
= _mm_add_ps(fjy2
,ty
);
1708 fjz2
= _mm_add_ps(fjz2
,tz
);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1719 /* REACTION-FIELD ELECTROSTATICS */
1720 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
1722 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
1726 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1728 /* Calculate temporary vectorial force */
1729 tx
= _mm_mul_ps(fscal
,dx23
);
1730 ty
= _mm_mul_ps(fscal
,dy23
);
1731 tz
= _mm_mul_ps(fscal
,dz23
);
1733 /* Update vectorial force */
1734 fix2
= _mm_add_ps(fix2
,tx
);
1735 fiy2
= _mm_add_ps(fiy2
,ty
);
1736 fiz2
= _mm_add_ps(fiz2
,tz
);
1738 fjx3
= _mm_add_ps(fjx3
,tx
);
1739 fjy3
= _mm_add_ps(fjy3
,ty
);
1740 fjz3
= _mm_add_ps(fjz3
,tz
);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1751 /* REACTION-FIELD ELECTROSTATICS */
1752 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
1754 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
1758 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1760 /* Calculate temporary vectorial force */
1761 tx
= _mm_mul_ps(fscal
,dx31
);
1762 ty
= _mm_mul_ps(fscal
,dy31
);
1763 tz
= _mm_mul_ps(fscal
,dz31
);
1765 /* Update vectorial force */
1766 fix3
= _mm_add_ps(fix3
,tx
);
1767 fiy3
= _mm_add_ps(fiy3
,ty
);
1768 fiz3
= _mm_add_ps(fiz3
,tz
);
1770 fjx1
= _mm_add_ps(fjx1
,tx
);
1771 fjy1
= _mm_add_ps(fjy1
,ty
);
1772 fjz1
= _mm_add_ps(fjz1
,tz
);
1776 /**************************
1777 * CALCULATE INTERACTIONS *
1778 **************************/
1780 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1783 /* REACTION-FIELD ELECTROSTATICS */
1784 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1786 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
1790 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1792 /* Calculate temporary vectorial force */
1793 tx
= _mm_mul_ps(fscal
,dx32
);
1794 ty
= _mm_mul_ps(fscal
,dy32
);
1795 tz
= _mm_mul_ps(fscal
,dz32
);
1797 /* Update vectorial force */
1798 fix3
= _mm_add_ps(fix3
,tx
);
1799 fiy3
= _mm_add_ps(fiy3
,ty
);
1800 fiz3
= _mm_add_ps(fiz3
,tz
);
1802 fjx2
= _mm_add_ps(fjx2
,tx
);
1803 fjy2
= _mm_add_ps(fjy2
,ty
);
1804 fjz2
= _mm_add_ps(fjz2
,tz
);
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1815 /* REACTION-FIELD ELECTROSTATICS */
1816 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
1818 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
1822 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1824 /* Calculate temporary vectorial force */
1825 tx
= _mm_mul_ps(fscal
,dx33
);
1826 ty
= _mm_mul_ps(fscal
,dy33
);
1827 tz
= _mm_mul_ps(fscal
,dz33
);
1829 /* Update vectorial force */
1830 fix3
= _mm_add_ps(fix3
,tx
);
1831 fiy3
= _mm_add_ps(fiy3
,ty
);
1832 fiz3
= _mm_add_ps(fiz3
,tz
);
1834 fjx3
= _mm_add_ps(fjx3
,tx
);
1835 fjy3
= _mm_add_ps(fjy3
,ty
);
1836 fjz3
= _mm_add_ps(fjz3
,tz
);
1840 fjptrA
= f
+j_coord_offsetA
;
1841 fjptrB
= f
+j_coord_offsetB
;
1842 fjptrC
= f
+j_coord_offsetC
;
1843 fjptrD
= f
+j_coord_offsetD
;
1845 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1846 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1847 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1849 /* Inner loop uses 303 flops */
1852 if(jidx
<j_index_end
)
1855 /* Get j neighbor index, and coordinate index */
1856 jnrlistA
= jjnr
[jidx
];
1857 jnrlistB
= jjnr
[jidx
+1];
1858 jnrlistC
= jjnr
[jidx
+2];
1859 jnrlistD
= jjnr
[jidx
+3];
1860 /* Sign of each element will be negative for non-real atoms.
1861 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1862 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1864 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1865 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1866 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1867 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1868 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1869 j_coord_offsetA
= DIM
*jnrA
;
1870 j_coord_offsetB
= DIM
*jnrB
;
1871 j_coord_offsetC
= DIM
*jnrC
;
1872 j_coord_offsetD
= DIM
*jnrD
;
1874 /* load j atom coordinates */
1875 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1876 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1877 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1878 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1880 /* Calculate displacement vector */
1881 dx00
= _mm_sub_ps(ix0
,jx0
);
1882 dy00
= _mm_sub_ps(iy0
,jy0
);
1883 dz00
= _mm_sub_ps(iz0
,jz0
);
1884 dx11
= _mm_sub_ps(ix1
,jx1
);
1885 dy11
= _mm_sub_ps(iy1
,jy1
);
1886 dz11
= _mm_sub_ps(iz1
,jz1
);
1887 dx12
= _mm_sub_ps(ix1
,jx2
);
1888 dy12
= _mm_sub_ps(iy1
,jy2
);
1889 dz12
= _mm_sub_ps(iz1
,jz2
);
1890 dx13
= _mm_sub_ps(ix1
,jx3
);
1891 dy13
= _mm_sub_ps(iy1
,jy3
);
1892 dz13
= _mm_sub_ps(iz1
,jz3
);
1893 dx21
= _mm_sub_ps(ix2
,jx1
);
1894 dy21
= _mm_sub_ps(iy2
,jy1
);
1895 dz21
= _mm_sub_ps(iz2
,jz1
);
1896 dx22
= _mm_sub_ps(ix2
,jx2
);
1897 dy22
= _mm_sub_ps(iy2
,jy2
);
1898 dz22
= _mm_sub_ps(iz2
,jz2
);
1899 dx23
= _mm_sub_ps(ix2
,jx3
);
1900 dy23
= _mm_sub_ps(iy2
,jy3
);
1901 dz23
= _mm_sub_ps(iz2
,jz3
);
1902 dx31
= _mm_sub_ps(ix3
,jx1
);
1903 dy31
= _mm_sub_ps(iy3
,jy1
);
1904 dz31
= _mm_sub_ps(iz3
,jz1
);
1905 dx32
= _mm_sub_ps(ix3
,jx2
);
1906 dy32
= _mm_sub_ps(iy3
,jy2
);
1907 dz32
= _mm_sub_ps(iz3
,jz2
);
1908 dx33
= _mm_sub_ps(ix3
,jx3
);
1909 dy33
= _mm_sub_ps(iy3
,jy3
);
1910 dz33
= _mm_sub_ps(iz3
,jz3
);
1912 /* Calculate squared distance and things based on it */
1913 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1914 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1915 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1916 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1917 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1918 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1919 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1920 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1921 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1922 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1924 rinv11
= sse2_invsqrt_f(rsq11
);
1925 rinv12
= sse2_invsqrt_f(rsq12
);
1926 rinv13
= sse2_invsqrt_f(rsq13
);
1927 rinv21
= sse2_invsqrt_f(rsq21
);
1928 rinv22
= sse2_invsqrt_f(rsq22
);
1929 rinv23
= sse2_invsqrt_f(rsq23
);
1930 rinv31
= sse2_invsqrt_f(rsq31
);
1931 rinv32
= sse2_invsqrt_f(rsq32
);
1932 rinv33
= sse2_invsqrt_f(rsq33
);
1934 rinvsq00
= sse2_inv_f(rsq00
);
1935 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1936 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1937 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1938 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1939 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1940 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1941 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1942 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1943 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1945 fjx0
= _mm_setzero_ps();
1946 fjy0
= _mm_setzero_ps();
1947 fjz0
= _mm_setzero_ps();
1948 fjx1
= _mm_setzero_ps();
1949 fjy1
= _mm_setzero_ps();
1950 fjz1
= _mm_setzero_ps();
1951 fjx2
= _mm_setzero_ps();
1952 fjy2
= _mm_setzero_ps();
1953 fjz2
= _mm_setzero_ps();
1954 fjx3
= _mm_setzero_ps();
1955 fjy3
= _mm_setzero_ps();
1956 fjz3
= _mm_setzero_ps();
1958 /**************************
1959 * CALCULATE INTERACTIONS *
1960 **************************/
1962 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1965 /* LENNARD-JONES DISPERSION/REPULSION */
1967 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1968 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1970 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1974 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1976 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1978 /* Calculate temporary vectorial force */
1979 tx
= _mm_mul_ps(fscal
,dx00
);
1980 ty
= _mm_mul_ps(fscal
,dy00
);
1981 tz
= _mm_mul_ps(fscal
,dz00
);
1983 /* Update vectorial force */
1984 fix0
= _mm_add_ps(fix0
,tx
);
1985 fiy0
= _mm_add_ps(fiy0
,ty
);
1986 fiz0
= _mm_add_ps(fiz0
,tz
);
1988 fjx0
= _mm_add_ps(fjx0
,tx
);
1989 fjy0
= _mm_add_ps(fjy0
,ty
);
1990 fjz0
= _mm_add_ps(fjz0
,tz
);
1994 /**************************
1995 * CALCULATE INTERACTIONS *
1996 **************************/
1998 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2001 /* REACTION-FIELD ELECTROSTATICS */
2002 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
2004 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
2008 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2010 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2012 /* Calculate temporary vectorial force */
2013 tx
= _mm_mul_ps(fscal
,dx11
);
2014 ty
= _mm_mul_ps(fscal
,dy11
);
2015 tz
= _mm_mul_ps(fscal
,dz11
);
2017 /* Update vectorial force */
2018 fix1
= _mm_add_ps(fix1
,tx
);
2019 fiy1
= _mm_add_ps(fiy1
,ty
);
2020 fiz1
= _mm_add_ps(fiz1
,tz
);
2022 fjx1
= _mm_add_ps(fjx1
,tx
);
2023 fjy1
= _mm_add_ps(fjy1
,ty
);
2024 fjz1
= _mm_add_ps(fjz1
,tz
);
2028 /**************************
2029 * CALCULATE INTERACTIONS *
2030 **************************/
2032 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2035 /* REACTION-FIELD ELECTROSTATICS */
2036 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
2038 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
2042 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2044 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2046 /* Calculate temporary vectorial force */
2047 tx
= _mm_mul_ps(fscal
,dx12
);
2048 ty
= _mm_mul_ps(fscal
,dy12
);
2049 tz
= _mm_mul_ps(fscal
,dz12
);
2051 /* Update vectorial force */
2052 fix1
= _mm_add_ps(fix1
,tx
);
2053 fiy1
= _mm_add_ps(fiy1
,ty
);
2054 fiz1
= _mm_add_ps(fiz1
,tz
);
2056 fjx2
= _mm_add_ps(fjx2
,tx
);
2057 fjy2
= _mm_add_ps(fjy2
,ty
);
2058 fjz2
= _mm_add_ps(fjz2
,tz
);
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2069 /* REACTION-FIELD ELECTROSTATICS */
2070 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
2072 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
2076 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2078 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2080 /* Calculate temporary vectorial force */
2081 tx
= _mm_mul_ps(fscal
,dx13
);
2082 ty
= _mm_mul_ps(fscal
,dy13
);
2083 tz
= _mm_mul_ps(fscal
,dz13
);
2085 /* Update vectorial force */
2086 fix1
= _mm_add_ps(fix1
,tx
);
2087 fiy1
= _mm_add_ps(fiy1
,ty
);
2088 fiz1
= _mm_add_ps(fiz1
,tz
);
2090 fjx3
= _mm_add_ps(fjx3
,tx
);
2091 fjy3
= _mm_add_ps(fjy3
,ty
);
2092 fjz3
= _mm_add_ps(fjz3
,tz
);
2096 /**************************
2097 * CALCULATE INTERACTIONS *
2098 **************************/
2100 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2103 /* REACTION-FIELD ELECTROSTATICS */
2104 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
2106 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
2110 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2112 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2114 /* Calculate temporary vectorial force */
2115 tx
= _mm_mul_ps(fscal
,dx21
);
2116 ty
= _mm_mul_ps(fscal
,dy21
);
2117 tz
= _mm_mul_ps(fscal
,dz21
);
2119 /* Update vectorial force */
2120 fix2
= _mm_add_ps(fix2
,tx
);
2121 fiy2
= _mm_add_ps(fiy2
,ty
);
2122 fiz2
= _mm_add_ps(fiz2
,tz
);
2124 fjx1
= _mm_add_ps(fjx1
,tx
);
2125 fjy1
= _mm_add_ps(fjy1
,ty
);
2126 fjz1
= _mm_add_ps(fjz1
,tz
);
2130 /**************************
2131 * CALCULATE INTERACTIONS *
2132 **************************/
2134 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2137 /* REACTION-FIELD ELECTROSTATICS */
2138 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
2140 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2144 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2146 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2148 /* Calculate temporary vectorial force */
2149 tx
= _mm_mul_ps(fscal
,dx22
);
2150 ty
= _mm_mul_ps(fscal
,dy22
);
2151 tz
= _mm_mul_ps(fscal
,dz22
);
2153 /* Update vectorial force */
2154 fix2
= _mm_add_ps(fix2
,tx
);
2155 fiy2
= _mm_add_ps(fiy2
,ty
);
2156 fiz2
= _mm_add_ps(fiz2
,tz
);
2158 fjx2
= _mm_add_ps(fjx2
,tx
);
2159 fjy2
= _mm_add_ps(fjy2
,ty
);
2160 fjz2
= _mm_add_ps(fjz2
,tz
);
2164 /**************************
2165 * CALCULATE INTERACTIONS *
2166 **************************/
2168 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2171 /* REACTION-FIELD ELECTROSTATICS */
2172 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
2174 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
2178 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2180 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2182 /* Calculate temporary vectorial force */
2183 tx
= _mm_mul_ps(fscal
,dx23
);
2184 ty
= _mm_mul_ps(fscal
,dy23
);
2185 tz
= _mm_mul_ps(fscal
,dz23
);
2187 /* Update vectorial force */
2188 fix2
= _mm_add_ps(fix2
,tx
);
2189 fiy2
= _mm_add_ps(fiy2
,ty
);
2190 fiz2
= _mm_add_ps(fiz2
,tz
);
2192 fjx3
= _mm_add_ps(fjx3
,tx
);
2193 fjy3
= _mm_add_ps(fjy3
,ty
);
2194 fjz3
= _mm_add_ps(fjz3
,tz
);
2198 /**************************
2199 * CALCULATE INTERACTIONS *
2200 **************************/
2202 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2205 /* REACTION-FIELD ELECTROSTATICS */
2206 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
2208 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
2212 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2214 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2216 /* Calculate temporary vectorial force */
2217 tx
= _mm_mul_ps(fscal
,dx31
);
2218 ty
= _mm_mul_ps(fscal
,dy31
);
2219 tz
= _mm_mul_ps(fscal
,dz31
);
2221 /* Update vectorial force */
2222 fix3
= _mm_add_ps(fix3
,tx
);
2223 fiy3
= _mm_add_ps(fiy3
,ty
);
2224 fiz3
= _mm_add_ps(fiz3
,tz
);
2226 fjx1
= _mm_add_ps(fjx1
,tx
);
2227 fjy1
= _mm_add_ps(fjy1
,ty
);
2228 fjz1
= _mm_add_ps(fjz1
,tz
);
2232 /**************************
2233 * CALCULATE INTERACTIONS *
2234 **************************/
2236 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2239 /* REACTION-FIELD ELECTROSTATICS */
2240 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
2242 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
2246 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2248 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2250 /* Calculate temporary vectorial force */
2251 tx
= _mm_mul_ps(fscal
,dx32
);
2252 ty
= _mm_mul_ps(fscal
,dy32
);
2253 tz
= _mm_mul_ps(fscal
,dz32
);
2255 /* Update vectorial force */
2256 fix3
= _mm_add_ps(fix3
,tx
);
2257 fiy3
= _mm_add_ps(fiy3
,ty
);
2258 fiz3
= _mm_add_ps(fiz3
,tz
);
2260 fjx2
= _mm_add_ps(fjx2
,tx
);
2261 fjy2
= _mm_add_ps(fjy2
,ty
);
2262 fjz2
= _mm_add_ps(fjz2
,tz
);
2266 /**************************
2267 * CALCULATE INTERACTIONS *
2268 **************************/
2270 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2273 /* REACTION-FIELD ELECTROSTATICS */
2274 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
2276 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
2280 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2282 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2284 /* Calculate temporary vectorial force */
2285 tx
= _mm_mul_ps(fscal
,dx33
);
2286 ty
= _mm_mul_ps(fscal
,dy33
);
2287 tz
= _mm_mul_ps(fscal
,dz33
);
2289 /* Update vectorial force */
2290 fix3
= _mm_add_ps(fix3
,tx
);
2291 fiy3
= _mm_add_ps(fiy3
,ty
);
2292 fiz3
= _mm_add_ps(fiz3
,tz
);
2294 fjx3
= _mm_add_ps(fjx3
,tx
);
2295 fjy3
= _mm_add_ps(fjy3
,ty
);
2296 fjz3
= _mm_add_ps(fjz3
,tz
);
2300 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2301 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2302 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2303 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2305 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2306 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2307 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2309 /* Inner loop uses 303 flops */
2312 /* End of innermost loop */
2314 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2315 f
+i_coord_offset
,fshift
+i_shift_offset
);
2317 /* Increment number of inner iterations */
2318 inneriter
+= j_index_end
- j_index_start
;
2320 /* Outer loop uses 24 flops */
2323 /* Increment number of outer iterations */
2326 /* Update outer/inner flops */
2328 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*303);