2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
53 * Electrostatics interaction: ReactionField (with explicit cutoff)
54 * VdW interaction: LennardJones (potential-shifted, with explicit cutoff)
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
91 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
92 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
93 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
94 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
95 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
96 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
97 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
98 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
99 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
100 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
101 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
102 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
103 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
104 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
105 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
106 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
107 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
108 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
109 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
110 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
113 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
116 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
117 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
118 __m128 dummy_mask
,cutoff_mask
;
119 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120 __m128 one
= _mm_set1_ps(1.0);
121 __m128 two
= _mm_set1_ps(2.0);
127 jindex
= nlist
->jindex
;
129 shiftidx
= nlist
->shift
;
131 shiftvec
= fr
->shift_vec
[0];
132 fshift
= fr
->fshift
[0];
133 facel
= _mm_set1_ps(fr
->epsfac
);
134 charge
= mdatoms
->chargeA
;
135 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
136 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
137 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
138 nvdwtype
= fr
->ntype
;
140 vdwtype
= mdatoms
->typeA
;
142 /* Setup water-specific parameters */
143 inr
= nlist
->iinr
[0];
144 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
145 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
146 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
147 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
149 jq1
= _mm_set1_ps(charge
[inr
+1]);
150 jq2
= _mm_set1_ps(charge
[inr
+2]);
151 jq3
= _mm_set1_ps(charge
[inr
+3]);
152 vdwjidx0A
= 2*vdwtype
[inr
+0];
153 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
154 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
155 qq11
= _mm_mul_ps(iq1
,jq1
);
156 qq12
= _mm_mul_ps(iq1
,jq2
);
157 qq13
= _mm_mul_ps(iq1
,jq3
);
158 qq21
= _mm_mul_ps(iq2
,jq1
);
159 qq22
= _mm_mul_ps(iq2
,jq2
);
160 qq23
= _mm_mul_ps(iq2
,jq3
);
161 qq31
= _mm_mul_ps(iq3
,jq1
);
162 qq32
= _mm_mul_ps(iq3
,jq2
);
163 qq33
= _mm_mul_ps(iq3
,jq3
);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar
= fr
->rcoulomb
;
167 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
168 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
170 sh_vdw_invrcut6
= _mm_set1_ps(fr
->ic
->sh_invrc6
);
171 rvdw
= _mm_set1_ps(fr
->rvdw
);
173 /* Avoid stupid compiler warnings */
174 jnrA
= jnrB
= jnrC
= jnrD
= 0;
183 for(iidx
=0;iidx
<4*DIM
;iidx
++)
188 /* Start outer loop over neighborlists */
189 for(iidx
=0; iidx
<nri
; iidx
++)
191 /* Load shift vector for this list */
192 i_shift_offset
= DIM
*shiftidx
[iidx
];
194 /* Load limits for loop over neighbors */
195 j_index_start
= jindex
[iidx
];
196 j_index_end
= jindex
[iidx
+1];
198 /* Get outer coordinate index */
200 i_coord_offset
= DIM
*inr
;
202 /* Load i particle coords and add shift vector */
203 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
204 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
206 fix0
= _mm_setzero_ps();
207 fiy0
= _mm_setzero_ps();
208 fiz0
= _mm_setzero_ps();
209 fix1
= _mm_setzero_ps();
210 fiy1
= _mm_setzero_ps();
211 fiz1
= _mm_setzero_ps();
212 fix2
= _mm_setzero_ps();
213 fiy2
= _mm_setzero_ps();
214 fiz2
= _mm_setzero_ps();
215 fix3
= _mm_setzero_ps();
216 fiy3
= _mm_setzero_ps();
217 fiz3
= _mm_setzero_ps();
219 /* Reset potential sums */
220 velecsum
= _mm_setzero_ps();
221 vvdwsum
= _mm_setzero_ps();
223 /* Start inner kernel loop */
224 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
227 /* Get j neighbor index, and coordinate index */
232 j_coord_offsetA
= DIM
*jnrA
;
233 j_coord_offsetB
= DIM
*jnrB
;
234 j_coord_offsetC
= DIM
*jnrC
;
235 j_coord_offsetD
= DIM
*jnrD
;
237 /* load j atom coordinates */
238 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
239 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
240 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
241 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
243 /* Calculate displacement vector */
244 dx00
= _mm_sub_ps(ix0
,jx0
);
245 dy00
= _mm_sub_ps(iy0
,jy0
);
246 dz00
= _mm_sub_ps(iz0
,jz0
);
247 dx11
= _mm_sub_ps(ix1
,jx1
);
248 dy11
= _mm_sub_ps(iy1
,jy1
);
249 dz11
= _mm_sub_ps(iz1
,jz1
);
250 dx12
= _mm_sub_ps(ix1
,jx2
);
251 dy12
= _mm_sub_ps(iy1
,jy2
);
252 dz12
= _mm_sub_ps(iz1
,jz2
);
253 dx13
= _mm_sub_ps(ix1
,jx3
);
254 dy13
= _mm_sub_ps(iy1
,jy3
);
255 dz13
= _mm_sub_ps(iz1
,jz3
);
256 dx21
= _mm_sub_ps(ix2
,jx1
);
257 dy21
= _mm_sub_ps(iy2
,jy1
);
258 dz21
= _mm_sub_ps(iz2
,jz1
);
259 dx22
= _mm_sub_ps(ix2
,jx2
);
260 dy22
= _mm_sub_ps(iy2
,jy2
);
261 dz22
= _mm_sub_ps(iz2
,jz2
);
262 dx23
= _mm_sub_ps(ix2
,jx3
);
263 dy23
= _mm_sub_ps(iy2
,jy3
);
264 dz23
= _mm_sub_ps(iz2
,jz3
);
265 dx31
= _mm_sub_ps(ix3
,jx1
);
266 dy31
= _mm_sub_ps(iy3
,jy1
);
267 dz31
= _mm_sub_ps(iz3
,jz1
);
268 dx32
= _mm_sub_ps(ix3
,jx2
);
269 dy32
= _mm_sub_ps(iy3
,jy2
);
270 dz32
= _mm_sub_ps(iz3
,jz2
);
271 dx33
= _mm_sub_ps(ix3
,jx3
);
272 dy33
= _mm_sub_ps(iy3
,jy3
);
273 dz33
= _mm_sub_ps(iz3
,jz3
);
275 /* Calculate squared distance and things based on it */
276 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
277 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
278 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
279 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
280 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
281 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
282 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
283 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
284 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
285 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
287 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
288 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
289 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
290 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
291 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
292 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
293 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
294 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
295 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
297 rinvsq00
= gmx_mm_inv_ps(rsq00
);
298 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
299 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
300 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
301 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
302 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
303 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
304 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
305 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
306 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
308 fjx0
= _mm_setzero_ps();
309 fjy0
= _mm_setzero_ps();
310 fjz0
= _mm_setzero_ps();
311 fjx1
= _mm_setzero_ps();
312 fjy1
= _mm_setzero_ps();
313 fjz1
= _mm_setzero_ps();
314 fjx2
= _mm_setzero_ps();
315 fjy2
= _mm_setzero_ps();
316 fjz2
= _mm_setzero_ps();
317 fjx3
= _mm_setzero_ps();
318 fjy3
= _mm_setzero_ps();
319 fjz3
= _mm_setzero_ps();
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
328 /* LENNARD-JONES DISPERSION/REPULSION */
330 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
331 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
332 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
333 vvdw
= _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12
, _mm_mul_ps(c12_00
,_mm_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
334 _mm_mul_ps( _mm_sub_ps(vvdw6
,_mm_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
335 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
337 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
339 /* Update potential sum for this i atom from the interaction with this j atom. */
340 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
341 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
345 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
347 /* Calculate temporary vectorial force */
348 tx
= _mm_mul_ps(fscal
,dx00
);
349 ty
= _mm_mul_ps(fscal
,dy00
);
350 tz
= _mm_mul_ps(fscal
,dz00
);
352 /* Update vectorial force */
353 fix0
= _mm_add_ps(fix0
,tx
);
354 fiy0
= _mm_add_ps(fiy0
,ty
);
355 fiz0
= _mm_add_ps(fiz0
,tz
);
357 fjx0
= _mm_add_ps(fjx0
,tx
);
358 fjy0
= _mm_add_ps(fjy0
,ty
);
359 fjz0
= _mm_add_ps(fjz0
,tz
);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
370 /* REACTION-FIELD ELECTROSTATICS */
371 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
372 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
374 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velec
= _mm_and_ps(velec
,cutoff_mask
);
378 velecsum
= _mm_add_ps(velecsum
,velec
);
382 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
384 /* Calculate temporary vectorial force */
385 tx
= _mm_mul_ps(fscal
,dx11
);
386 ty
= _mm_mul_ps(fscal
,dy11
);
387 tz
= _mm_mul_ps(fscal
,dz11
);
389 /* Update vectorial force */
390 fix1
= _mm_add_ps(fix1
,tx
);
391 fiy1
= _mm_add_ps(fiy1
,ty
);
392 fiz1
= _mm_add_ps(fiz1
,tz
);
394 fjx1
= _mm_add_ps(fjx1
,tx
);
395 fjy1
= _mm_add_ps(fjy1
,ty
);
396 fjz1
= _mm_add_ps(fjz1
,tz
);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
409 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
411 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec
= _mm_and_ps(velec
,cutoff_mask
);
415 velecsum
= _mm_add_ps(velecsum
,velec
);
419 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
421 /* Calculate temporary vectorial force */
422 tx
= _mm_mul_ps(fscal
,dx12
);
423 ty
= _mm_mul_ps(fscal
,dy12
);
424 tz
= _mm_mul_ps(fscal
,dz12
);
426 /* Update vectorial force */
427 fix1
= _mm_add_ps(fix1
,tx
);
428 fiy1
= _mm_add_ps(fiy1
,ty
);
429 fiz1
= _mm_add_ps(fiz1
,tz
);
431 fjx2
= _mm_add_ps(fjx2
,tx
);
432 fjy2
= _mm_add_ps(fjy2
,ty
);
433 fjz2
= _mm_add_ps(fjz2
,tz
);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
444 /* REACTION-FIELD ELECTROSTATICS */
445 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_add_ps(rinv13
,_mm_mul_ps(krf
,rsq13
)),crf
));
446 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
448 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec
= _mm_and_ps(velec
,cutoff_mask
);
452 velecsum
= _mm_add_ps(velecsum
,velec
);
456 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
458 /* Calculate temporary vectorial force */
459 tx
= _mm_mul_ps(fscal
,dx13
);
460 ty
= _mm_mul_ps(fscal
,dy13
);
461 tz
= _mm_mul_ps(fscal
,dz13
);
463 /* Update vectorial force */
464 fix1
= _mm_add_ps(fix1
,tx
);
465 fiy1
= _mm_add_ps(fiy1
,ty
);
466 fiz1
= _mm_add_ps(fiz1
,tz
);
468 fjx3
= _mm_add_ps(fjx3
,tx
);
469 fjy3
= _mm_add_ps(fjy3
,ty
);
470 fjz3
= _mm_add_ps(fjz3
,tz
);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
481 /* REACTION-FIELD ELECTROSTATICS */
482 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
483 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
485 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velec
= _mm_and_ps(velec
,cutoff_mask
);
489 velecsum
= _mm_add_ps(velecsum
,velec
);
493 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
495 /* Calculate temporary vectorial force */
496 tx
= _mm_mul_ps(fscal
,dx21
);
497 ty
= _mm_mul_ps(fscal
,dy21
);
498 tz
= _mm_mul_ps(fscal
,dz21
);
500 /* Update vectorial force */
501 fix2
= _mm_add_ps(fix2
,tx
);
502 fiy2
= _mm_add_ps(fiy2
,ty
);
503 fiz2
= _mm_add_ps(fiz2
,tz
);
505 fjx1
= _mm_add_ps(fjx1
,tx
);
506 fjy1
= _mm_add_ps(fjy1
,ty
);
507 fjz1
= _mm_add_ps(fjz1
,tz
);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
518 /* REACTION-FIELD ELECTROSTATICS */
519 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
520 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
522 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velec
= _mm_and_ps(velec
,cutoff_mask
);
526 velecsum
= _mm_add_ps(velecsum
,velec
);
530 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
532 /* Calculate temporary vectorial force */
533 tx
= _mm_mul_ps(fscal
,dx22
);
534 ty
= _mm_mul_ps(fscal
,dy22
);
535 tz
= _mm_mul_ps(fscal
,dz22
);
537 /* Update vectorial force */
538 fix2
= _mm_add_ps(fix2
,tx
);
539 fiy2
= _mm_add_ps(fiy2
,ty
);
540 fiz2
= _mm_add_ps(fiz2
,tz
);
542 fjx2
= _mm_add_ps(fjx2
,tx
);
543 fjy2
= _mm_add_ps(fjy2
,ty
);
544 fjz2
= _mm_add_ps(fjz2
,tz
);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
555 /* REACTION-FIELD ELECTROSTATICS */
556 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_add_ps(rinv23
,_mm_mul_ps(krf
,rsq23
)),crf
));
557 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
559 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velec
= _mm_and_ps(velec
,cutoff_mask
);
563 velecsum
= _mm_add_ps(velecsum
,velec
);
567 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
569 /* Calculate temporary vectorial force */
570 tx
= _mm_mul_ps(fscal
,dx23
);
571 ty
= _mm_mul_ps(fscal
,dy23
);
572 tz
= _mm_mul_ps(fscal
,dz23
);
574 /* Update vectorial force */
575 fix2
= _mm_add_ps(fix2
,tx
);
576 fiy2
= _mm_add_ps(fiy2
,ty
);
577 fiz2
= _mm_add_ps(fiz2
,tz
);
579 fjx3
= _mm_add_ps(fjx3
,tx
);
580 fjy3
= _mm_add_ps(fjy3
,ty
);
581 fjz3
= _mm_add_ps(fjz3
,tz
);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
592 /* REACTION-FIELD ELECTROSTATICS */
593 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_add_ps(rinv31
,_mm_mul_ps(krf
,rsq31
)),crf
));
594 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
596 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec
= _mm_and_ps(velec
,cutoff_mask
);
600 velecsum
= _mm_add_ps(velecsum
,velec
);
604 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
606 /* Calculate temporary vectorial force */
607 tx
= _mm_mul_ps(fscal
,dx31
);
608 ty
= _mm_mul_ps(fscal
,dy31
);
609 tz
= _mm_mul_ps(fscal
,dz31
);
611 /* Update vectorial force */
612 fix3
= _mm_add_ps(fix3
,tx
);
613 fiy3
= _mm_add_ps(fiy3
,ty
);
614 fiz3
= _mm_add_ps(fiz3
,tz
);
616 fjx1
= _mm_add_ps(fjx1
,tx
);
617 fjy1
= _mm_add_ps(fjy1
,ty
);
618 fjz1
= _mm_add_ps(fjz1
,tz
);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
629 /* REACTION-FIELD ELECTROSTATICS */
630 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_add_ps(rinv32
,_mm_mul_ps(krf
,rsq32
)),crf
));
631 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
633 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
635 /* Update potential sum for this i atom from the interaction with this j atom. */
636 velec
= _mm_and_ps(velec
,cutoff_mask
);
637 velecsum
= _mm_add_ps(velecsum
,velec
);
641 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
643 /* Calculate temporary vectorial force */
644 tx
= _mm_mul_ps(fscal
,dx32
);
645 ty
= _mm_mul_ps(fscal
,dy32
);
646 tz
= _mm_mul_ps(fscal
,dz32
);
648 /* Update vectorial force */
649 fix3
= _mm_add_ps(fix3
,tx
);
650 fiy3
= _mm_add_ps(fiy3
,ty
);
651 fiz3
= _mm_add_ps(fiz3
,tz
);
653 fjx2
= _mm_add_ps(fjx2
,tx
);
654 fjy2
= _mm_add_ps(fjy2
,ty
);
655 fjz2
= _mm_add_ps(fjz2
,tz
);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
666 /* REACTION-FIELD ELECTROSTATICS */
667 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_add_ps(rinv33
,_mm_mul_ps(krf
,rsq33
)),crf
));
668 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
670 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec
= _mm_and_ps(velec
,cutoff_mask
);
674 velecsum
= _mm_add_ps(velecsum
,velec
);
678 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
680 /* Calculate temporary vectorial force */
681 tx
= _mm_mul_ps(fscal
,dx33
);
682 ty
= _mm_mul_ps(fscal
,dy33
);
683 tz
= _mm_mul_ps(fscal
,dz33
);
685 /* Update vectorial force */
686 fix3
= _mm_add_ps(fix3
,tx
);
687 fiy3
= _mm_add_ps(fiy3
,ty
);
688 fiz3
= _mm_add_ps(fiz3
,tz
);
690 fjx3
= _mm_add_ps(fjx3
,tx
);
691 fjy3
= _mm_add_ps(fjy3
,ty
);
692 fjz3
= _mm_add_ps(fjz3
,tz
);
696 fjptrA
= f
+j_coord_offsetA
;
697 fjptrB
= f
+j_coord_offsetB
;
698 fjptrC
= f
+j_coord_offsetC
;
699 fjptrD
= f
+j_coord_offsetD
;
701 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
702 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
703 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
705 /* Inner loop uses 368 flops */
711 /* Get j neighbor index, and coordinate index */
712 jnrlistA
= jjnr
[jidx
];
713 jnrlistB
= jjnr
[jidx
+1];
714 jnrlistC
= jjnr
[jidx
+2];
715 jnrlistD
= jjnr
[jidx
+3];
716 /* Sign of each element will be negative for non-real atoms.
717 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
718 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
720 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
721 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
722 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
723 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
724 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
725 j_coord_offsetA
= DIM
*jnrA
;
726 j_coord_offsetB
= DIM
*jnrB
;
727 j_coord_offsetC
= DIM
*jnrC
;
728 j_coord_offsetD
= DIM
*jnrD
;
730 /* load j atom coordinates */
731 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
732 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
733 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
734 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
736 /* Calculate displacement vector */
737 dx00
= _mm_sub_ps(ix0
,jx0
);
738 dy00
= _mm_sub_ps(iy0
,jy0
);
739 dz00
= _mm_sub_ps(iz0
,jz0
);
740 dx11
= _mm_sub_ps(ix1
,jx1
);
741 dy11
= _mm_sub_ps(iy1
,jy1
);
742 dz11
= _mm_sub_ps(iz1
,jz1
);
743 dx12
= _mm_sub_ps(ix1
,jx2
);
744 dy12
= _mm_sub_ps(iy1
,jy2
);
745 dz12
= _mm_sub_ps(iz1
,jz2
);
746 dx13
= _mm_sub_ps(ix1
,jx3
);
747 dy13
= _mm_sub_ps(iy1
,jy3
);
748 dz13
= _mm_sub_ps(iz1
,jz3
);
749 dx21
= _mm_sub_ps(ix2
,jx1
);
750 dy21
= _mm_sub_ps(iy2
,jy1
);
751 dz21
= _mm_sub_ps(iz2
,jz1
);
752 dx22
= _mm_sub_ps(ix2
,jx2
);
753 dy22
= _mm_sub_ps(iy2
,jy2
);
754 dz22
= _mm_sub_ps(iz2
,jz2
);
755 dx23
= _mm_sub_ps(ix2
,jx3
);
756 dy23
= _mm_sub_ps(iy2
,jy3
);
757 dz23
= _mm_sub_ps(iz2
,jz3
);
758 dx31
= _mm_sub_ps(ix3
,jx1
);
759 dy31
= _mm_sub_ps(iy3
,jy1
);
760 dz31
= _mm_sub_ps(iz3
,jz1
);
761 dx32
= _mm_sub_ps(ix3
,jx2
);
762 dy32
= _mm_sub_ps(iy3
,jy2
);
763 dz32
= _mm_sub_ps(iz3
,jz2
);
764 dx33
= _mm_sub_ps(ix3
,jx3
);
765 dy33
= _mm_sub_ps(iy3
,jy3
);
766 dz33
= _mm_sub_ps(iz3
,jz3
);
768 /* Calculate squared distance and things based on it */
769 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
770 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
771 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
772 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
773 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
774 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
775 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
776 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
777 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
778 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
780 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
781 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
782 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
783 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
784 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
785 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
786 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
787 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
788 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
790 rinvsq00
= gmx_mm_inv_ps(rsq00
);
791 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
792 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
793 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
794 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
795 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
796 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
797 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
798 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
799 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
801 fjx0
= _mm_setzero_ps();
802 fjy0
= _mm_setzero_ps();
803 fjz0
= _mm_setzero_ps();
804 fjx1
= _mm_setzero_ps();
805 fjy1
= _mm_setzero_ps();
806 fjz1
= _mm_setzero_ps();
807 fjx2
= _mm_setzero_ps();
808 fjy2
= _mm_setzero_ps();
809 fjz2
= _mm_setzero_ps();
810 fjx3
= _mm_setzero_ps();
811 fjy3
= _mm_setzero_ps();
812 fjz3
= _mm_setzero_ps();
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
821 /* LENNARD-JONES DISPERSION/REPULSION */
823 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
824 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
825 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
826 vvdw
= _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12
, _mm_mul_ps(c12_00
,_mm_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
827 _mm_mul_ps( _mm_sub_ps(vvdw6
,_mm_mul_ps(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
828 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
830 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
832 /* Update potential sum for this i atom from the interaction with this j atom. */
833 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
834 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
835 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
839 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
841 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
843 /* Calculate temporary vectorial force */
844 tx
= _mm_mul_ps(fscal
,dx00
);
845 ty
= _mm_mul_ps(fscal
,dy00
);
846 tz
= _mm_mul_ps(fscal
,dz00
);
848 /* Update vectorial force */
849 fix0
= _mm_add_ps(fix0
,tx
);
850 fiy0
= _mm_add_ps(fiy0
,ty
);
851 fiz0
= _mm_add_ps(fiz0
,tz
);
853 fjx0
= _mm_add_ps(fjx0
,tx
);
854 fjy0
= _mm_add_ps(fjy0
,ty
);
855 fjz0
= _mm_add_ps(fjz0
,tz
);
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
863 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
866 /* REACTION-FIELD ELECTROSTATICS */
867 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
868 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
870 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
872 /* Update potential sum for this i atom from the interaction with this j atom. */
873 velec
= _mm_and_ps(velec
,cutoff_mask
);
874 velec
= _mm_andnot_ps(dummy_mask
,velec
);
875 velecsum
= _mm_add_ps(velecsum
,velec
);
879 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
881 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
883 /* Calculate temporary vectorial force */
884 tx
= _mm_mul_ps(fscal
,dx11
);
885 ty
= _mm_mul_ps(fscal
,dy11
);
886 tz
= _mm_mul_ps(fscal
,dz11
);
888 /* Update vectorial force */
889 fix1
= _mm_add_ps(fix1
,tx
);
890 fiy1
= _mm_add_ps(fiy1
,ty
);
891 fiz1
= _mm_add_ps(fiz1
,tz
);
893 fjx1
= _mm_add_ps(fjx1
,tx
);
894 fjy1
= _mm_add_ps(fjy1
,ty
);
895 fjz1
= _mm_add_ps(fjz1
,tz
);
899 /**************************
900 * CALCULATE INTERACTIONS *
901 **************************/
903 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
906 /* REACTION-FIELD ELECTROSTATICS */
907 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
908 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
910 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
912 /* Update potential sum for this i atom from the interaction with this j atom. */
913 velec
= _mm_and_ps(velec
,cutoff_mask
);
914 velec
= _mm_andnot_ps(dummy_mask
,velec
);
915 velecsum
= _mm_add_ps(velecsum
,velec
);
919 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
921 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
923 /* Calculate temporary vectorial force */
924 tx
= _mm_mul_ps(fscal
,dx12
);
925 ty
= _mm_mul_ps(fscal
,dy12
);
926 tz
= _mm_mul_ps(fscal
,dz12
);
928 /* Update vectorial force */
929 fix1
= _mm_add_ps(fix1
,tx
);
930 fiy1
= _mm_add_ps(fiy1
,ty
);
931 fiz1
= _mm_add_ps(fiz1
,tz
);
933 fjx2
= _mm_add_ps(fjx2
,tx
);
934 fjy2
= _mm_add_ps(fjy2
,ty
);
935 fjz2
= _mm_add_ps(fjz2
,tz
);
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
946 /* REACTION-FIELD ELECTROSTATICS */
947 velec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_add_ps(rinv13
,_mm_mul_ps(krf
,rsq13
)),crf
));
948 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
950 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec
= _mm_and_ps(velec
,cutoff_mask
);
954 velec
= _mm_andnot_ps(dummy_mask
,velec
);
955 velecsum
= _mm_add_ps(velecsum
,velec
);
959 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
961 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
963 /* Calculate temporary vectorial force */
964 tx
= _mm_mul_ps(fscal
,dx13
);
965 ty
= _mm_mul_ps(fscal
,dy13
);
966 tz
= _mm_mul_ps(fscal
,dz13
);
968 /* Update vectorial force */
969 fix1
= _mm_add_ps(fix1
,tx
);
970 fiy1
= _mm_add_ps(fiy1
,ty
);
971 fiz1
= _mm_add_ps(fiz1
,tz
);
973 fjx3
= _mm_add_ps(fjx3
,tx
);
974 fjy3
= _mm_add_ps(fjy3
,ty
);
975 fjz3
= _mm_add_ps(fjz3
,tz
);
979 /**************************
980 * CALCULATE INTERACTIONS *
981 **************************/
983 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
986 /* REACTION-FIELD ELECTROSTATICS */
987 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
988 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
990 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec
= _mm_and_ps(velec
,cutoff_mask
);
994 velec
= _mm_andnot_ps(dummy_mask
,velec
);
995 velecsum
= _mm_add_ps(velecsum
,velec
);
999 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1001 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1003 /* Calculate temporary vectorial force */
1004 tx
= _mm_mul_ps(fscal
,dx21
);
1005 ty
= _mm_mul_ps(fscal
,dy21
);
1006 tz
= _mm_mul_ps(fscal
,dz21
);
1008 /* Update vectorial force */
1009 fix2
= _mm_add_ps(fix2
,tx
);
1010 fiy2
= _mm_add_ps(fiy2
,ty
);
1011 fiz2
= _mm_add_ps(fiz2
,tz
);
1013 fjx1
= _mm_add_ps(fjx1
,tx
);
1014 fjy1
= _mm_add_ps(fjy1
,ty
);
1015 fjz1
= _mm_add_ps(fjz1
,tz
);
1019 /**************************
1020 * CALCULATE INTERACTIONS *
1021 **************************/
1023 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1026 /* REACTION-FIELD ELECTROSTATICS */
1027 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
1028 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1030 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1032 /* Update potential sum for this i atom from the interaction with this j atom. */
1033 velec
= _mm_and_ps(velec
,cutoff_mask
);
1034 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1035 velecsum
= _mm_add_ps(velecsum
,velec
);
1039 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1041 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1043 /* Calculate temporary vectorial force */
1044 tx
= _mm_mul_ps(fscal
,dx22
);
1045 ty
= _mm_mul_ps(fscal
,dy22
);
1046 tz
= _mm_mul_ps(fscal
,dz22
);
1048 /* Update vectorial force */
1049 fix2
= _mm_add_ps(fix2
,tx
);
1050 fiy2
= _mm_add_ps(fiy2
,ty
);
1051 fiz2
= _mm_add_ps(fiz2
,tz
);
1053 fjx2
= _mm_add_ps(fjx2
,tx
);
1054 fjy2
= _mm_add_ps(fjy2
,ty
);
1055 fjz2
= _mm_add_ps(fjz2
,tz
);
1059 /**************************
1060 * CALCULATE INTERACTIONS *
1061 **************************/
1063 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1066 /* REACTION-FIELD ELECTROSTATICS */
1067 velec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_add_ps(rinv23
,_mm_mul_ps(krf
,rsq23
)),crf
));
1068 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
1070 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec
= _mm_and_ps(velec
,cutoff_mask
);
1074 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1075 velecsum
= _mm_add_ps(velecsum
,velec
);
1079 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1081 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1083 /* Calculate temporary vectorial force */
1084 tx
= _mm_mul_ps(fscal
,dx23
);
1085 ty
= _mm_mul_ps(fscal
,dy23
);
1086 tz
= _mm_mul_ps(fscal
,dz23
);
1088 /* Update vectorial force */
1089 fix2
= _mm_add_ps(fix2
,tx
);
1090 fiy2
= _mm_add_ps(fiy2
,ty
);
1091 fiz2
= _mm_add_ps(fiz2
,tz
);
1093 fjx3
= _mm_add_ps(fjx3
,tx
);
1094 fjy3
= _mm_add_ps(fjy3
,ty
);
1095 fjz3
= _mm_add_ps(fjz3
,tz
);
1099 /**************************
1100 * CALCULATE INTERACTIONS *
1101 **************************/
1103 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1106 /* REACTION-FIELD ELECTROSTATICS */
1107 velec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_add_ps(rinv31
,_mm_mul_ps(krf
,rsq31
)),crf
));
1108 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
1110 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
1112 /* Update potential sum for this i atom from the interaction with this j atom. */
1113 velec
= _mm_and_ps(velec
,cutoff_mask
);
1114 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1115 velecsum
= _mm_add_ps(velecsum
,velec
);
1119 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1121 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1123 /* Calculate temporary vectorial force */
1124 tx
= _mm_mul_ps(fscal
,dx31
);
1125 ty
= _mm_mul_ps(fscal
,dy31
);
1126 tz
= _mm_mul_ps(fscal
,dz31
);
1128 /* Update vectorial force */
1129 fix3
= _mm_add_ps(fix3
,tx
);
1130 fiy3
= _mm_add_ps(fiy3
,ty
);
1131 fiz3
= _mm_add_ps(fiz3
,tz
);
1133 fjx1
= _mm_add_ps(fjx1
,tx
);
1134 fjy1
= _mm_add_ps(fjy1
,ty
);
1135 fjz1
= _mm_add_ps(fjz1
,tz
);
1139 /**************************
1140 * CALCULATE INTERACTIONS *
1141 **************************/
1143 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1146 /* REACTION-FIELD ELECTROSTATICS */
1147 velec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_add_ps(rinv32
,_mm_mul_ps(krf
,rsq32
)),crf
));
1148 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1150 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec
= _mm_and_ps(velec
,cutoff_mask
);
1154 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1155 velecsum
= _mm_add_ps(velecsum
,velec
);
1159 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1161 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1163 /* Calculate temporary vectorial force */
1164 tx
= _mm_mul_ps(fscal
,dx32
);
1165 ty
= _mm_mul_ps(fscal
,dy32
);
1166 tz
= _mm_mul_ps(fscal
,dz32
);
1168 /* Update vectorial force */
1169 fix3
= _mm_add_ps(fix3
,tx
);
1170 fiy3
= _mm_add_ps(fiy3
,ty
);
1171 fiz3
= _mm_add_ps(fiz3
,tz
);
1173 fjx2
= _mm_add_ps(fjx2
,tx
);
1174 fjy2
= _mm_add_ps(fjy2
,ty
);
1175 fjz2
= _mm_add_ps(fjz2
,tz
);
1179 /**************************
1180 * CALCULATE INTERACTIONS *
1181 **************************/
1183 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1186 /* REACTION-FIELD ELECTROSTATICS */
1187 velec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_add_ps(rinv33
,_mm_mul_ps(krf
,rsq33
)),crf
));
1188 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
1190 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
1192 /* Update potential sum for this i atom from the interaction with this j atom. */
1193 velec
= _mm_and_ps(velec
,cutoff_mask
);
1194 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1195 velecsum
= _mm_add_ps(velecsum
,velec
);
1199 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1201 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1203 /* Calculate temporary vectorial force */
1204 tx
= _mm_mul_ps(fscal
,dx33
);
1205 ty
= _mm_mul_ps(fscal
,dy33
);
1206 tz
= _mm_mul_ps(fscal
,dz33
);
1208 /* Update vectorial force */
1209 fix3
= _mm_add_ps(fix3
,tx
);
1210 fiy3
= _mm_add_ps(fiy3
,ty
);
1211 fiz3
= _mm_add_ps(fiz3
,tz
);
1213 fjx3
= _mm_add_ps(fjx3
,tx
);
1214 fjy3
= _mm_add_ps(fjy3
,ty
);
1215 fjz3
= _mm_add_ps(fjz3
,tz
);
1219 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1220 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1221 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1222 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1224 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1225 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1226 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1228 /* Inner loop uses 368 flops */
1231 /* End of innermost loop */
1233 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1234 f
+i_coord_offset
,fshift
+i_shift_offset
);
1237 /* Update potential energies */
1238 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1239 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1241 /* Increment number of inner iterations */
1242 inneriter
+= j_index_end
- j_index_start
;
1244 /* Outer loop uses 26 flops */
1247 /* Increment number of outer iterations */
1250 /* Update outer/inner flops */
1252 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*368);
1255 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
1256 * Electrostatics interaction: ReactionField
1257 * VdW interaction: LennardJones
1258 * Geometry: Water4-Water4
1259 * Calculate force/pot: Force
1262 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
1263 (t_nblist
* gmx_restrict nlist
,
1264 rvec
* gmx_restrict xx
,
1265 rvec
* gmx_restrict ff
,
1266 t_forcerec
* gmx_restrict fr
,
1267 t_mdatoms
* gmx_restrict mdatoms
,
1268 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1269 t_nrnb
* gmx_restrict nrnb
)
1271 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1272 * just 0 for non-waters.
1273 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1274 * jnr indices corresponding to data put in the four positions in the SIMD register.
1276 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1277 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1278 int jnrA
,jnrB
,jnrC
,jnrD
;
1279 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1280 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1281 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1282 real rcutoff_scalar
;
1283 real
*shiftvec
,*fshift
,*x
,*f
;
1284 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1285 real scratch
[4*DIM
];
1286 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1288 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1290 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1292 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1294 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1295 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1296 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1297 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1298 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1299 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1300 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1301 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1302 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1303 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1304 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1305 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1306 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1307 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1308 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1309 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1310 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1311 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1312 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1313 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1316 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1319 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1320 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1321 __m128 dummy_mask
,cutoff_mask
;
1322 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1323 __m128 one
= _mm_set1_ps(1.0);
1324 __m128 two
= _mm_set1_ps(2.0);
1330 jindex
= nlist
->jindex
;
1332 shiftidx
= nlist
->shift
;
1334 shiftvec
= fr
->shift_vec
[0];
1335 fshift
= fr
->fshift
[0];
1336 facel
= _mm_set1_ps(fr
->epsfac
);
1337 charge
= mdatoms
->chargeA
;
1338 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
1339 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
1340 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
1341 nvdwtype
= fr
->ntype
;
1342 vdwparam
= fr
->nbfp
;
1343 vdwtype
= mdatoms
->typeA
;
1345 /* Setup water-specific parameters */
1346 inr
= nlist
->iinr
[0];
1347 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1348 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1349 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1350 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1352 jq1
= _mm_set1_ps(charge
[inr
+1]);
1353 jq2
= _mm_set1_ps(charge
[inr
+2]);
1354 jq3
= _mm_set1_ps(charge
[inr
+3]);
1355 vdwjidx0A
= 2*vdwtype
[inr
+0];
1356 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1357 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1358 qq11
= _mm_mul_ps(iq1
,jq1
);
1359 qq12
= _mm_mul_ps(iq1
,jq2
);
1360 qq13
= _mm_mul_ps(iq1
,jq3
);
1361 qq21
= _mm_mul_ps(iq2
,jq1
);
1362 qq22
= _mm_mul_ps(iq2
,jq2
);
1363 qq23
= _mm_mul_ps(iq2
,jq3
);
1364 qq31
= _mm_mul_ps(iq3
,jq1
);
1365 qq32
= _mm_mul_ps(iq3
,jq2
);
1366 qq33
= _mm_mul_ps(iq3
,jq3
);
1368 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1369 rcutoff_scalar
= fr
->rcoulomb
;
1370 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
1371 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
1373 sh_vdw_invrcut6
= _mm_set1_ps(fr
->ic
->sh_invrc6
);
1374 rvdw
= _mm_set1_ps(fr
->rvdw
);
1376 /* Avoid stupid compiler warnings */
1377 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1378 j_coord_offsetA
= 0;
1379 j_coord_offsetB
= 0;
1380 j_coord_offsetC
= 0;
1381 j_coord_offsetD
= 0;
1386 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1388 scratch
[iidx
] = 0.0;
1391 /* Start outer loop over neighborlists */
1392 for(iidx
=0; iidx
<nri
; iidx
++)
1394 /* Load shift vector for this list */
1395 i_shift_offset
= DIM
*shiftidx
[iidx
];
1397 /* Load limits for loop over neighbors */
1398 j_index_start
= jindex
[iidx
];
1399 j_index_end
= jindex
[iidx
+1];
1401 /* Get outer coordinate index */
1403 i_coord_offset
= DIM
*inr
;
1405 /* Load i particle coords and add shift vector */
1406 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1407 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1409 fix0
= _mm_setzero_ps();
1410 fiy0
= _mm_setzero_ps();
1411 fiz0
= _mm_setzero_ps();
1412 fix1
= _mm_setzero_ps();
1413 fiy1
= _mm_setzero_ps();
1414 fiz1
= _mm_setzero_ps();
1415 fix2
= _mm_setzero_ps();
1416 fiy2
= _mm_setzero_ps();
1417 fiz2
= _mm_setzero_ps();
1418 fix3
= _mm_setzero_ps();
1419 fiy3
= _mm_setzero_ps();
1420 fiz3
= _mm_setzero_ps();
1422 /* Start inner kernel loop */
1423 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1426 /* Get j neighbor index, and coordinate index */
1428 jnrB
= jjnr
[jidx
+1];
1429 jnrC
= jjnr
[jidx
+2];
1430 jnrD
= jjnr
[jidx
+3];
1431 j_coord_offsetA
= DIM
*jnrA
;
1432 j_coord_offsetB
= DIM
*jnrB
;
1433 j_coord_offsetC
= DIM
*jnrC
;
1434 j_coord_offsetD
= DIM
*jnrD
;
1436 /* load j atom coordinates */
1437 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1438 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1439 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1440 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1442 /* Calculate displacement vector */
1443 dx00
= _mm_sub_ps(ix0
,jx0
);
1444 dy00
= _mm_sub_ps(iy0
,jy0
);
1445 dz00
= _mm_sub_ps(iz0
,jz0
);
1446 dx11
= _mm_sub_ps(ix1
,jx1
);
1447 dy11
= _mm_sub_ps(iy1
,jy1
);
1448 dz11
= _mm_sub_ps(iz1
,jz1
);
1449 dx12
= _mm_sub_ps(ix1
,jx2
);
1450 dy12
= _mm_sub_ps(iy1
,jy2
);
1451 dz12
= _mm_sub_ps(iz1
,jz2
);
1452 dx13
= _mm_sub_ps(ix1
,jx3
);
1453 dy13
= _mm_sub_ps(iy1
,jy3
);
1454 dz13
= _mm_sub_ps(iz1
,jz3
);
1455 dx21
= _mm_sub_ps(ix2
,jx1
);
1456 dy21
= _mm_sub_ps(iy2
,jy1
);
1457 dz21
= _mm_sub_ps(iz2
,jz1
);
1458 dx22
= _mm_sub_ps(ix2
,jx2
);
1459 dy22
= _mm_sub_ps(iy2
,jy2
);
1460 dz22
= _mm_sub_ps(iz2
,jz2
);
1461 dx23
= _mm_sub_ps(ix2
,jx3
);
1462 dy23
= _mm_sub_ps(iy2
,jy3
);
1463 dz23
= _mm_sub_ps(iz2
,jz3
);
1464 dx31
= _mm_sub_ps(ix3
,jx1
);
1465 dy31
= _mm_sub_ps(iy3
,jy1
);
1466 dz31
= _mm_sub_ps(iz3
,jz1
);
1467 dx32
= _mm_sub_ps(ix3
,jx2
);
1468 dy32
= _mm_sub_ps(iy3
,jy2
);
1469 dz32
= _mm_sub_ps(iz3
,jz2
);
1470 dx33
= _mm_sub_ps(ix3
,jx3
);
1471 dy33
= _mm_sub_ps(iy3
,jy3
);
1472 dz33
= _mm_sub_ps(iz3
,jz3
);
1474 /* Calculate squared distance and things based on it */
1475 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1476 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1477 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1478 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1479 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1480 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1481 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1482 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1483 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1484 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1486 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1487 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1488 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1489 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1490 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1491 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1492 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1493 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1494 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1496 rinvsq00
= gmx_mm_inv_ps(rsq00
);
1497 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1498 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1499 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1500 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1501 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1502 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1503 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1504 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1505 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1507 fjx0
= _mm_setzero_ps();
1508 fjy0
= _mm_setzero_ps();
1509 fjz0
= _mm_setzero_ps();
1510 fjx1
= _mm_setzero_ps();
1511 fjy1
= _mm_setzero_ps();
1512 fjz1
= _mm_setzero_ps();
1513 fjx2
= _mm_setzero_ps();
1514 fjy2
= _mm_setzero_ps();
1515 fjz2
= _mm_setzero_ps();
1516 fjx3
= _mm_setzero_ps();
1517 fjy3
= _mm_setzero_ps();
1518 fjz3
= _mm_setzero_ps();
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1527 /* LENNARD-JONES DISPERSION/REPULSION */
1529 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1530 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1532 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1536 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1538 /* Calculate temporary vectorial force */
1539 tx
= _mm_mul_ps(fscal
,dx00
);
1540 ty
= _mm_mul_ps(fscal
,dy00
);
1541 tz
= _mm_mul_ps(fscal
,dz00
);
1543 /* Update vectorial force */
1544 fix0
= _mm_add_ps(fix0
,tx
);
1545 fiy0
= _mm_add_ps(fiy0
,ty
);
1546 fiz0
= _mm_add_ps(fiz0
,tz
);
1548 fjx0
= _mm_add_ps(fjx0
,tx
);
1549 fjy0
= _mm_add_ps(fjy0
,ty
);
1550 fjz0
= _mm_add_ps(fjz0
,tz
);
1554 /**************************
1555 * CALCULATE INTERACTIONS *
1556 **************************/
1558 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1561 /* REACTION-FIELD ELECTROSTATICS */
1562 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
1564 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1568 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1570 /* Calculate temporary vectorial force */
1571 tx
= _mm_mul_ps(fscal
,dx11
);
1572 ty
= _mm_mul_ps(fscal
,dy11
);
1573 tz
= _mm_mul_ps(fscal
,dz11
);
1575 /* Update vectorial force */
1576 fix1
= _mm_add_ps(fix1
,tx
);
1577 fiy1
= _mm_add_ps(fiy1
,ty
);
1578 fiz1
= _mm_add_ps(fiz1
,tz
);
1580 fjx1
= _mm_add_ps(fjx1
,tx
);
1581 fjy1
= _mm_add_ps(fjy1
,ty
);
1582 fjz1
= _mm_add_ps(fjz1
,tz
);
1586 /**************************
1587 * CALCULATE INTERACTIONS *
1588 **************************/
1590 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1593 /* REACTION-FIELD ELECTROSTATICS */
1594 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
1596 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1600 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1602 /* Calculate temporary vectorial force */
1603 tx
= _mm_mul_ps(fscal
,dx12
);
1604 ty
= _mm_mul_ps(fscal
,dy12
);
1605 tz
= _mm_mul_ps(fscal
,dz12
);
1607 /* Update vectorial force */
1608 fix1
= _mm_add_ps(fix1
,tx
);
1609 fiy1
= _mm_add_ps(fiy1
,ty
);
1610 fiz1
= _mm_add_ps(fiz1
,tz
);
1612 fjx2
= _mm_add_ps(fjx2
,tx
);
1613 fjy2
= _mm_add_ps(fjy2
,ty
);
1614 fjz2
= _mm_add_ps(fjz2
,tz
);
1618 /**************************
1619 * CALCULATE INTERACTIONS *
1620 **************************/
1622 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1625 /* REACTION-FIELD ELECTROSTATICS */
1626 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
1628 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
1632 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1634 /* Calculate temporary vectorial force */
1635 tx
= _mm_mul_ps(fscal
,dx13
);
1636 ty
= _mm_mul_ps(fscal
,dy13
);
1637 tz
= _mm_mul_ps(fscal
,dz13
);
1639 /* Update vectorial force */
1640 fix1
= _mm_add_ps(fix1
,tx
);
1641 fiy1
= _mm_add_ps(fiy1
,ty
);
1642 fiz1
= _mm_add_ps(fiz1
,tz
);
1644 fjx3
= _mm_add_ps(fjx3
,tx
);
1645 fjy3
= _mm_add_ps(fjy3
,ty
);
1646 fjz3
= _mm_add_ps(fjz3
,tz
);
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1657 /* REACTION-FIELD ELECTROSTATICS */
1658 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
1660 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1664 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1666 /* Calculate temporary vectorial force */
1667 tx
= _mm_mul_ps(fscal
,dx21
);
1668 ty
= _mm_mul_ps(fscal
,dy21
);
1669 tz
= _mm_mul_ps(fscal
,dz21
);
1671 /* Update vectorial force */
1672 fix2
= _mm_add_ps(fix2
,tx
);
1673 fiy2
= _mm_add_ps(fiy2
,ty
);
1674 fiz2
= _mm_add_ps(fiz2
,tz
);
1676 fjx1
= _mm_add_ps(fjx1
,tx
);
1677 fjy1
= _mm_add_ps(fjy1
,ty
);
1678 fjz1
= _mm_add_ps(fjz1
,tz
);
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1689 /* REACTION-FIELD ELECTROSTATICS */
1690 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1692 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1696 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1698 /* Calculate temporary vectorial force */
1699 tx
= _mm_mul_ps(fscal
,dx22
);
1700 ty
= _mm_mul_ps(fscal
,dy22
);
1701 tz
= _mm_mul_ps(fscal
,dz22
);
1703 /* Update vectorial force */
1704 fix2
= _mm_add_ps(fix2
,tx
);
1705 fiy2
= _mm_add_ps(fiy2
,ty
);
1706 fiz2
= _mm_add_ps(fiz2
,tz
);
1708 fjx2
= _mm_add_ps(fjx2
,tx
);
1709 fjy2
= _mm_add_ps(fjy2
,ty
);
1710 fjz2
= _mm_add_ps(fjz2
,tz
);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1721 /* REACTION-FIELD ELECTROSTATICS */
1722 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
1724 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
1728 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1730 /* Calculate temporary vectorial force */
1731 tx
= _mm_mul_ps(fscal
,dx23
);
1732 ty
= _mm_mul_ps(fscal
,dy23
);
1733 tz
= _mm_mul_ps(fscal
,dz23
);
1735 /* Update vectorial force */
1736 fix2
= _mm_add_ps(fix2
,tx
);
1737 fiy2
= _mm_add_ps(fiy2
,ty
);
1738 fiz2
= _mm_add_ps(fiz2
,tz
);
1740 fjx3
= _mm_add_ps(fjx3
,tx
);
1741 fjy3
= _mm_add_ps(fjy3
,ty
);
1742 fjz3
= _mm_add_ps(fjz3
,tz
);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1753 /* REACTION-FIELD ELECTROSTATICS */
1754 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
1756 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
1760 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1762 /* Calculate temporary vectorial force */
1763 tx
= _mm_mul_ps(fscal
,dx31
);
1764 ty
= _mm_mul_ps(fscal
,dy31
);
1765 tz
= _mm_mul_ps(fscal
,dz31
);
1767 /* Update vectorial force */
1768 fix3
= _mm_add_ps(fix3
,tx
);
1769 fiy3
= _mm_add_ps(fiy3
,ty
);
1770 fiz3
= _mm_add_ps(fiz3
,tz
);
1772 fjx1
= _mm_add_ps(fjx1
,tx
);
1773 fjy1
= _mm_add_ps(fjy1
,ty
);
1774 fjz1
= _mm_add_ps(fjz1
,tz
);
1778 /**************************
1779 * CALCULATE INTERACTIONS *
1780 **************************/
1782 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1785 /* REACTION-FIELD ELECTROSTATICS */
1786 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
1788 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
1792 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1794 /* Calculate temporary vectorial force */
1795 tx
= _mm_mul_ps(fscal
,dx32
);
1796 ty
= _mm_mul_ps(fscal
,dy32
);
1797 tz
= _mm_mul_ps(fscal
,dz32
);
1799 /* Update vectorial force */
1800 fix3
= _mm_add_ps(fix3
,tx
);
1801 fiy3
= _mm_add_ps(fiy3
,ty
);
1802 fiz3
= _mm_add_ps(fiz3
,tz
);
1804 fjx2
= _mm_add_ps(fjx2
,tx
);
1805 fjy2
= _mm_add_ps(fjy2
,ty
);
1806 fjz2
= _mm_add_ps(fjz2
,tz
);
1810 /**************************
1811 * CALCULATE INTERACTIONS *
1812 **************************/
1814 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1817 /* REACTION-FIELD ELECTROSTATICS */
1818 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
1820 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
1824 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1826 /* Calculate temporary vectorial force */
1827 tx
= _mm_mul_ps(fscal
,dx33
);
1828 ty
= _mm_mul_ps(fscal
,dy33
);
1829 tz
= _mm_mul_ps(fscal
,dz33
);
1831 /* Update vectorial force */
1832 fix3
= _mm_add_ps(fix3
,tx
);
1833 fiy3
= _mm_add_ps(fiy3
,ty
);
1834 fiz3
= _mm_add_ps(fiz3
,tz
);
1836 fjx3
= _mm_add_ps(fjx3
,tx
);
1837 fjy3
= _mm_add_ps(fjy3
,ty
);
1838 fjz3
= _mm_add_ps(fjz3
,tz
);
1842 fjptrA
= f
+j_coord_offsetA
;
1843 fjptrB
= f
+j_coord_offsetB
;
1844 fjptrC
= f
+j_coord_offsetC
;
1845 fjptrD
= f
+j_coord_offsetD
;
1847 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1848 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1849 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1851 /* Inner loop uses 303 flops */
1854 if(jidx
<j_index_end
)
1857 /* Get j neighbor index, and coordinate index */
1858 jnrlistA
= jjnr
[jidx
];
1859 jnrlistB
= jjnr
[jidx
+1];
1860 jnrlistC
= jjnr
[jidx
+2];
1861 jnrlistD
= jjnr
[jidx
+3];
1862 /* Sign of each element will be negative for non-real atoms.
1863 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1864 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1866 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1867 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1868 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1869 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1870 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1871 j_coord_offsetA
= DIM
*jnrA
;
1872 j_coord_offsetB
= DIM
*jnrB
;
1873 j_coord_offsetC
= DIM
*jnrC
;
1874 j_coord_offsetD
= DIM
*jnrD
;
1876 /* load j atom coordinates */
1877 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1878 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1879 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1880 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1882 /* Calculate displacement vector */
1883 dx00
= _mm_sub_ps(ix0
,jx0
);
1884 dy00
= _mm_sub_ps(iy0
,jy0
);
1885 dz00
= _mm_sub_ps(iz0
,jz0
);
1886 dx11
= _mm_sub_ps(ix1
,jx1
);
1887 dy11
= _mm_sub_ps(iy1
,jy1
);
1888 dz11
= _mm_sub_ps(iz1
,jz1
);
1889 dx12
= _mm_sub_ps(ix1
,jx2
);
1890 dy12
= _mm_sub_ps(iy1
,jy2
);
1891 dz12
= _mm_sub_ps(iz1
,jz2
);
1892 dx13
= _mm_sub_ps(ix1
,jx3
);
1893 dy13
= _mm_sub_ps(iy1
,jy3
);
1894 dz13
= _mm_sub_ps(iz1
,jz3
);
1895 dx21
= _mm_sub_ps(ix2
,jx1
);
1896 dy21
= _mm_sub_ps(iy2
,jy1
);
1897 dz21
= _mm_sub_ps(iz2
,jz1
);
1898 dx22
= _mm_sub_ps(ix2
,jx2
);
1899 dy22
= _mm_sub_ps(iy2
,jy2
);
1900 dz22
= _mm_sub_ps(iz2
,jz2
);
1901 dx23
= _mm_sub_ps(ix2
,jx3
);
1902 dy23
= _mm_sub_ps(iy2
,jy3
);
1903 dz23
= _mm_sub_ps(iz2
,jz3
);
1904 dx31
= _mm_sub_ps(ix3
,jx1
);
1905 dy31
= _mm_sub_ps(iy3
,jy1
);
1906 dz31
= _mm_sub_ps(iz3
,jz1
);
1907 dx32
= _mm_sub_ps(ix3
,jx2
);
1908 dy32
= _mm_sub_ps(iy3
,jy2
);
1909 dz32
= _mm_sub_ps(iz3
,jz2
);
1910 dx33
= _mm_sub_ps(ix3
,jx3
);
1911 dy33
= _mm_sub_ps(iy3
,jy3
);
1912 dz33
= _mm_sub_ps(iz3
,jz3
);
1914 /* Calculate squared distance and things based on it */
1915 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1916 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1917 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1918 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1919 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1920 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1921 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1922 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1923 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1924 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1926 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1927 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1928 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1929 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1930 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1931 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1932 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1933 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1934 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1936 rinvsq00
= gmx_mm_inv_ps(rsq00
);
1937 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1938 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1939 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1940 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1941 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1942 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1943 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1944 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1945 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1947 fjx0
= _mm_setzero_ps();
1948 fjy0
= _mm_setzero_ps();
1949 fjz0
= _mm_setzero_ps();
1950 fjx1
= _mm_setzero_ps();
1951 fjy1
= _mm_setzero_ps();
1952 fjz1
= _mm_setzero_ps();
1953 fjx2
= _mm_setzero_ps();
1954 fjy2
= _mm_setzero_ps();
1955 fjz2
= _mm_setzero_ps();
1956 fjx3
= _mm_setzero_ps();
1957 fjy3
= _mm_setzero_ps();
1958 fjz3
= _mm_setzero_ps();
1960 /**************************
1961 * CALCULATE INTERACTIONS *
1962 **************************/
1964 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1967 /* LENNARD-JONES DISPERSION/REPULSION */
1969 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1970 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1972 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1976 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1978 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1980 /* Calculate temporary vectorial force */
1981 tx
= _mm_mul_ps(fscal
,dx00
);
1982 ty
= _mm_mul_ps(fscal
,dy00
);
1983 tz
= _mm_mul_ps(fscal
,dz00
);
1985 /* Update vectorial force */
1986 fix0
= _mm_add_ps(fix0
,tx
);
1987 fiy0
= _mm_add_ps(fiy0
,ty
);
1988 fiz0
= _mm_add_ps(fiz0
,tz
);
1990 fjx0
= _mm_add_ps(fjx0
,tx
);
1991 fjy0
= _mm_add_ps(fjy0
,ty
);
1992 fjz0
= _mm_add_ps(fjz0
,tz
);
1996 /**************************
1997 * CALCULATE INTERACTIONS *
1998 **************************/
2000 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2003 /* REACTION-FIELD ELECTROSTATICS */
2004 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
2006 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
2010 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2012 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2014 /* Calculate temporary vectorial force */
2015 tx
= _mm_mul_ps(fscal
,dx11
);
2016 ty
= _mm_mul_ps(fscal
,dy11
);
2017 tz
= _mm_mul_ps(fscal
,dz11
);
2019 /* Update vectorial force */
2020 fix1
= _mm_add_ps(fix1
,tx
);
2021 fiy1
= _mm_add_ps(fiy1
,ty
);
2022 fiz1
= _mm_add_ps(fiz1
,tz
);
2024 fjx1
= _mm_add_ps(fjx1
,tx
);
2025 fjy1
= _mm_add_ps(fjy1
,ty
);
2026 fjz1
= _mm_add_ps(fjz1
,tz
);
2030 /**************************
2031 * CALCULATE INTERACTIONS *
2032 **************************/
2034 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2037 /* REACTION-FIELD ELECTROSTATICS */
2038 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
2040 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
2044 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2046 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2048 /* Calculate temporary vectorial force */
2049 tx
= _mm_mul_ps(fscal
,dx12
);
2050 ty
= _mm_mul_ps(fscal
,dy12
);
2051 tz
= _mm_mul_ps(fscal
,dz12
);
2053 /* Update vectorial force */
2054 fix1
= _mm_add_ps(fix1
,tx
);
2055 fiy1
= _mm_add_ps(fiy1
,ty
);
2056 fiz1
= _mm_add_ps(fiz1
,tz
);
2058 fjx2
= _mm_add_ps(fjx2
,tx
);
2059 fjy2
= _mm_add_ps(fjy2
,ty
);
2060 fjz2
= _mm_add_ps(fjz2
,tz
);
2064 /**************************
2065 * CALCULATE INTERACTIONS *
2066 **************************/
2068 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2071 /* REACTION-FIELD ELECTROSTATICS */
2072 felec
= _mm_mul_ps(qq13
,_mm_sub_ps(_mm_mul_ps(rinv13
,rinvsq13
),krf2
));
2074 cutoff_mask
= _mm_cmplt_ps(rsq13
,rcutoff2
);
2078 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2080 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2082 /* Calculate temporary vectorial force */
2083 tx
= _mm_mul_ps(fscal
,dx13
);
2084 ty
= _mm_mul_ps(fscal
,dy13
);
2085 tz
= _mm_mul_ps(fscal
,dz13
);
2087 /* Update vectorial force */
2088 fix1
= _mm_add_ps(fix1
,tx
);
2089 fiy1
= _mm_add_ps(fiy1
,ty
);
2090 fiz1
= _mm_add_ps(fiz1
,tz
);
2092 fjx3
= _mm_add_ps(fjx3
,tx
);
2093 fjy3
= _mm_add_ps(fjy3
,ty
);
2094 fjz3
= _mm_add_ps(fjz3
,tz
);
2098 /**************************
2099 * CALCULATE INTERACTIONS *
2100 **************************/
2102 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2105 /* REACTION-FIELD ELECTROSTATICS */
2106 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
2108 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
2112 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2114 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2116 /* Calculate temporary vectorial force */
2117 tx
= _mm_mul_ps(fscal
,dx21
);
2118 ty
= _mm_mul_ps(fscal
,dy21
);
2119 tz
= _mm_mul_ps(fscal
,dz21
);
2121 /* Update vectorial force */
2122 fix2
= _mm_add_ps(fix2
,tx
);
2123 fiy2
= _mm_add_ps(fiy2
,ty
);
2124 fiz2
= _mm_add_ps(fiz2
,tz
);
2126 fjx1
= _mm_add_ps(fjx1
,tx
);
2127 fjy1
= _mm_add_ps(fjy1
,ty
);
2128 fjz1
= _mm_add_ps(fjz1
,tz
);
2132 /**************************
2133 * CALCULATE INTERACTIONS *
2134 **************************/
2136 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2139 /* REACTION-FIELD ELECTROSTATICS */
2140 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
2142 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2146 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2148 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2150 /* Calculate temporary vectorial force */
2151 tx
= _mm_mul_ps(fscal
,dx22
);
2152 ty
= _mm_mul_ps(fscal
,dy22
);
2153 tz
= _mm_mul_ps(fscal
,dz22
);
2155 /* Update vectorial force */
2156 fix2
= _mm_add_ps(fix2
,tx
);
2157 fiy2
= _mm_add_ps(fiy2
,ty
);
2158 fiz2
= _mm_add_ps(fiz2
,tz
);
2160 fjx2
= _mm_add_ps(fjx2
,tx
);
2161 fjy2
= _mm_add_ps(fjy2
,ty
);
2162 fjz2
= _mm_add_ps(fjz2
,tz
);
2166 /**************************
2167 * CALCULATE INTERACTIONS *
2168 **************************/
2170 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2173 /* REACTION-FIELD ELECTROSTATICS */
2174 felec
= _mm_mul_ps(qq23
,_mm_sub_ps(_mm_mul_ps(rinv23
,rinvsq23
),krf2
));
2176 cutoff_mask
= _mm_cmplt_ps(rsq23
,rcutoff2
);
2180 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2182 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2184 /* Calculate temporary vectorial force */
2185 tx
= _mm_mul_ps(fscal
,dx23
);
2186 ty
= _mm_mul_ps(fscal
,dy23
);
2187 tz
= _mm_mul_ps(fscal
,dz23
);
2189 /* Update vectorial force */
2190 fix2
= _mm_add_ps(fix2
,tx
);
2191 fiy2
= _mm_add_ps(fiy2
,ty
);
2192 fiz2
= _mm_add_ps(fiz2
,tz
);
2194 fjx3
= _mm_add_ps(fjx3
,tx
);
2195 fjy3
= _mm_add_ps(fjy3
,ty
);
2196 fjz3
= _mm_add_ps(fjz3
,tz
);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2207 /* REACTION-FIELD ELECTROSTATICS */
2208 felec
= _mm_mul_ps(qq31
,_mm_sub_ps(_mm_mul_ps(rinv31
,rinvsq31
),krf2
));
2210 cutoff_mask
= _mm_cmplt_ps(rsq31
,rcutoff2
);
2214 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2216 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2218 /* Calculate temporary vectorial force */
2219 tx
= _mm_mul_ps(fscal
,dx31
);
2220 ty
= _mm_mul_ps(fscal
,dy31
);
2221 tz
= _mm_mul_ps(fscal
,dz31
);
2223 /* Update vectorial force */
2224 fix3
= _mm_add_ps(fix3
,tx
);
2225 fiy3
= _mm_add_ps(fiy3
,ty
);
2226 fiz3
= _mm_add_ps(fiz3
,tz
);
2228 fjx1
= _mm_add_ps(fjx1
,tx
);
2229 fjy1
= _mm_add_ps(fjy1
,ty
);
2230 fjz1
= _mm_add_ps(fjz1
,tz
);
2234 /**************************
2235 * CALCULATE INTERACTIONS *
2236 **************************/
2238 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2241 /* REACTION-FIELD ELECTROSTATICS */
2242 felec
= _mm_mul_ps(qq32
,_mm_sub_ps(_mm_mul_ps(rinv32
,rinvsq32
),krf2
));
2244 cutoff_mask
= _mm_cmplt_ps(rsq32
,rcutoff2
);
2248 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2250 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2252 /* Calculate temporary vectorial force */
2253 tx
= _mm_mul_ps(fscal
,dx32
);
2254 ty
= _mm_mul_ps(fscal
,dy32
);
2255 tz
= _mm_mul_ps(fscal
,dz32
);
2257 /* Update vectorial force */
2258 fix3
= _mm_add_ps(fix3
,tx
);
2259 fiy3
= _mm_add_ps(fiy3
,ty
);
2260 fiz3
= _mm_add_ps(fiz3
,tz
);
2262 fjx2
= _mm_add_ps(fjx2
,tx
);
2263 fjy2
= _mm_add_ps(fjy2
,ty
);
2264 fjz2
= _mm_add_ps(fjz2
,tz
);
2268 /**************************
2269 * CALCULATE INTERACTIONS *
2270 **************************/
2272 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2275 /* REACTION-FIELD ELECTROSTATICS */
2276 felec
= _mm_mul_ps(qq33
,_mm_sub_ps(_mm_mul_ps(rinv33
,rinvsq33
),krf2
));
2278 cutoff_mask
= _mm_cmplt_ps(rsq33
,rcutoff2
);
2282 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2284 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2286 /* Calculate temporary vectorial force */
2287 tx
= _mm_mul_ps(fscal
,dx33
);
2288 ty
= _mm_mul_ps(fscal
,dy33
);
2289 tz
= _mm_mul_ps(fscal
,dz33
);
2291 /* Update vectorial force */
2292 fix3
= _mm_add_ps(fix3
,tx
);
2293 fiy3
= _mm_add_ps(fiy3
,ty
);
2294 fiz3
= _mm_add_ps(fiz3
,tz
);
2296 fjx3
= _mm_add_ps(fjx3
,tx
);
2297 fjy3
= _mm_add_ps(fjy3
,ty
);
2298 fjz3
= _mm_add_ps(fjz3
,tz
);
2302 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2303 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2304 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2305 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2307 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2308 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2309 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2311 /* Inner loop uses 303 flops */
2314 /* End of innermost loop */
2316 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2317 f
+i_coord_offset
,fshift
+i_shift_offset
);
2319 /* Increment number of inner iterations */
2320 inneriter
+= j_index_end
- j_index_start
;
2322 /* Outer loop uses 24 flops */
2325 /* Increment number of outer iterations */
2328 /* Update outer/inner flops */
2330 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*303);