2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
84 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
86 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
88 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
90 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
92 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
94 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
96 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
97 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
98 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
107 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
110 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
113 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
115 __m128 rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
116 real rswitch_scalar
,d_scalar
;
117 __m128 dummy_mask
,cutoff_mask
;
118 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one
= _mm_set1_ps(1.0);
120 __m128 two
= _mm_set1_ps(2.0);
126 jindex
= nlist
->jindex
;
128 shiftidx
= nlist
->shift
;
130 shiftvec
= fr
->shift_vec
[0];
131 fshift
= fr
->fshift
[0];
132 facel
= _mm_set1_ps(fr
->epsfac
);
133 charge
= mdatoms
->chargeA
;
134 nvdwtype
= fr
->ntype
;
136 vdwtype
= mdatoms
->typeA
;
138 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
139 ewtab
= fr
->ic
->tabq_coul_FDV0
;
140 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
141 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
143 /* Setup water-specific parameters */
144 inr
= nlist
->iinr
[0];
145 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
146 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
147 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
148 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
150 jq0
= _mm_set1_ps(charge
[inr
+0]);
151 jq1
= _mm_set1_ps(charge
[inr
+1]);
152 jq2
= _mm_set1_ps(charge
[inr
+2]);
153 vdwjidx0A
= 2*vdwtype
[inr
+0];
154 qq00
= _mm_mul_ps(iq0
,jq0
);
155 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
156 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
157 qq01
= _mm_mul_ps(iq0
,jq1
);
158 qq02
= _mm_mul_ps(iq0
,jq2
);
159 qq10
= _mm_mul_ps(iq1
,jq0
);
160 qq11
= _mm_mul_ps(iq1
,jq1
);
161 qq12
= _mm_mul_ps(iq1
,jq2
);
162 qq20
= _mm_mul_ps(iq2
,jq0
);
163 qq21
= _mm_mul_ps(iq2
,jq1
);
164 qq22
= _mm_mul_ps(iq2
,jq2
);
166 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
167 rcutoff_scalar
= fr
->rcoulomb
;
168 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
169 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
171 rswitch_scalar
= fr
->rcoulomb_switch
;
172 rswitch
= _mm_set1_ps(rswitch_scalar
);
173 /* Setup switch parameters */
174 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
175 d
= _mm_set1_ps(d_scalar
);
176 swV3
= _mm_set1_ps(-10.0/(d_scalar
*d_scalar
*d_scalar
));
177 swV4
= _mm_set1_ps( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
178 swV5
= _mm_set1_ps( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
179 swF2
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
));
180 swF3
= _mm_set1_ps( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
181 swF4
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
183 /* Avoid stupid compiler warnings */
184 jnrA
= jnrB
= jnrC
= jnrD
= 0;
193 for(iidx
=0;iidx
<4*DIM
;iidx
++)
198 /* Start outer loop over neighborlists */
199 for(iidx
=0; iidx
<nri
; iidx
++)
201 /* Load shift vector for this list */
202 i_shift_offset
= DIM
*shiftidx
[iidx
];
204 /* Load limits for loop over neighbors */
205 j_index_start
= jindex
[iidx
];
206 j_index_end
= jindex
[iidx
+1];
208 /* Get outer coordinate index */
210 i_coord_offset
= DIM
*inr
;
212 /* Load i particle coords and add shift vector */
213 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
214 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
216 fix0
= _mm_setzero_ps();
217 fiy0
= _mm_setzero_ps();
218 fiz0
= _mm_setzero_ps();
219 fix1
= _mm_setzero_ps();
220 fiy1
= _mm_setzero_ps();
221 fiz1
= _mm_setzero_ps();
222 fix2
= _mm_setzero_ps();
223 fiy2
= _mm_setzero_ps();
224 fiz2
= _mm_setzero_ps();
226 /* Reset potential sums */
227 velecsum
= _mm_setzero_ps();
228 vvdwsum
= _mm_setzero_ps();
230 /* Start inner kernel loop */
231 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
234 /* Get j neighbor index, and coordinate index */
239 j_coord_offsetA
= DIM
*jnrA
;
240 j_coord_offsetB
= DIM
*jnrB
;
241 j_coord_offsetC
= DIM
*jnrC
;
242 j_coord_offsetD
= DIM
*jnrD
;
244 /* load j atom coordinates */
245 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
246 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
247 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
249 /* Calculate displacement vector */
250 dx00
= _mm_sub_ps(ix0
,jx0
);
251 dy00
= _mm_sub_ps(iy0
,jy0
);
252 dz00
= _mm_sub_ps(iz0
,jz0
);
253 dx01
= _mm_sub_ps(ix0
,jx1
);
254 dy01
= _mm_sub_ps(iy0
,jy1
);
255 dz01
= _mm_sub_ps(iz0
,jz1
);
256 dx02
= _mm_sub_ps(ix0
,jx2
);
257 dy02
= _mm_sub_ps(iy0
,jy2
);
258 dz02
= _mm_sub_ps(iz0
,jz2
);
259 dx10
= _mm_sub_ps(ix1
,jx0
);
260 dy10
= _mm_sub_ps(iy1
,jy0
);
261 dz10
= _mm_sub_ps(iz1
,jz0
);
262 dx11
= _mm_sub_ps(ix1
,jx1
);
263 dy11
= _mm_sub_ps(iy1
,jy1
);
264 dz11
= _mm_sub_ps(iz1
,jz1
);
265 dx12
= _mm_sub_ps(ix1
,jx2
);
266 dy12
= _mm_sub_ps(iy1
,jy2
);
267 dz12
= _mm_sub_ps(iz1
,jz2
);
268 dx20
= _mm_sub_ps(ix2
,jx0
);
269 dy20
= _mm_sub_ps(iy2
,jy0
);
270 dz20
= _mm_sub_ps(iz2
,jz0
);
271 dx21
= _mm_sub_ps(ix2
,jx1
);
272 dy21
= _mm_sub_ps(iy2
,jy1
);
273 dz21
= _mm_sub_ps(iz2
,jz1
);
274 dx22
= _mm_sub_ps(ix2
,jx2
);
275 dy22
= _mm_sub_ps(iy2
,jy2
);
276 dz22
= _mm_sub_ps(iz2
,jz2
);
278 /* Calculate squared distance and things based on it */
279 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
280 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
281 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
282 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
283 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
284 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
285 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
286 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
287 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
289 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
290 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
291 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
292 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
293 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
294 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
295 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
296 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
297 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
299 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
300 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
301 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
302 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
303 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
304 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
305 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
306 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
307 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
309 fjx0
= _mm_setzero_ps();
310 fjy0
= _mm_setzero_ps();
311 fjz0
= _mm_setzero_ps();
312 fjx1
= _mm_setzero_ps();
313 fjy1
= _mm_setzero_ps();
314 fjz1
= _mm_setzero_ps();
315 fjx2
= _mm_setzero_ps();
316 fjy2
= _mm_setzero_ps();
317 fjz2
= _mm_setzero_ps();
319 /**************************
320 * CALCULATE INTERACTIONS *
321 **************************/
323 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
326 r00
= _mm_mul_ps(rsq00
,rinv00
);
328 /* EWALD ELECTROSTATICS */
330 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
331 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
332 ewitab
= _mm_cvttps_epi32(ewrt
);
333 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
334 ewitab
= _mm_slli_epi32(ewitab
,2);
335 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
336 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
337 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
338 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
339 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
340 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
341 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
342 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
343 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
345 /* LENNARD-JONES DISPERSION/REPULSION */
347 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
348 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
349 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
350 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
351 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
353 d
= _mm_sub_ps(r00
,rswitch
);
354 d
= _mm_max_ps(d
,_mm_setzero_ps());
355 d2
= _mm_mul_ps(d
,d
);
356 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
358 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
360 /* Evaluate switch function */
361 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
362 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
363 fvdw
= _mm_sub_ps( _mm_mul_ps(fvdw
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
364 velec
= _mm_mul_ps(velec
,sw
);
365 vvdw
= _mm_mul_ps(vvdw
,sw
);
366 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 velec
= _mm_and_ps(velec
,cutoff_mask
);
370 velecsum
= _mm_add_ps(velecsum
,velec
);
371 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
372 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
374 fscal
= _mm_add_ps(felec
,fvdw
);
376 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
378 /* Calculate temporary vectorial force */
379 tx
= _mm_mul_ps(fscal
,dx00
);
380 ty
= _mm_mul_ps(fscal
,dy00
);
381 tz
= _mm_mul_ps(fscal
,dz00
);
383 /* Update vectorial force */
384 fix0
= _mm_add_ps(fix0
,tx
);
385 fiy0
= _mm_add_ps(fiy0
,ty
);
386 fiz0
= _mm_add_ps(fiz0
,tz
);
388 fjx0
= _mm_add_ps(fjx0
,tx
);
389 fjy0
= _mm_add_ps(fjy0
,ty
);
390 fjz0
= _mm_add_ps(fjz0
,tz
);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
401 r01
= _mm_mul_ps(rsq01
,rinv01
);
403 /* EWALD ELECTROSTATICS */
405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
406 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
407 ewitab
= _mm_cvttps_epi32(ewrt
);
408 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
409 ewitab
= _mm_slli_epi32(ewitab
,2);
410 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
411 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
412 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
413 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
414 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
415 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
416 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
417 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(rinv01
,velec
));
418 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
420 d
= _mm_sub_ps(r01
,rswitch
);
421 d
= _mm_max_ps(d
,_mm_setzero_ps());
422 d2
= _mm_mul_ps(d
,d
);
423 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
425 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
427 /* Evaluate switch function */
428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
429 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv01
,_mm_mul_ps(velec
,dsw
)) );
430 velec
= _mm_mul_ps(velec
,sw
);
431 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velec
= _mm_and_ps(velec
,cutoff_mask
);
435 velecsum
= _mm_add_ps(velecsum
,velec
);
439 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
441 /* Calculate temporary vectorial force */
442 tx
= _mm_mul_ps(fscal
,dx01
);
443 ty
= _mm_mul_ps(fscal
,dy01
);
444 tz
= _mm_mul_ps(fscal
,dz01
);
446 /* Update vectorial force */
447 fix0
= _mm_add_ps(fix0
,tx
);
448 fiy0
= _mm_add_ps(fiy0
,ty
);
449 fiz0
= _mm_add_ps(fiz0
,tz
);
451 fjx1
= _mm_add_ps(fjx1
,tx
);
452 fjy1
= _mm_add_ps(fjy1
,ty
);
453 fjz1
= _mm_add_ps(fjz1
,tz
);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
464 r02
= _mm_mul_ps(rsq02
,rinv02
);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
470 ewitab
= _mm_cvttps_epi32(ewrt
);
471 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
472 ewitab
= _mm_slli_epi32(ewitab
,2);
473 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
474 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
475 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
476 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
477 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
478 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
479 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
480 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(rinv02
,velec
));
481 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
483 d
= _mm_sub_ps(r02
,rswitch
);
484 d
= _mm_max_ps(d
,_mm_setzero_ps());
485 d2
= _mm_mul_ps(d
,d
);
486 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
488 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
490 /* Evaluate switch function */
491 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
492 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv02
,_mm_mul_ps(velec
,dsw
)) );
493 velec
= _mm_mul_ps(velec
,sw
);
494 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velec
= _mm_and_ps(velec
,cutoff_mask
);
498 velecsum
= _mm_add_ps(velecsum
,velec
);
502 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
504 /* Calculate temporary vectorial force */
505 tx
= _mm_mul_ps(fscal
,dx02
);
506 ty
= _mm_mul_ps(fscal
,dy02
);
507 tz
= _mm_mul_ps(fscal
,dz02
);
509 /* Update vectorial force */
510 fix0
= _mm_add_ps(fix0
,tx
);
511 fiy0
= _mm_add_ps(fiy0
,ty
);
512 fiz0
= _mm_add_ps(fiz0
,tz
);
514 fjx2
= _mm_add_ps(fjx2
,tx
);
515 fjy2
= _mm_add_ps(fjy2
,ty
);
516 fjz2
= _mm_add_ps(fjz2
,tz
);
520 /**************************
521 * CALCULATE INTERACTIONS *
522 **************************/
524 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
527 r10
= _mm_mul_ps(rsq10
,rinv10
);
529 /* EWALD ELECTROSTATICS */
531 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
532 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
533 ewitab
= _mm_cvttps_epi32(ewrt
);
534 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
535 ewitab
= _mm_slli_epi32(ewitab
,2);
536 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
537 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
538 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
539 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
540 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
541 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
542 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
543 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
544 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
546 d
= _mm_sub_ps(r10
,rswitch
);
547 d
= _mm_max_ps(d
,_mm_setzero_ps());
548 d2
= _mm_mul_ps(d
,d
);
549 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
551 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
553 /* Evaluate switch function */
554 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
555 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
556 velec
= _mm_mul_ps(velec
,sw
);
557 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velec
= _mm_and_ps(velec
,cutoff_mask
);
561 velecsum
= _mm_add_ps(velecsum
,velec
);
565 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
567 /* Calculate temporary vectorial force */
568 tx
= _mm_mul_ps(fscal
,dx10
);
569 ty
= _mm_mul_ps(fscal
,dy10
);
570 tz
= _mm_mul_ps(fscal
,dz10
);
572 /* Update vectorial force */
573 fix1
= _mm_add_ps(fix1
,tx
);
574 fiy1
= _mm_add_ps(fiy1
,ty
);
575 fiz1
= _mm_add_ps(fiz1
,tz
);
577 fjx0
= _mm_add_ps(fjx0
,tx
);
578 fjy0
= _mm_add_ps(fjy0
,ty
);
579 fjz0
= _mm_add_ps(fjz0
,tz
);
583 /**************************
584 * CALCULATE INTERACTIONS *
585 **************************/
587 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
590 r11
= _mm_mul_ps(rsq11
,rinv11
);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
596 ewitab
= _mm_cvttps_epi32(ewrt
);
597 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
598 ewitab
= _mm_slli_epi32(ewitab
,2);
599 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
600 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
601 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
602 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
603 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
604 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
605 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
606 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(rinv11
,velec
));
607 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
609 d
= _mm_sub_ps(r11
,rswitch
);
610 d
= _mm_max_ps(d
,_mm_setzero_ps());
611 d2
= _mm_mul_ps(d
,d
);
612 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
614 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
616 /* Evaluate switch function */
617 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
618 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv11
,_mm_mul_ps(velec
,dsw
)) );
619 velec
= _mm_mul_ps(velec
,sw
);
620 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec
= _mm_and_ps(velec
,cutoff_mask
);
624 velecsum
= _mm_add_ps(velecsum
,velec
);
628 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
630 /* Calculate temporary vectorial force */
631 tx
= _mm_mul_ps(fscal
,dx11
);
632 ty
= _mm_mul_ps(fscal
,dy11
);
633 tz
= _mm_mul_ps(fscal
,dz11
);
635 /* Update vectorial force */
636 fix1
= _mm_add_ps(fix1
,tx
);
637 fiy1
= _mm_add_ps(fiy1
,ty
);
638 fiz1
= _mm_add_ps(fiz1
,tz
);
640 fjx1
= _mm_add_ps(fjx1
,tx
);
641 fjy1
= _mm_add_ps(fjy1
,ty
);
642 fjz1
= _mm_add_ps(fjz1
,tz
);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
653 r12
= _mm_mul_ps(rsq12
,rinv12
);
655 /* EWALD ELECTROSTATICS */
657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
658 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
659 ewitab
= _mm_cvttps_epi32(ewrt
);
660 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
661 ewitab
= _mm_slli_epi32(ewitab
,2);
662 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
663 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
664 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
665 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
666 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
667 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
668 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
669 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(rinv12
,velec
));
670 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
672 d
= _mm_sub_ps(r12
,rswitch
);
673 d
= _mm_max_ps(d
,_mm_setzero_ps());
674 d2
= _mm_mul_ps(d
,d
);
675 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
677 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
679 /* Evaluate switch function */
680 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
681 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv12
,_mm_mul_ps(velec
,dsw
)) );
682 velec
= _mm_mul_ps(velec
,sw
);
683 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
685 /* Update potential sum for this i atom from the interaction with this j atom. */
686 velec
= _mm_and_ps(velec
,cutoff_mask
);
687 velecsum
= _mm_add_ps(velecsum
,velec
);
691 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
693 /* Calculate temporary vectorial force */
694 tx
= _mm_mul_ps(fscal
,dx12
);
695 ty
= _mm_mul_ps(fscal
,dy12
);
696 tz
= _mm_mul_ps(fscal
,dz12
);
698 /* Update vectorial force */
699 fix1
= _mm_add_ps(fix1
,tx
);
700 fiy1
= _mm_add_ps(fiy1
,ty
);
701 fiz1
= _mm_add_ps(fiz1
,tz
);
703 fjx2
= _mm_add_ps(fjx2
,tx
);
704 fjy2
= _mm_add_ps(fjy2
,ty
);
705 fjz2
= _mm_add_ps(fjz2
,tz
);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
716 r20
= _mm_mul_ps(rsq20
,rinv20
);
718 /* EWALD ELECTROSTATICS */
720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
722 ewitab
= _mm_cvttps_epi32(ewrt
);
723 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
724 ewitab
= _mm_slli_epi32(ewitab
,2);
725 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
726 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
727 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
728 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
729 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
730 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
731 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
732 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
733 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
735 d
= _mm_sub_ps(r20
,rswitch
);
736 d
= _mm_max_ps(d
,_mm_setzero_ps());
737 d2
= _mm_mul_ps(d
,d
);
738 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
740 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
742 /* Evaluate switch function */
743 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
744 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
745 velec
= _mm_mul_ps(velec
,sw
);
746 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
748 /* Update potential sum for this i atom from the interaction with this j atom. */
749 velec
= _mm_and_ps(velec
,cutoff_mask
);
750 velecsum
= _mm_add_ps(velecsum
,velec
);
754 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
756 /* Calculate temporary vectorial force */
757 tx
= _mm_mul_ps(fscal
,dx20
);
758 ty
= _mm_mul_ps(fscal
,dy20
);
759 tz
= _mm_mul_ps(fscal
,dz20
);
761 /* Update vectorial force */
762 fix2
= _mm_add_ps(fix2
,tx
);
763 fiy2
= _mm_add_ps(fiy2
,ty
);
764 fiz2
= _mm_add_ps(fiz2
,tz
);
766 fjx0
= _mm_add_ps(fjx0
,tx
);
767 fjy0
= _mm_add_ps(fjy0
,ty
);
768 fjz0
= _mm_add_ps(fjz0
,tz
);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
779 r21
= _mm_mul_ps(rsq21
,rinv21
);
781 /* EWALD ELECTROSTATICS */
783 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
784 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
785 ewitab
= _mm_cvttps_epi32(ewrt
);
786 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
787 ewitab
= _mm_slli_epi32(ewitab
,2);
788 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
789 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
790 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
791 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
792 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
793 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
794 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
795 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(rinv21
,velec
));
796 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
798 d
= _mm_sub_ps(r21
,rswitch
);
799 d
= _mm_max_ps(d
,_mm_setzero_ps());
800 d2
= _mm_mul_ps(d
,d
);
801 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
803 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
805 /* Evaluate switch function */
806 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
807 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv21
,_mm_mul_ps(velec
,dsw
)) );
808 velec
= _mm_mul_ps(velec
,sw
);
809 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
811 /* Update potential sum for this i atom from the interaction with this j atom. */
812 velec
= _mm_and_ps(velec
,cutoff_mask
);
813 velecsum
= _mm_add_ps(velecsum
,velec
);
817 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
819 /* Calculate temporary vectorial force */
820 tx
= _mm_mul_ps(fscal
,dx21
);
821 ty
= _mm_mul_ps(fscal
,dy21
);
822 tz
= _mm_mul_ps(fscal
,dz21
);
824 /* Update vectorial force */
825 fix2
= _mm_add_ps(fix2
,tx
);
826 fiy2
= _mm_add_ps(fiy2
,ty
);
827 fiz2
= _mm_add_ps(fiz2
,tz
);
829 fjx1
= _mm_add_ps(fjx1
,tx
);
830 fjy1
= _mm_add_ps(fjy1
,ty
);
831 fjz1
= _mm_add_ps(fjz1
,tz
);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
842 r22
= _mm_mul_ps(rsq22
,rinv22
);
844 /* EWALD ELECTROSTATICS */
846 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
847 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
848 ewitab
= _mm_cvttps_epi32(ewrt
);
849 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
850 ewitab
= _mm_slli_epi32(ewitab
,2);
851 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
852 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
853 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
854 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
855 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
856 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
857 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
858 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(rinv22
,velec
));
859 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
861 d
= _mm_sub_ps(r22
,rswitch
);
862 d
= _mm_max_ps(d
,_mm_setzero_ps());
863 d2
= _mm_mul_ps(d
,d
);
864 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
866 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
868 /* Evaluate switch function */
869 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
870 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv22
,_mm_mul_ps(velec
,dsw
)) );
871 velec
= _mm_mul_ps(velec
,sw
);
872 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec
= _mm_and_ps(velec
,cutoff_mask
);
876 velecsum
= _mm_add_ps(velecsum
,velec
);
880 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
882 /* Calculate temporary vectorial force */
883 tx
= _mm_mul_ps(fscal
,dx22
);
884 ty
= _mm_mul_ps(fscal
,dy22
);
885 tz
= _mm_mul_ps(fscal
,dz22
);
887 /* Update vectorial force */
888 fix2
= _mm_add_ps(fix2
,tx
);
889 fiy2
= _mm_add_ps(fiy2
,ty
);
890 fiz2
= _mm_add_ps(fiz2
,tz
);
892 fjx2
= _mm_add_ps(fjx2
,tx
);
893 fjy2
= _mm_add_ps(fjy2
,ty
);
894 fjz2
= _mm_add_ps(fjz2
,tz
);
898 fjptrA
= f
+j_coord_offsetA
;
899 fjptrB
= f
+j_coord_offsetB
;
900 fjptrC
= f
+j_coord_offsetC
;
901 fjptrD
= f
+j_coord_offsetD
;
903 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
904 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
906 /* Inner loop uses 603 flops */
912 /* Get j neighbor index, and coordinate index */
913 jnrlistA
= jjnr
[jidx
];
914 jnrlistB
= jjnr
[jidx
+1];
915 jnrlistC
= jjnr
[jidx
+2];
916 jnrlistD
= jjnr
[jidx
+3];
917 /* Sign of each element will be negative for non-real atoms.
918 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
919 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
921 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
922 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
923 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
924 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
925 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
926 j_coord_offsetA
= DIM
*jnrA
;
927 j_coord_offsetB
= DIM
*jnrB
;
928 j_coord_offsetC
= DIM
*jnrC
;
929 j_coord_offsetD
= DIM
*jnrD
;
931 /* load j atom coordinates */
932 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
933 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
934 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
936 /* Calculate displacement vector */
937 dx00
= _mm_sub_ps(ix0
,jx0
);
938 dy00
= _mm_sub_ps(iy0
,jy0
);
939 dz00
= _mm_sub_ps(iz0
,jz0
);
940 dx01
= _mm_sub_ps(ix0
,jx1
);
941 dy01
= _mm_sub_ps(iy0
,jy1
);
942 dz01
= _mm_sub_ps(iz0
,jz1
);
943 dx02
= _mm_sub_ps(ix0
,jx2
);
944 dy02
= _mm_sub_ps(iy0
,jy2
);
945 dz02
= _mm_sub_ps(iz0
,jz2
);
946 dx10
= _mm_sub_ps(ix1
,jx0
);
947 dy10
= _mm_sub_ps(iy1
,jy0
);
948 dz10
= _mm_sub_ps(iz1
,jz0
);
949 dx11
= _mm_sub_ps(ix1
,jx1
);
950 dy11
= _mm_sub_ps(iy1
,jy1
);
951 dz11
= _mm_sub_ps(iz1
,jz1
);
952 dx12
= _mm_sub_ps(ix1
,jx2
);
953 dy12
= _mm_sub_ps(iy1
,jy2
);
954 dz12
= _mm_sub_ps(iz1
,jz2
);
955 dx20
= _mm_sub_ps(ix2
,jx0
);
956 dy20
= _mm_sub_ps(iy2
,jy0
);
957 dz20
= _mm_sub_ps(iz2
,jz0
);
958 dx21
= _mm_sub_ps(ix2
,jx1
);
959 dy21
= _mm_sub_ps(iy2
,jy1
);
960 dz21
= _mm_sub_ps(iz2
,jz1
);
961 dx22
= _mm_sub_ps(ix2
,jx2
);
962 dy22
= _mm_sub_ps(iy2
,jy2
);
963 dz22
= _mm_sub_ps(iz2
,jz2
);
965 /* Calculate squared distance and things based on it */
966 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
967 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
968 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
969 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
970 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
971 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
972 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
973 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
974 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
976 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
977 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
978 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
979 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
980 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
981 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
982 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
983 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
984 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
986 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
987 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
988 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
989 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
990 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
991 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
992 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
993 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
994 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
996 fjx0
= _mm_setzero_ps();
997 fjy0
= _mm_setzero_ps();
998 fjz0
= _mm_setzero_ps();
999 fjx1
= _mm_setzero_ps();
1000 fjy1
= _mm_setzero_ps();
1001 fjz1
= _mm_setzero_ps();
1002 fjx2
= _mm_setzero_ps();
1003 fjy2
= _mm_setzero_ps();
1004 fjz2
= _mm_setzero_ps();
1006 /**************************
1007 * CALCULATE INTERACTIONS *
1008 **************************/
1010 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1013 r00
= _mm_mul_ps(rsq00
,rinv00
);
1014 r00
= _mm_andnot_ps(dummy_mask
,r00
);
1016 /* EWALD ELECTROSTATICS */
1018 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1019 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1020 ewitab
= _mm_cvttps_epi32(ewrt
);
1021 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1022 ewitab
= _mm_slli_epi32(ewitab
,2);
1023 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1024 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1025 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1026 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1027 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1028 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1029 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1030 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
1031 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1033 /* LENNARD-JONES DISPERSION/REPULSION */
1035 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1036 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
1037 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
1038 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
1039 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
1041 d
= _mm_sub_ps(r00
,rswitch
);
1042 d
= _mm_max_ps(d
,_mm_setzero_ps());
1043 d2
= _mm_mul_ps(d
,d
);
1044 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1046 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1048 /* Evaluate switch function */
1049 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1050 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
1051 fvdw
= _mm_sub_ps( _mm_mul_ps(fvdw
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
1052 velec
= _mm_mul_ps(velec
,sw
);
1053 vvdw
= _mm_mul_ps(vvdw
,sw
);
1054 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1056 /* Update potential sum for this i atom from the interaction with this j atom. */
1057 velec
= _mm_and_ps(velec
,cutoff_mask
);
1058 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1059 velecsum
= _mm_add_ps(velecsum
,velec
);
1060 vvdw
= _mm_and_ps(vvdw
,cutoff_mask
);
1061 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
1062 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
1064 fscal
= _mm_add_ps(felec
,fvdw
);
1066 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1068 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1070 /* Calculate temporary vectorial force */
1071 tx
= _mm_mul_ps(fscal
,dx00
);
1072 ty
= _mm_mul_ps(fscal
,dy00
);
1073 tz
= _mm_mul_ps(fscal
,dz00
);
1075 /* Update vectorial force */
1076 fix0
= _mm_add_ps(fix0
,tx
);
1077 fiy0
= _mm_add_ps(fiy0
,ty
);
1078 fiz0
= _mm_add_ps(fiz0
,tz
);
1080 fjx0
= _mm_add_ps(fjx0
,tx
);
1081 fjy0
= _mm_add_ps(fjy0
,ty
);
1082 fjz0
= _mm_add_ps(fjz0
,tz
);
1086 /**************************
1087 * CALCULATE INTERACTIONS *
1088 **************************/
1090 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1093 r01
= _mm_mul_ps(rsq01
,rinv01
);
1094 r01
= _mm_andnot_ps(dummy_mask
,r01
);
1096 /* EWALD ELECTROSTATICS */
1098 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1099 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
1100 ewitab
= _mm_cvttps_epi32(ewrt
);
1101 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1102 ewitab
= _mm_slli_epi32(ewitab
,2);
1103 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1104 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1105 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1106 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1107 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1108 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1109 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1110 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(rinv01
,velec
));
1111 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
1113 d
= _mm_sub_ps(r01
,rswitch
);
1114 d
= _mm_max_ps(d
,_mm_setzero_ps());
1115 d2
= _mm_mul_ps(d
,d
);
1116 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1118 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1120 /* Evaluate switch function */
1121 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1122 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv01
,_mm_mul_ps(velec
,dsw
)) );
1123 velec
= _mm_mul_ps(velec
,sw
);
1124 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
1126 /* Update potential sum for this i atom from the interaction with this j atom. */
1127 velec
= _mm_and_ps(velec
,cutoff_mask
);
1128 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1129 velecsum
= _mm_add_ps(velecsum
,velec
);
1133 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1135 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1137 /* Calculate temporary vectorial force */
1138 tx
= _mm_mul_ps(fscal
,dx01
);
1139 ty
= _mm_mul_ps(fscal
,dy01
);
1140 tz
= _mm_mul_ps(fscal
,dz01
);
1142 /* Update vectorial force */
1143 fix0
= _mm_add_ps(fix0
,tx
);
1144 fiy0
= _mm_add_ps(fiy0
,ty
);
1145 fiz0
= _mm_add_ps(fiz0
,tz
);
1147 fjx1
= _mm_add_ps(fjx1
,tx
);
1148 fjy1
= _mm_add_ps(fjy1
,ty
);
1149 fjz1
= _mm_add_ps(fjz1
,tz
);
1153 /**************************
1154 * CALCULATE INTERACTIONS *
1155 **************************/
1157 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1160 r02
= _mm_mul_ps(rsq02
,rinv02
);
1161 r02
= _mm_andnot_ps(dummy_mask
,r02
);
1163 /* EWALD ELECTROSTATICS */
1165 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1166 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
1167 ewitab
= _mm_cvttps_epi32(ewrt
);
1168 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1169 ewitab
= _mm_slli_epi32(ewitab
,2);
1170 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1171 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1172 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1173 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1174 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1175 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1176 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1177 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(rinv02
,velec
));
1178 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
1180 d
= _mm_sub_ps(r02
,rswitch
);
1181 d
= _mm_max_ps(d
,_mm_setzero_ps());
1182 d2
= _mm_mul_ps(d
,d
);
1183 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1185 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1187 /* Evaluate switch function */
1188 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1189 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv02
,_mm_mul_ps(velec
,dsw
)) );
1190 velec
= _mm_mul_ps(velec
,sw
);
1191 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
1193 /* Update potential sum for this i atom from the interaction with this j atom. */
1194 velec
= _mm_and_ps(velec
,cutoff_mask
);
1195 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1196 velecsum
= _mm_add_ps(velecsum
,velec
);
1200 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1202 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1204 /* Calculate temporary vectorial force */
1205 tx
= _mm_mul_ps(fscal
,dx02
);
1206 ty
= _mm_mul_ps(fscal
,dy02
);
1207 tz
= _mm_mul_ps(fscal
,dz02
);
1209 /* Update vectorial force */
1210 fix0
= _mm_add_ps(fix0
,tx
);
1211 fiy0
= _mm_add_ps(fiy0
,ty
);
1212 fiz0
= _mm_add_ps(fiz0
,tz
);
1214 fjx2
= _mm_add_ps(fjx2
,tx
);
1215 fjy2
= _mm_add_ps(fjy2
,ty
);
1216 fjz2
= _mm_add_ps(fjz2
,tz
);
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1227 r10
= _mm_mul_ps(rsq10
,rinv10
);
1228 r10
= _mm_andnot_ps(dummy_mask
,r10
);
1230 /* EWALD ELECTROSTATICS */
1232 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1233 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
1234 ewitab
= _mm_cvttps_epi32(ewrt
);
1235 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1236 ewitab
= _mm_slli_epi32(ewitab
,2);
1237 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1238 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1239 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1240 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1241 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1242 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1243 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1244 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
1245 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
1247 d
= _mm_sub_ps(r10
,rswitch
);
1248 d
= _mm_max_ps(d
,_mm_setzero_ps());
1249 d2
= _mm_mul_ps(d
,d
);
1250 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1252 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1254 /* Evaluate switch function */
1255 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1256 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
1257 velec
= _mm_mul_ps(velec
,sw
);
1258 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
1260 /* Update potential sum for this i atom from the interaction with this j atom. */
1261 velec
= _mm_and_ps(velec
,cutoff_mask
);
1262 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1263 velecsum
= _mm_add_ps(velecsum
,velec
);
1267 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1269 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1271 /* Calculate temporary vectorial force */
1272 tx
= _mm_mul_ps(fscal
,dx10
);
1273 ty
= _mm_mul_ps(fscal
,dy10
);
1274 tz
= _mm_mul_ps(fscal
,dz10
);
1276 /* Update vectorial force */
1277 fix1
= _mm_add_ps(fix1
,tx
);
1278 fiy1
= _mm_add_ps(fiy1
,ty
);
1279 fiz1
= _mm_add_ps(fiz1
,tz
);
1281 fjx0
= _mm_add_ps(fjx0
,tx
);
1282 fjy0
= _mm_add_ps(fjy0
,ty
);
1283 fjz0
= _mm_add_ps(fjz0
,tz
);
1287 /**************************
1288 * CALCULATE INTERACTIONS *
1289 **************************/
1291 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1294 r11
= _mm_mul_ps(rsq11
,rinv11
);
1295 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1297 /* EWALD ELECTROSTATICS */
1299 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1300 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
1301 ewitab
= _mm_cvttps_epi32(ewrt
);
1302 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1303 ewitab
= _mm_slli_epi32(ewitab
,2);
1304 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1305 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1306 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1307 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1308 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1309 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1310 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1311 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(rinv11
,velec
));
1312 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
1314 d
= _mm_sub_ps(r11
,rswitch
);
1315 d
= _mm_max_ps(d
,_mm_setzero_ps());
1316 d2
= _mm_mul_ps(d
,d
);
1317 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1319 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1321 /* Evaluate switch function */
1322 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1323 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv11
,_mm_mul_ps(velec
,dsw
)) );
1324 velec
= _mm_mul_ps(velec
,sw
);
1325 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
1327 /* Update potential sum for this i atom from the interaction with this j atom. */
1328 velec
= _mm_and_ps(velec
,cutoff_mask
);
1329 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1330 velecsum
= _mm_add_ps(velecsum
,velec
);
1334 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1336 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1338 /* Calculate temporary vectorial force */
1339 tx
= _mm_mul_ps(fscal
,dx11
);
1340 ty
= _mm_mul_ps(fscal
,dy11
);
1341 tz
= _mm_mul_ps(fscal
,dz11
);
1343 /* Update vectorial force */
1344 fix1
= _mm_add_ps(fix1
,tx
);
1345 fiy1
= _mm_add_ps(fiy1
,ty
);
1346 fiz1
= _mm_add_ps(fiz1
,tz
);
1348 fjx1
= _mm_add_ps(fjx1
,tx
);
1349 fjy1
= _mm_add_ps(fjy1
,ty
);
1350 fjz1
= _mm_add_ps(fjz1
,tz
);
1354 /**************************
1355 * CALCULATE INTERACTIONS *
1356 **************************/
1358 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1361 r12
= _mm_mul_ps(rsq12
,rinv12
);
1362 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1364 /* EWALD ELECTROSTATICS */
1366 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1367 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
1368 ewitab
= _mm_cvttps_epi32(ewrt
);
1369 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1370 ewitab
= _mm_slli_epi32(ewitab
,2);
1371 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1372 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1373 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1374 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1375 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1376 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1377 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1378 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(rinv12
,velec
));
1379 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
1381 d
= _mm_sub_ps(r12
,rswitch
);
1382 d
= _mm_max_ps(d
,_mm_setzero_ps());
1383 d2
= _mm_mul_ps(d
,d
);
1384 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1386 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1388 /* Evaluate switch function */
1389 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1390 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv12
,_mm_mul_ps(velec
,dsw
)) );
1391 velec
= _mm_mul_ps(velec
,sw
);
1392 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
1394 /* Update potential sum for this i atom from the interaction with this j atom. */
1395 velec
= _mm_and_ps(velec
,cutoff_mask
);
1396 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1397 velecsum
= _mm_add_ps(velecsum
,velec
);
1401 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1403 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1405 /* Calculate temporary vectorial force */
1406 tx
= _mm_mul_ps(fscal
,dx12
);
1407 ty
= _mm_mul_ps(fscal
,dy12
);
1408 tz
= _mm_mul_ps(fscal
,dz12
);
1410 /* Update vectorial force */
1411 fix1
= _mm_add_ps(fix1
,tx
);
1412 fiy1
= _mm_add_ps(fiy1
,ty
);
1413 fiz1
= _mm_add_ps(fiz1
,tz
);
1415 fjx2
= _mm_add_ps(fjx2
,tx
);
1416 fjy2
= _mm_add_ps(fjy2
,ty
);
1417 fjz2
= _mm_add_ps(fjz2
,tz
);
1421 /**************************
1422 * CALCULATE INTERACTIONS *
1423 **************************/
1425 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1428 r20
= _mm_mul_ps(rsq20
,rinv20
);
1429 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1431 /* EWALD ELECTROSTATICS */
1433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1434 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
1435 ewitab
= _mm_cvttps_epi32(ewrt
);
1436 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1437 ewitab
= _mm_slli_epi32(ewitab
,2);
1438 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1439 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1440 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1441 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1442 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1443 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1444 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1445 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
1446 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
1448 d
= _mm_sub_ps(r20
,rswitch
);
1449 d
= _mm_max_ps(d
,_mm_setzero_ps());
1450 d2
= _mm_mul_ps(d
,d
);
1451 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1453 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1455 /* Evaluate switch function */
1456 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1457 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
1458 velec
= _mm_mul_ps(velec
,sw
);
1459 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
1461 /* Update potential sum for this i atom from the interaction with this j atom. */
1462 velec
= _mm_and_ps(velec
,cutoff_mask
);
1463 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1464 velecsum
= _mm_add_ps(velecsum
,velec
);
1468 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1470 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1472 /* Calculate temporary vectorial force */
1473 tx
= _mm_mul_ps(fscal
,dx20
);
1474 ty
= _mm_mul_ps(fscal
,dy20
);
1475 tz
= _mm_mul_ps(fscal
,dz20
);
1477 /* Update vectorial force */
1478 fix2
= _mm_add_ps(fix2
,tx
);
1479 fiy2
= _mm_add_ps(fiy2
,ty
);
1480 fiz2
= _mm_add_ps(fiz2
,tz
);
1482 fjx0
= _mm_add_ps(fjx0
,tx
);
1483 fjy0
= _mm_add_ps(fjy0
,ty
);
1484 fjz0
= _mm_add_ps(fjz0
,tz
);
1488 /**************************
1489 * CALCULATE INTERACTIONS *
1490 **************************/
1492 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1495 r21
= _mm_mul_ps(rsq21
,rinv21
);
1496 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1498 /* EWALD ELECTROSTATICS */
1500 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1501 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
1502 ewitab
= _mm_cvttps_epi32(ewrt
);
1503 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1504 ewitab
= _mm_slli_epi32(ewitab
,2);
1505 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1506 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1507 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1508 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1509 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1510 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1511 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1512 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(rinv21
,velec
));
1513 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
1515 d
= _mm_sub_ps(r21
,rswitch
);
1516 d
= _mm_max_ps(d
,_mm_setzero_ps());
1517 d2
= _mm_mul_ps(d
,d
);
1518 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1520 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1522 /* Evaluate switch function */
1523 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1524 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv21
,_mm_mul_ps(velec
,dsw
)) );
1525 velec
= _mm_mul_ps(velec
,sw
);
1526 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
1528 /* Update potential sum for this i atom from the interaction with this j atom. */
1529 velec
= _mm_and_ps(velec
,cutoff_mask
);
1530 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1531 velecsum
= _mm_add_ps(velecsum
,velec
);
1535 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1537 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1539 /* Calculate temporary vectorial force */
1540 tx
= _mm_mul_ps(fscal
,dx21
);
1541 ty
= _mm_mul_ps(fscal
,dy21
);
1542 tz
= _mm_mul_ps(fscal
,dz21
);
1544 /* Update vectorial force */
1545 fix2
= _mm_add_ps(fix2
,tx
);
1546 fiy2
= _mm_add_ps(fiy2
,ty
);
1547 fiz2
= _mm_add_ps(fiz2
,tz
);
1549 fjx1
= _mm_add_ps(fjx1
,tx
);
1550 fjy1
= _mm_add_ps(fjy1
,ty
);
1551 fjz1
= _mm_add_ps(fjz1
,tz
);
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1559 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1562 r22
= _mm_mul_ps(rsq22
,rinv22
);
1563 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1565 /* EWALD ELECTROSTATICS */
1567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1568 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
1569 ewitab
= _mm_cvttps_epi32(ewrt
);
1570 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1571 ewitab
= _mm_slli_epi32(ewitab
,2);
1572 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1573 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1574 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1575 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1576 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1577 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1578 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1579 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(rinv22
,velec
));
1580 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
1582 d
= _mm_sub_ps(r22
,rswitch
);
1583 d
= _mm_max_ps(d
,_mm_setzero_ps());
1584 d2
= _mm_mul_ps(d
,d
);
1585 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1587 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1589 /* Evaluate switch function */
1590 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1591 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv22
,_mm_mul_ps(velec
,dsw
)) );
1592 velec
= _mm_mul_ps(velec
,sw
);
1593 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
1595 /* Update potential sum for this i atom from the interaction with this j atom. */
1596 velec
= _mm_and_ps(velec
,cutoff_mask
);
1597 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1598 velecsum
= _mm_add_ps(velecsum
,velec
);
1602 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1604 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1606 /* Calculate temporary vectorial force */
1607 tx
= _mm_mul_ps(fscal
,dx22
);
1608 ty
= _mm_mul_ps(fscal
,dy22
);
1609 tz
= _mm_mul_ps(fscal
,dz22
);
1611 /* Update vectorial force */
1612 fix2
= _mm_add_ps(fix2
,tx
);
1613 fiy2
= _mm_add_ps(fiy2
,ty
);
1614 fiz2
= _mm_add_ps(fiz2
,tz
);
1616 fjx2
= _mm_add_ps(fjx2
,tx
);
1617 fjy2
= _mm_add_ps(fjy2
,ty
);
1618 fjz2
= _mm_add_ps(fjz2
,tz
);
1622 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1623 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1624 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1625 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1627 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1628 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1630 /* Inner loop uses 612 flops */
1633 /* End of innermost loop */
1635 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1636 f
+i_coord_offset
,fshift
+i_shift_offset
);
1639 /* Update potential energies */
1640 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1641 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1643 /* Increment number of inner iterations */
1644 inneriter
+= j_index_end
- j_index_start
;
1646 /* Outer loop uses 20 flops */
1649 /* Increment number of outer iterations */
1652 /* Update outer/inner flops */
1654 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*612);
1657 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1658 * Electrostatics interaction: Ewald
1659 * VdW interaction: LennardJones
1660 * Geometry: Water3-Water3
1661 * Calculate force/pot: Force
1664 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1665 (t_nblist
* gmx_restrict nlist
,
1666 rvec
* gmx_restrict xx
,
1667 rvec
* gmx_restrict ff
,
1668 t_forcerec
* gmx_restrict fr
,
1669 t_mdatoms
* gmx_restrict mdatoms
,
1670 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1671 t_nrnb
* gmx_restrict nrnb
)
1673 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1674 * just 0 for non-waters.
1675 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1676 * jnr indices corresponding to data put in the four positions in the SIMD register.
1678 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1679 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1680 int jnrA
,jnrB
,jnrC
,jnrD
;
1681 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1682 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1683 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1684 real rcutoff_scalar
;
1685 real
*shiftvec
,*fshift
,*x
,*f
;
1686 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1687 real scratch
[4*DIM
];
1688 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1690 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1692 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1694 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1695 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1696 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1697 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1698 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1699 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1700 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1701 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1702 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1703 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1704 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1705 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1706 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1707 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1708 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1709 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1710 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1713 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1716 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1717 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1719 __m128 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1721 __m128 rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
1722 real rswitch_scalar
,d_scalar
;
1723 __m128 dummy_mask
,cutoff_mask
;
1724 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1725 __m128 one
= _mm_set1_ps(1.0);
1726 __m128 two
= _mm_set1_ps(2.0);
1732 jindex
= nlist
->jindex
;
1734 shiftidx
= nlist
->shift
;
1736 shiftvec
= fr
->shift_vec
[0];
1737 fshift
= fr
->fshift
[0];
1738 facel
= _mm_set1_ps(fr
->epsfac
);
1739 charge
= mdatoms
->chargeA
;
1740 nvdwtype
= fr
->ntype
;
1741 vdwparam
= fr
->nbfp
;
1742 vdwtype
= mdatoms
->typeA
;
1744 sh_ewald
= _mm_set1_ps(fr
->ic
->sh_ewald
);
1745 ewtab
= fr
->ic
->tabq_coul_FDV0
;
1746 ewtabscale
= _mm_set1_ps(fr
->ic
->tabq_scale
);
1747 ewtabhalfspace
= _mm_set1_ps(0.5/fr
->ic
->tabq_scale
);
1749 /* Setup water-specific parameters */
1750 inr
= nlist
->iinr
[0];
1751 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1752 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1753 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1754 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1756 jq0
= _mm_set1_ps(charge
[inr
+0]);
1757 jq1
= _mm_set1_ps(charge
[inr
+1]);
1758 jq2
= _mm_set1_ps(charge
[inr
+2]);
1759 vdwjidx0A
= 2*vdwtype
[inr
+0];
1760 qq00
= _mm_mul_ps(iq0
,jq0
);
1761 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1762 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1763 qq01
= _mm_mul_ps(iq0
,jq1
);
1764 qq02
= _mm_mul_ps(iq0
,jq2
);
1765 qq10
= _mm_mul_ps(iq1
,jq0
);
1766 qq11
= _mm_mul_ps(iq1
,jq1
);
1767 qq12
= _mm_mul_ps(iq1
,jq2
);
1768 qq20
= _mm_mul_ps(iq2
,jq0
);
1769 qq21
= _mm_mul_ps(iq2
,jq1
);
1770 qq22
= _mm_mul_ps(iq2
,jq2
);
1772 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1773 rcutoff_scalar
= fr
->rcoulomb
;
1774 rcutoff
= _mm_set1_ps(rcutoff_scalar
);
1775 rcutoff2
= _mm_mul_ps(rcutoff
,rcutoff
);
1777 rswitch_scalar
= fr
->rcoulomb_switch
;
1778 rswitch
= _mm_set1_ps(rswitch_scalar
);
1779 /* Setup switch parameters */
1780 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
1781 d
= _mm_set1_ps(d_scalar
);
1782 swV3
= _mm_set1_ps(-10.0/(d_scalar
*d_scalar
*d_scalar
));
1783 swV4
= _mm_set1_ps( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1784 swV5
= _mm_set1_ps( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1785 swF2
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
));
1786 swF3
= _mm_set1_ps( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1787 swF4
= _mm_set1_ps(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1789 /* Avoid stupid compiler warnings */
1790 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1791 j_coord_offsetA
= 0;
1792 j_coord_offsetB
= 0;
1793 j_coord_offsetC
= 0;
1794 j_coord_offsetD
= 0;
1799 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1801 scratch
[iidx
] = 0.0;
1804 /* Start outer loop over neighborlists */
1805 for(iidx
=0; iidx
<nri
; iidx
++)
1807 /* Load shift vector for this list */
1808 i_shift_offset
= DIM
*shiftidx
[iidx
];
1810 /* Load limits for loop over neighbors */
1811 j_index_start
= jindex
[iidx
];
1812 j_index_end
= jindex
[iidx
+1];
1814 /* Get outer coordinate index */
1816 i_coord_offset
= DIM
*inr
;
1818 /* Load i particle coords and add shift vector */
1819 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1820 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1822 fix0
= _mm_setzero_ps();
1823 fiy0
= _mm_setzero_ps();
1824 fiz0
= _mm_setzero_ps();
1825 fix1
= _mm_setzero_ps();
1826 fiy1
= _mm_setzero_ps();
1827 fiz1
= _mm_setzero_ps();
1828 fix2
= _mm_setzero_ps();
1829 fiy2
= _mm_setzero_ps();
1830 fiz2
= _mm_setzero_ps();
1832 /* Start inner kernel loop */
1833 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1836 /* Get j neighbor index, and coordinate index */
1838 jnrB
= jjnr
[jidx
+1];
1839 jnrC
= jjnr
[jidx
+2];
1840 jnrD
= jjnr
[jidx
+3];
1841 j_coord_offsetA
= DIM
*jnrA
;
1842 j_coord_offsetB
= DIM
*jnrB
;
1843 j_coord_offsetC
= DIM
*jnrC
;
1844 j_coord_offsetD
= DIM
*jnrD
;
1846 /* load j atom coordinates */
1847 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1848 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1849 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1851 /* Calculate displacement vector */
1852 dx00
= _mm_sub_ps(ix0
,jx0
);
1853 dy00
= _mm_sub_ps(iy0
,jy0
);
1854 dz00
= _mm_sub_ps(iz0
,jz0
);
1855 dx01
= _mm_sub_ps(ix0
,jx1
);
1856 dy01
= _mm_sub_ps(iy0
,jy1
);
1857 dz01
= _mm_sub_ps(iz0
,jz1
);
1858 dx02
= _mm_sub_ps(ix0
,jx2
);
1859 dy02
= _mm_sub_ps(iy0
,jy2
);
1860 dz02
= _mm_sub_ps(iz0
,jz2
);
1861 dx10
= _mm_sub_ps(ix1
,jx0
);
1862 dy10
= _mm_sub_ps(iy1
,jy0
);
1863 dz10
= _mm_sub_ps(iz1
,jz0
);
1864 dx11
= _mm_sub_ps(ix1
,jx1
);
1865 dy11
= _mm_sub_ps(iy1
,jy1
);
1866 dz11
= _mm_sub_ps(iz1
,jz1
);
1867 dx12
= _mm_sub_ps(ix1
,jx2
);
1868 dy12
= _mm_sub_ps(iy1
,jy2
);
1869 dz12
= _mm_sub_ps(iz1
,jz2
);
1870 dx20
= _mm_sub_ps(ix2
,jx0
);
1871 dy20
= _mm_sub_ps(iy2
,jy0
);
1872 dz20
= _mm_sub_ps(iz2
,jz0
);
1873 dx21
= _mm_sub_ps(ix2
,jx1
);
1874 dy21
= _mm_sub_ps(iy2
,jy1
);
1875 dz21
= _mm_sub_ps(iz2
,jz1
);
1876 dx22
= _mm_sub_ps(ix2
,jx2
);
1877 dy22
= _mm_sub_ps(iy2
,jy2
);
1878 dz22
= _mm_sub_ps(iz2
,jz2
);
1880 /* Calculate squared distance and things based on it */
1881 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1882 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1883 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1884 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1885 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1886 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1887 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1888 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1889 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1891 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1892 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1893 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1894 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1895 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1896 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1897 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1898 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1899 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1901 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1902 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1903 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1904 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1905 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1906 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1907 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1908 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1909 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1911 fjx0
= _mm_setzero_ps();
1912 fjy0
= _mm_setzero_ps();
1913 fjz0
= _mm_setzero_ps();
1914 fjx1
= _mm_setzero_ps();
1915 fjy1
= _mm_setzero_ps();
1916 fjz1
= _mm_setzero_ps();
1917 fjx2
= _mm_setzero_ps();
1918 fjy2
= _mm_setzero_ps();
1919 fjz2
= _mm_setzero_ps();
1921 /**************************
1922 * CALCULATE INTERACTIONS *
1923 **************************/
1925 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1928 r00
= _mm_mul_ps(rsq00
,rinv00
);
1930 /* EWALD ELECTROSTATICS */
1932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1933 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
1934 ewitab
= _mm_cvttps_epi32(ewrt
);
1935 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
1936 ewitab
= _mm_slli_epi32(ewitab
,2);
1937 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1938 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1939 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
1940 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
1941 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1942 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
1943 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
1944 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
1945 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
1947 /* LENNARD-JONES DISPERSION/REPULSION */
1949 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1950 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
1951 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
1952 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
1953 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
1955 d
= _mm_sub_ps(r00
,rswitch
);
1956 d
= _mm_max_ps(d
,_mm_setzero_ps());
1957 d2
= _mm_mul_ps(d
,d
);
1958 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
1960 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
1962 /* Evaluate switch function */
1963 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1964 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
1965 fvdw
= _mm_sub_ps( _mm_mul_ps(fvdw
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
1966 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
1968 fscal
= _mm_add_ps(felec
,fvdw
);
1970 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
1972 /* Calculate temporary vectorial force */
1973 tx
= _mm_mul_ps(fscal
,dx00
);
1974 ty
= _mm_mul_ps(fscal
,dy00
);
1975 tz
= _mm_mul_ps(fscal
,dz00
);
1977 /* Update vectorial force */
1978 fix0
= _mm_add_ps(fix0
,tx
);
1979 fiy0
= _mm_add_ps(fiy0
,ty
);
1980 fiz0
= _mm_add_ps(fiz0
,tz
);
1982 fjx0
= _mm_add_ps(fjx0
,tx
);
1983 fjy0
= _mm_add_ps(fjy0
,ty
);
1984 fjz0
= _mm_add_ps(fjz0
,tz
);
1988 /**************************
1989 * CALCULATE INTERACTIONS *
1990 **************************/
1992 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1995 r01
= _mm_mul_ps(rsq01
,rinv01
);
1997 /* EWALD ELECTROSTATICS */
1999 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2000 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
2001 ewitab
= _mm_cvttps_epi32(ewrt
);
2002 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2003 ewitab
= _mm_slli_epi32(ewitab
,2);
2004 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2005 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2006 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2007 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2008 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2009 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2010 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2011 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(rinv01
,velec
));
2012 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
2014 d
= _mm_sub_ps(r01
,rswitch
);
2015 d
= _mm_max_ps(d
,_mm_setzero_ps());
2016 d2
= _mm_mul_ps(d
,d
);
2017 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2019 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2021 /* Evaluate switch function */
2022 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2023 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv01
,_mm_mul_ps(velec
,dsw
)) );
2024 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
2028 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2030 /* Calculate temporary vectorial force */
2031 tx
= _mm_mul_ps(fscal
,dx01
);
2032 ty
= _mm_mul_ps(fscal
,dy01
);
2033 tz
= _mm_mul_ps(fscal
,dz01
);
2035 /* Update vectorial force */
2036 fix0
= _mm_add_ps(fix0
,tx
);
2037 fiy0
= _mm_add_ps(fiy0
,ty
);
2038 fiz0
= _mm_add_ps(fiz0
,tz
);
2040 fjx1
= _mm_add_ps(fjx1
,tx
);
2041 fjy1
= _mm_add_ps(fjy1
,ty
);
2042 fjz1
= _mm_add_ps(fjz1
,tz
);
2046 /**************************
2047 * CALCULATE INTERACTIONS *
2048 **************************/
2050 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2053 r02
= _mm_mul_ps(rsq02
,rinv02
);
2055 /* EWALD ELECTROSTATICS */
2057 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2058 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
2059 ewitab
= _mm_cvttps_epi32(ewrt
);
2060 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2061 ewitab
= _mm_slli_epi32(ewitab
,2);
2062 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2063 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2064 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2065 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2066 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2067 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2068 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2069 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(rinv02
,velec
));
2070 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
2072 d
= _mm_sub_ps(r02
,rswitch
);
2073 d
= _mm_max_ps(d
,_mm_setzero_ps());
2074 d2
= _mm_mul_ps(d
,d
);
2075 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2077 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2079 /* Evaluate switch function */
2080 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2081 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv02
,_mm_mul_ps(velec
,dsw
)) );
2082 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
2086 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2088 /* Calculate temporary vectorial force */
2089 tx
= _mm_mul_ps(fscal
,dx02
);
2090 ty
= _mm_mul_ps(fscal
,dy02
);
2091 tz
= _mm_mul_ps(fscal
,dz02
);
2093 /* Update vectorial force */
2094 fix0
= _mm_add_ps(fix0
,tx
);
2095 fiy0
= _mm_add_ps(fiy0
,ty
);
2096 fiz0
= _mm_add_ps(fiz0
,tz
);
2098 fjx2
= _mm_add_ps(fjx2
,tx
);
2099 fjy2
= _mm_add_ps(fjy2
,ty
);
2100 fjz2
= _mm_add_ps(fjz2
,tz
);
2104 /**************************
2105 * CALCULATE INTERACTIONS *
2106 **************************/
2108 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2111 r10
= _mm_mul_ps(rsq10
,rinv10
);
2113 /* EWALD ELECTROSTATICS */
2115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2116 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
2117 ewitab
= _mm_cvttps_epi32(ewrt
);
2118 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2119 ewitab
= _mm_slli_epi32(ewitab
,2);
2120 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2121 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2122 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2123 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2124 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2125 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2126 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2127 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
2128 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
2130 d
= _mm_sub_ps(r10
,rswitch
);
2131 d
= _mm_max_ps(d
,_mm_setzero_ps());
2132 d2
= _mm_mul_ps(d
,d
);
2133 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2135 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2137 /* Evaluate switch function */
2138 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2139 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
2140 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
2144 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2146 /* Calculate temporary vectorial force */
2147 tx
= _mm_mul_ps(fscal
,dx10
);
2148 ty
= _mm_mul_ps(fscal
,dy10
);
2149 tz
= _mm_mul_ps(fscal
,dz10
);
2151 /* Update vectorial force */
2152 fix1
= _mm_add_ps(fix1
,tx
);
2153 fiy1
= _mm_add_ps(fiy1
,ty
);
2154 fiz1
= _mm_add_ps(fiz1
,tz
);
2156 fjx0
= _mm_add_ps(fjx0
,tx
);
2157 fjy0
= _mm_add_ps(fjy0
,ty
);
2158 fjz0
= _mm_add_ps(fjz0
,tz
);
2162 /**************************
2163 * CALCULATE INTERACTIONS *
2164 **************************/
2166 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2169 r11
= _mm_mul_ps(rsq11
,rinv11
);
2171 /* EWALD ELECTROSTATICS */
2173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
2175 ewitab
= _mm_cvttps_epi32(ewrt
);
2176 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2177 ewitab
= _mm_slli_epi32(ewitab
,2);
2178 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2179 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2180 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2181 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2182 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2183 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2184 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2185 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(rinv11
,velec
));
2186 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
2188 d
= _mm_sub_ps(r11
,rswitch
);
2189 d
= _mm_max_ps(d
,_mm_setzero_ps());
2190 d2
= _mm_mul_ps(d
,d
);
2191 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2193 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2195 /* Evaluate switch function */
2196 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2197 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv11
,_mm_mul_ps(velec
,dsw
)) );
2198 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
2202 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2204 /* Calculate temporary vectorial force */
2205 tx
= _mm_mul_ps(fscal
,dx11
);
2206 ty
= _mm_mul_ps(fscal
,dy11
);
2207 tz
= _mm_mul_ps(fscal
,dz11
);
2209 /* Update vectorial force */
2210 fix1
= _mm_add_ps(fix1
,tx
);
2211 fiy1
= _mm_add_ps(fiy1
,ty
);
2212 fiz1
= _mm_add_ps(fiz1
,tz
);
2214 fjx1
= _mm_add_ps(fjx1
,tx
);
2215 fjy1
= _mm_add_ps(fjy1
,ty
);
2216 fjz1
= _mm_add_ps(fjz1
,tz
);
2220 /**************************
2221 * CALCULATE INTERACTIONS *
2222 **************************/
2224 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2227 r12
= _mm_mul_ps(rsq12
,rinv12
);
2229 /* EWALD ELECTROSTATICS */
2231 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
2233 ewitab
= _mm_cvttps_epi32(ewrt
);
2234 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2235 ewitab
= _mm_slli_epi32(ewitab
,2);
2236 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2237 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2238 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2239 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2240 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2241 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2242 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2243 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(rinv12
,velec
));
2244 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
2246 d
= _mm_sub_ps(r12
,rswitch
);
2247 d
= _mm_max_ps(d
,_mm_setzero_ps());
2248 d2
= _mm_mul_ps(d
,d
);
2249 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2251 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2253 /* Evaluate switch function */
2254 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2255 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv12
,_mm_mul_ps(velec
,dsw
)) );
2256 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
2260 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2262 /* Calculate temporary vectorial force */
2263 tx
= _mm_mul_ps(fscal
,dx12
);
2264 ty
= _mm_mul_ps(fscal
,dy12
);
2265 tz
= _mm_mul_ps(fscal
,dz12
);
2267 /* Update vectorial force */
2268 fix1
= _mm_add_ps(fix1
,tx
);
2269 fiy1
= _mm_add_ps(fiy1
,ty
);
2270 fiz1
= _mm_add_ps(fiz1
,tz
);
2272 fjx2
= _mm_add_ps(fjx2
,tx
);
2273 fjy2
= _mm_add_ps(fjy2
,ty
);
2274 fjz2
= _mm_add_ps(fjz2
,tz
);
2278 /**************************
2279 * CALCULATE INTERACTIONS *
2280 **************************/
2282 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2285 r20
= _mm_mul_ps(rsq20
,rinv20
);
2287 /* EWALD ELECTROSTATICS */
2289 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2290 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
2291 ewitab
= _mm_cvttps_epi32(ewrt
);
2292 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2293 ewitab
= _mm_slli_epi32(ewitab
,2);
2294 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2295 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2296 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2297 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2298 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2299 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2300 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2301 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
2302 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
2304 d
= _mm_sub_ps(r20
,rswitch
);
2305 d
= _mm_max_ps(d
,_mm_setzero_ps());
2306 d2
= _mm_mul_ps(d
,d
);
2307 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2309 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2311 /* Evaluate switch function */
2312 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2313 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
2314 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
2318 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2320 /* Calculate temporary vectorial force */
2321 tx
= _mm_mul_ps(fscal
,dx20
);
2322 ty
= _mm_mul_ps(fscal
,dy20
);
2323 tz
= _mm_mul_ps(fscal
,dz20
);
2325 /* Update vectorial force */
2326 fix2
= _mm_add_ps(fix2
,tx
);
2327 fiy2
= _mm_add_ps(fiy2
,ty
);
2328 fiz2
= _mm_add_ps(fiz2
,tz
);
2330 fjx0
= _mm_add_ps(fjx0
,tx
);
2331 fjy0
= _mm_add_ps(fjy0
,ty
);
2332 fjz0
= _mm_add_ps(fjz0
,tz
);
2336 /**************************
2337 * CALCULATE INTERACTIONS *
2338 **************************/
2340 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2343 r21
= _mm_mul_ps(rsq21
,rinv21
);
2345 /* EWALD ELECTROSTATICS */
2347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2348 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
2349 ewitab
= _mm_cvttps_epi32(ewrt
);
2350 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2351 ewitab
= _mm_slli_epi32(ewitab
,2);
2352 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2353 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2354 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2355 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2356 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2357 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2358 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2359 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(rinv21
,velec
));
2360 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
2362 d
= _mm_sub_ps(r21
,rswitch
);
2363 d
= _mm_max_ps(d
,_mm_setzero_ps());
2364 d2
= _mm_mul_ps(d
,d
);
2365 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2367 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2369 /* Evaluate switch function */
2370 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2371 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv21
,_mm_mul_ps(velec
,dsw
)) );
2372 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
2376 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2378 /* Calculate temporary vectorial force */
2379 tx
= _mm_mul_ps(fscal
,dx21
);
2380 ty
= _mm_mul_ps(fscal
,dy21
);
2381 tz
= _mm_mul_ps(fscal
,dz21
);
2383 /* Update vectorial force */
2384 fix2
= _mm_add_ps(fix2
,tx
);
2385 fiy2
= _mm_add_ps(fiy2
,ty
);
2386 fiz2
= _mm_add_ps(fiz2
,tz
);
2388 fjx1
= _mm_add_ps(fjx1
,tx
);
2389 fjy1
= _mm_add_ps(fjy1
,ty
);
2390 fjz1
= _mm_add_ps(fjz1
,tz
);
2394 /**************************
2395 * CALCULATE INTERACTIONS *
2396 **************************/
2398 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2401 r22
= _mm_mul_ps(rsq22
,rinv22
);
2403 /* EWALD ELECTROSTATICS */
2405 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2406 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
2407 ewitab
= _mm_cvttps_epi32(ewrt
);
2408 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2409 ewitab
= _mm_slli_epi32(ewitab
,2);
2410 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2411 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2412 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2413 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2414 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2415 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2416 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2417 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(rinv22
,velec
));
2418 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
2420 d
= _mm_sub_ps(r22
,rswitch
);
2421 d
= _mm_max_ps(d
,_mm_setzero_ps());
2422 d2
= _mm_mul_ps(d
,d
);
2423 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2425 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2427 /* Evaluate switch function */
2428 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2429 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv22
,_mm_mul_ps(velec
,dsw
)) );
2430 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
2434 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2436 /* Calculate temporary vectorial force */
2437 tx
= _mm_mul_ps(fscal
,dx22
);
2438 ty
= _mm_mul_ps(fscal
,dy22
);
2439 tz
= _mm_mul_ps(fscal
,dz22
);
2441 /* Update vectorial force */
2442 fix2
= _mm_add_ps(fix2
,tx
);
2443 fiy2
= _mm_add_ps(fiy2
,ty
);
2444 fiz2
= _mm_add_ps(fiz2
,tz
);
2446 fjx2
= _mm_add_ps(fjx2
,tx
);
2447 fjy2
= _mm_add_ps(fjy2
,ty
);
2448 fjz2
= _mm_add_ps(fjz2
,tz
);
2452 fjptrA
= f
+j_coord_offsetA
;
2453 fjptrB
= f
+j_coord_offsetB
;
2454 fjptrC
= f
+j_coord_offsetC
;
2455 fjptrD
= f
+j_coord_offsetD
;
2457 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2458 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2460 /* Inner loop uses 573 flops */
2463 if(jidx
<j_index_end
)
2466 /* Get j neighbor index, and coordinate index */
2467 jnrlistA
= jjnr
[jidx
];
2468 jnrlistB
= jjnr
[jidx
+1];
2469 jnrlistC
= jjnr
[jidx
+2];
2470 jnrlistD
= jjnr
[jidx
+3];
2471 /* Sign of each element will be negative for non-real atoms.
2472 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2473 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2475 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
2476 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2477 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2478 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2479 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2480 j_coord_offsetA
= DIM
*jnrA
;
2481 j_coord_offsetB
= DIM
*jnrB
;
2482 j_coord_offsetC
= DIM
*jnrC
;
2483 j_coord_offsetD
= DIM
*jnrD
;
2485 /* load j atom coordinates */
2486 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2487 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2488 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
2490 /* Calculate displacement vector */
2491 dx00
= _mm_sub_ps(ix0
,jx0
);
2492 dy00
= _mm_sub_ps(iy0
,jy0
);
2493 dz00
= _mm_sub_ps(iz0
,jz0
);
2494 dx01
= _mm_sub_ps(ix0
,jx1
);
2495 dy01
= _mm_sub_ps(iy0
,jy1
);
2496 dz01
= _mm_sub_ps(iz0
,jz1
);
2497 dx02
= _mm_sub_ps(ix0
,jx2
);
2498 dy02
= _mm_sub_ps(iy0
,jy2
);
2499 dz02
= _mm_sub_ps(iz0
,jz2
);
2500 dx10
= _mm_sub_ps(ix1
,jx0
);
2501 dy10
= _mm_sub_ps(iy1
,jy0
);
2502 dz10
= _mm_sub_ps(iz1
,jz0
);
2503 dx11
= _mm_sub_ps(ix1
,jx1
);
2504 dy11
= _mm_sub_ps(iy1
,jy1
);
2505 dz11
= _mm_sub_ps(iz1
,jz1
);
2506 dx12
= _mm_sub_ps(ix1
,jx2
);
2507 dy12
= _mm_sub_ps(iy1
,jy2
);
2508 dz12
= _mm_sub_ps(iz1
,jz2
);
2509 dx20
= _mm_sub_ps(ix2
,jx0
);
2510 dy20
= _mm_sub_ps(iy2
,jy0
);
2511 dz20
= _mm_sub_ps(iz2
,jz0
);
2512 dx21
= _mm_sub_ps(ix2
,jx1
);
2513 dy21
= _mm_sub_ps(iy2
,jy1
);
2514 dz21
= _mm_sub_ps(iz2
,jz1
);
2515 dx22
= _mm_sub_ps(ix2
,jx2
);
2516 dy22
= _mm_sub_ps(iy2
,jy2
);
2517 dz22
= _mm_sub_ps(iz2
,jz2
);
2519 /* Calculate squared distance and things based on it */
2520 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
2521 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
2522 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
2523 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
2524 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
2525 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
2526 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
2527 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
2528 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2530 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
2531 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
2532 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
2533 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
2534 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
2535 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
2536 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
2537 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
2538 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
2540 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
2541 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
2542 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
2543 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
2544 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
2545 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
2546 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
2547 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
2548 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
2550 fjx0
= _mm_setzero_ps();
2551 fjy0
= _mm_setzero_ps();
2552 fjz0
= _mm_setzero_ps();
2553 fjx1
= _mm_setzero_ps();
2554 fjy1
= _mm_setzero_ps();
2555 fjz1
= _mm_setzero_ps();
2556 fjx2
= _mm_setzero_ps();
2557 fjy2
= _mm_setzero_ps();
2558 fjz2
= _mm_setzero_ps();
2560 /**************************
2561 * CALCULATE INTERACTIONS *
2562 **************************/
2564 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2567 r00
= _mm_mul_ps(rsq00
,rinv00
);
2568 r00
= _mm_andnot_ps(dummy_mask
,r00
);
2570 /* EWALD ELECTROSTATICS */
2572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2573 ewrt
= _mm_mul_ps(r00
,ewtabscale
);
2574 ewitab
= _mm_cvttps_epi32(ewrt
);
2575 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2576 ewitab
= _mm_slli_epi32(ewitab
,2);
2577 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2578 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2579 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2580 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2581 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2582 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2583 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2584 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(rinv00
,velec
));
2585 felec
= _mm_mul_ps(_mm_mul_ps(qq00
,rinv00
),_mm_sub_ps(rinvsq00
,felec
));
2587 /* LENNARD-JONES DISPERSION/REPULSION */
2589 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
2590 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
2591 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
2592 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
2593 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
2595 d
= _mm_sub_ps(r00
,rswitch
);
2596 d
= _mm_max_ps(d
,_mm_setzero_ps());
2597 d2
= _mm_mul_ps(d
,d
);
2598 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2600 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2602 /* Evaluate switch function */
2603 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2604 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(velec
,dsw
)) );
2605 fvdw
= _mm_sub_ps( _mm_mul_ps(fvdw
,sw
) , _mm_mul_ps(rinv00
,_mm_mul_ps(vvdw
,dsw
)) );
2606 cutoff_mask
= _mm_cmplt_ps(rsq00
,rcutoff2
);
2608 fscal
= _mm_add_ps(felec
,fvdw
);
2610 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2612 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2614 /* Calculate temporary vectorial force */
2615 tx
= _mm_mul_ps(fscal
,dx00
);
2616 ty
= _mm_mul_ps(fscal
,dy00
);
2617 tz
= _mm_mul_ps(fscal
,dz00
);
2619 /* Update vectorial force */
2620 fix0
= _mm_add_ps(fix0
,tx
);
2621 fiy0
= _mm_add_ps(fiy0
,ty
);
2622 fiz0
= _mm_add_ps(fiz0
,tz
);
2624 fjx0
= _mm_add_ps(fjx0
,tx
);
2625 fjy0
= _mm_add_ps(fjy0
,ty
);
2626 fjz0
= _mm_add_ps(fjz0
,tz
);
2630 /**************************
2631 * CALCULATE INTERACTIONS *
2632 **************************/
2634 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
2637 r01
= _mm_mul_ps(rsq01
,rinv01
);
2638 r01
= _mm_andnot_ps(dummy_mask
,r01
);
2640 /* EWALD ELECTROSTATICS */
2642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2643 ewrt
= _mm_mul_ps(r01
,ewtabscale
);
2644 ewitab
= _mm_cvttps_epi32(ewrt
);
2645 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2646 ewitab
= _mm_slli_epi32(ewitab
,2);
2647 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2648 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2649 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2650 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2651 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2652 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2653 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2654 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(rinv01
,velec
));
2655 felec
= _mm_mul_ps(_mm_mul_ps(qq01
,rinv01
),_mm_sub_ps(rinvsq01
,felec
));
2657 d
= _mm_sub_ps(r01
,rswitch
);
2658 d
= _mm_max_ps(d
,_mm_setzero_ps());
2659 d2
= _mm_mul_ps(d
,d
);
2660 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2662 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2664 /* Evaluate switch function */
2665 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2666 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv01
,_mm_mul_ps(velec
,dsw
)) );
2667 cutoff_mask
= _mm_cmplt_ps(rsq01
,rcutoff2
);
2671 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2673 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2675 /* Calculate temporary vectorial force */
2676 tx
= _mm_mul_ps(fscal
,dx01
);
2677 ty
= _mm_mul_ps(fscal
,dy01
);
2678 tz
= _mm_mul_ps(fscal
,dz01
);
2680 /* Update vectorial force */
2681 fix0
= _mm_add_ps(fix0
,tx
);
2682 fiy0
= _mm_add_ps(fiy0
,ty
);
2683 fiz0
= _mm_add_ps(fiz0
,tz
);
2685 fjx1
= _mm_add_ps(fjx1
,tx
);
2686 fjy1
= _mm_add_ps(fjy1
,ty
);
2687 fjz1
= _mm_add_ps(fjz1
,tz
);
2691 /**************************
2692 * CALCULATE INTERACTIONS *
2693 **************************/
2695 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2698 r02
= _mm_mul_ps(rsq02
,rinv02
);
2699 r02
= _mm_andnot_ps(dummy_mask
,r02
);
2701 /* EWALD ELECTROSTATICS */
2703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2704 ewrt
= _mm_mul_ps(r02
,ewtabscale
);
2705 ewitab
= _mm_cvttps_epi32(ewrt
);
2706 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2707 ewitab
= _mm_slli_epi32(ewitab
,2);
2708 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2709 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2710 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2711 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2712 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2713 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2714 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2715 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(rinv02
,velec
));
2716 felec
= _mm_mul_ps(_mm_mul_ps(qq02
,rinv02
),_mm_sub_ps(rinvsq02
,felec
));
2718 d
= _mm_sub_ps(r02
,rswitch
);
2719 d
= _mm_max_ps(d
,_mm_setzero_ps());
2720 d2
= _mm_mul_ps(d
,d
);
2721 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2723 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2725 /* Evaluate switch function */
2726 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2727 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv02
,_mm_mul_ps(velec
,dsw
)) );
2728 cutoff_mask
= _mm_cmplt_ps(rsq02
,rcutoff2
);
2732 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2734 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2736 /* Calculate temporary vectorial force */
2737 tx
= _mm_mul_ps(fscal
,dx02
);
2738 ty
= _mm_mul_ps(fscal
,dy02
);
2739 tz
= _mm_mul_ps(fscal
,dz02
);
2741 /* Update vectorial force */
2742 fix0
= _mm_add_ps(fix0
,tx
);
2743 fiy0
= _mm_add_ps(fiy0
,ty
);
2744 fiz0
= _mm_add_ps(fiz0
,tz
);
2746 fjx2
= _mm_add_ps(fjx2
,tx
);
2747 fjy2
= _mm_add_ps(fjy2
,ty
);
2748 fjz2
= _mm_add_ps(fjz2
,tz
);
2752 /**************************
2753 * CALCULATE INTERACTIONS *
2754 **************************/
2756 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2759 r10
= _mm_mul_ps(rsq10
,rinv10
);
2760 r10
= _mm_andnot_ps(dummy_mask
,r10
);
2762 /* EWALD ELECTROSTATICS */
2764 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2765 ewrt
= _mm_mul_ps(r10
,ewtabscale
);
2766 ewitab
= _mm_cvttps_epi32(ewrt
);
2767 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2768 ewitab
= _mm_slli_epi32(ewitab
,2);
2769 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2770 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2771 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2772 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2773 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2774 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2775 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2776 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(rinv10
,velec
));
2777 felec
= _mm_mul_ps(_mm_mul_ps(qq10
,rinv10
),_mm_sub_ps(rinvsq10
,felec
));
2779 d
= _mm_sub_ps(r10
,rswitch
);
2780 d
= _mm_max_ps(d
,_mm_setzero_ps());
2781 d2
= _mm_mul_ps(d
,d
);
2782 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2784 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2786 /* Evaluate switch function */
2787 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2788 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv10
,_mm_mul_ps(velec
,dsw
)) );
2789 cutoff_mask
= _mm_cmplt_ps(rsq10
,rcutoff2
);
2793 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2795 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2797 /* Calculate temporary vectorial force */
2798 tx
= _mm_mul_ps(fscal
,dx10
);
2799 ty
= _mm_mul_ps(fscal
,dy10
);
2800 tz
= _mm_mul_ps(fscal
,dz10
);
2802 /* Update vectorial force */
2803 fix1
= _mm_add_ps(fix1
,tx
);
2804 fiy1
= _mm_add_ps(fiy1
,ty
);
2805 fiz1
= _mm_add_ps(fiz1
,tz
);
2807 fjx0
= _mm_add_ps(fjx0
,tx
);
2808 fjy0
= _mm_add_ps(fjy0
,ty
);
2809 fjz0
= _mm_add_ps(fjz0
,tz
);
2813 /**************************
2814 * CALCULATE INTERACTIONS *
2815 **************************/
2817 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2820 r11
= _mm_mul_ps(rsq11
,rinv11
);
2821 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2823 /* EWALD ELECTROSTATICS */
2825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2826 ewrt
= _mm_mul_ps(r11
,ewtabscale
);
2827 ewitab
= _mm_cvttps_epi32(ewrt
);
2828 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2829 ewitab
= _mm_slli_epi32(ewitab
,2);
2830 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2831 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2832 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2833 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2834 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2835 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2836 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2837 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(rinv11
,velec
));
2838 felec
= _mm_mul_ps(_mm_mul_ps(qq11
,rinv11
),_mm_sub_ps(rinvsq11
,felec
));
2840 d
= _mm_sub_ps(r11
,rswitch
);
2841 d
= _mm_max_ps(d
,_mm_setzero_ps());
2842 d2
= _mm_mul_ps(d
,d
);
2843 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2845 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2847 /* Evaluate switch function */
2848 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2849 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv11
,_mm_mul_ps(velec
,dsw
)) );
2850 cutoff_mask
= _mm_cmplt_ps(rsq11
,rcutoff2
);
2854 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2856 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2858 /* Calculate temporary vectorial force */
2859 tx
= _mm_mul_ps(fscal
,dx11
);
2860 ty
= _mm_mul_ps(fscal
,dy11
);
2861 tz
= _mm_mul_ps(fscal
,dz11
);
2863 /* Update vectorial force */
2864 fix1
= _mm_add_ps(fix1
,tx
);
2865 fiy1
= _mm_add_ps(fiy1
,ty
);
2866 fiz1
= _mm_add_ps(fiz1
,tz
);
2868 fjx1
= _mm_add_ps(fjx1
,tx
);
2869 fjy1
= _mm_add_ps(fjy1
,ty
);
2870 fjz1
= _mm_add_ps(fjz1
,tz
);
2874 /**************************
2875 * CALCULATE INTERACTIONS *
2876 **************************/
2878 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2881 r12
= _mm_mul_ps(rsq12
,rinv12
);
2882 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2884 /* EWALD ELECTROSTATICS */
2886 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2887 ewrt
= _mm_mul_ps(r12
,ewtabscale
);
2888 ewitab
= _mm_cvttps_epi32(ewrt
);
2889 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2890 ewitab
= _mm_slli_epi32(ewitab
,2);
2891 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2892 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2893 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2894 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2895 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2896 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2897 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2898 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(rinv12
,velec
));
2899 felec
= _mm_mul_ps(_mm_mul_ps(qq12
,rinv12
),_mm_sub_ps(rinvsq12
,felec
));
2901 d
= _mm_sub_ps(r12
,rswitch
);
2902 d
= _mm_max_ps(d
,_mm_setzero_ps());
2903 d2
= _mm_mul_ps(d
,d
);
2904 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2906 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2908 /* Evaluate switch function */
2909 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2910 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv12
,_mm_mul_ps(velec
,dsw
)) );
2911 cutoff_mask
= _mm_cmplt_ps(rsq12
,rcutoff2
);
2915 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2917 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2919 /* Calculate temporary vectorial force */
2920 tx
= _mm_mul_ps(fscal
,dx12
);
2921 ty
= _mm_mul_ps(fscal
,dy12
);
2922 tz
= _mm_mul_ps(fscal
,dz12
);
2924 /* Update vectorial force */
2925 fix1
= _mm_add_ps(fix1
,tx
);
2926 fiy1
= _mm_add_ps(fiy1
,ty
);
2927 fiz1
= _mm_add_ps(fiz1
,tz
);
2929 fjx2
= _mm_add_ps(fjx2
,tx
);
2930 fjy2
= _mm_add_ps(fjy2
,ty
);
2931 fjz2
= _mm_add_ps(fjz2
,tz
);
2935 /**************************
2936 * CALCULATE INTERACTIONS *
2937 **************************/
2939 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2942 r20
= _mm_mul_ps(rsq20
,rinv20
);
2943 r20
= _mm_andnot_ps(dummy_mask
,r20
);
2945 /* EWALD ELECTROSTATICS */
2947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2948 ewrt
= _mm_mul_ps(r20
,ewtabscale
);
2949 ewitab
= _mm_cvttps_epi32(ewrt
);
2950 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
2951 ewitab
= _mm_slli_epi32(ewitab
,2);
2952 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2953 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2954 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
2955 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
2956 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
2957 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
2958 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
2959 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(rinv20
,velec
));
2960 felec
= _mm_mul_ps(_mm_mul_ps(qq20
,rinv20
),_mm_sub_ps(rinvsq20
,felec
));
2962 d
= _mm_sub_ps(r20
,rswitch
);
2963 d
= _mm_max_ps(d
,_mm_setzero_ps());
2964 d2
= _mm_mul_ps(d
,d
);
2965 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
2967 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
2969 /* Evaluate switch function */
2970 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2971 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv20
,_mm_mul_ps(velec
,dsw
)) );
2972 cutoff_mask
= _mm_cmplt_ps(rsq20
,rcutoff2
);
2976 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
2978 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2980 /* Calculate temporary vectorial force */
2981 tx
= _mm_mul_ps(fscal
,dx20
);
2982 ty
= _mm_mul_ps(fscal
,dy20
);
2983 tz
= _mm_mul_ps(fscal
,dz20
);
2985 /* Update vectorial force */
2986 fix2
= _mm_add_ps(fix2
,tx
);
2987 fiy2
= _mm_add_ps(fiy2
,ty
);
2988 fiz2
= _mm_add_ps(fiz2
,tz
);
2990 fjx0
= _mm_add_ps(fjx0
,tx
);
2991 fjy0
= _mm_add_ps(fjy0
,ty
);
2992 fjz0
= _mm_add_ps(fjz0
,tz
);
2996 /**************************
2997 * CALCULATE INTERACTIONS *
2998 **************************/
3000 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
3003 r21
= _mm_mul_ps(rsq21
,rinv21
);
3004 r21
= _mm_andnot_ps(dummy_mask
,r21
);
3006 /* EWALD ELECTROSTATICS */
3008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3009 ewrt
= _mm_mul_ps(r21
,ewtabscale
);
3010 ewitab
= _mm_cvttps_epi32(ewrt
);
3011 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
3012 ewitab
= _mm_slli_epi32(ewitab
,2);
3013 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3014 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
3015 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
3016 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
3017 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
3018 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
3019 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
3020 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(rinv21
,velec
));
3021 felec
= _mm_mul_ps(_mm_mul_ps(qq21
,rinv21
),_mm_sub_ps(rinvsq21
,felec
));
3023 d
= _mm_sub_ps(r21
,rswitch
);
3024 d
= _mm_max_ps(d
,_mm_setzero_ps());
3025 d2
= _mm_mul_ps(d
,d
);
3026 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
3028 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
3030 /* Evaluate switch function */
3031 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3032 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv21
,_mm_mul_ps(velec
,dsw
)) );
3033 cutoff_mask
= _mm_cmplt_ps(rsq21
,rcutoff2
);
3037 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
3039 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
3041 /* Calculate temporary vectorial force */
3042 tx
= _mm_mul_ps(fscal
,dx21
);
3043 ty
= _mm_mul_ps(fscal
,dy21
);
3044 tz
= _mm_mul_ps(fscal
,dz21
);
3046 /* Update vectorial force */
3047 fix2
= _mm_add_ps(fix2
,tx
);
3048 fiy2
= _mm_add_ps(fiy2
,ty
);
3049 fiz2
= _mm_add_ps(fiz2
,tz
);
3051 fjx1
= _mm_add_ps(fjx1
,tx
);
3052 fjy1
= _mm_add_ps(fjy1
,ty
);
3053 fjz1
= _mm_add_ps(fjz1
,tz
);
3057 /**************************
3058 * CALCULATE INTERACTIONS *
3059 **************************/
3061 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
3064 r22
= _mm_mul_ps(rsq22
,rinv22
);
3065 r22
= _mm_andnot_ps(dummy_mask
,r22
);
3067 /* EWALD ELECTROSTATICS */
3069 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3070 ewrt
= _mm_mul_ps(r22
,ewtabscale
);
3071 ewitab
= _mm_cvttps_epi32(ewrt
);
3072 eweps
= _mm_sub_ps(ewrt
,_mm_cvtepi32_ps(ewitab
));
3073 ewitab
= _mm_slli_epi32(ewitab
,2);
3074 ewtabF
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3075 ewtabD
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
3076 ewtabV
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,2) );
3077 ewtabFn
= _mm_load_ps( ewtab
+ gmx_mm_extract_epi32(ewitab
,3) );
3078 _MM_TRANSPOSE4_PS(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
3079 felec
= _mm_add_ps(ewtabF
,_mm_mul_ps(eweps
,ewtabD
));
3080 velec
= _mm_sub_ps(ewtabV
,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace
,eweps
),_mm_add_ps(ewtabF
,felec
)));
3081 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(rinv22
,velec
));
3082 felec
= _mm_mul_ps(_mm_mul_ps(qq22
,rinv22
),_mm_sub_ps(rinvsq22
,felec
));
3084 d
= _mm_sub_ps(r22
,rswitch
);
3085 d
= _mm_max_ps(d
,_mm_setzero_ps());
3086 d2
= _mm_mul_ps(d
,d
);
3087 sw
= _mm_add_ps(one
,_mm_mul_ps(d2
,_mm_mul_ps(d
,_mm_add_ps(swV3
,_mm_mul_ps(d
,_mm_add_ps(swV4
,_mm_mul_ps(d
,swV5
)))))));
3089 dsw
= _mm_mul_ps(d2
,_mm_add_ps(swF2
,_mm_mul_ps(d
,_mm_add_ps(swF3
,_mm_mul_ps(d
,swF4
)))));
3091 /* Evaluate switch function */
3092 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3093 felec
= _mm_sub_ps( _mm_mul_ps(felec
,sw
) , _mm_mul_ps(rinv22
,_mm_mul_ps(velec
,dsw
)) );
3094 cutoff_mask
= _mm_cmplt_ps(rsq22
,rcutoff2
);
3098 fscal
= _mm_and_ps(fscal
,cutoff_mask
);
3100 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
3102 /* Calculate temporary vectorial force */
3103 tx
= _mm_mul_ps(fscal
,dx22
);
3104 ty
= _mm_mul_ps(fscal
,dy22
);
3105 tz
= _mm_mul_ps(fscal
,dz22
);
3107 /* Update vectorial force */
3108 fix2
= _mm_add_ps(fix2
,tx
);
3109 fiy2
= _mm_add_ps(fiy2
,ty
);
3110 fiz2
= _mm_add_ps(fiz2
,tz
);
3112 fjx2
= _mm_add_ps(fjx2
,tx
);
3113 fjy2
= _mm_add_ps(fjy2
,ty
);
3114 fjz2
= _mm_add_ps(fjz2
,tz
);
3118 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
3119 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
3120 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
3121 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
3123 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
3124 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
3126 /* Inner loop uses 582 flops */
3129 /* End of innermost loop */
3131 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
3132 f
+i_coord_offset
,fshift
+i_shift_offset
);
3134 /* Increment number of inner iterations */
3135 inneriter
+= j_index_end
- j_index_start
;
3137 /* Outer loop uses 18 flops */
3140 /* Increment number of outer iterations */
3143 /* Update outer/inner flops */
3145 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*582);