2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
116 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
118 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
119 real rswitch_scalar
,d_scalar
;
120 __m128d dummy_mask
,cutoff_mask
;
121 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
122 __m128d one
= _mm_set1_pd(1.0);
123 __m128d two
= _mm_set1_pd(2.0);
129 jindex
= nlist
->jindex
;
131 shiftidx
= nlist
->shift
;
133 shiftvec
= fr
->shift_vec
[0];
134 fshift
= fr
->fshift
[0];
135 facel
= _mm_set1_pd(fr
->epsfac
);
136 charge
= mdatoms
->chargeA
;
137 nvdwtype
= fr
->ntype
;
139 vdwtype
= mdatoms
->typeA
;
141 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
142 ewtab
= fr
->ic
->tabq_coul_FDV0
;
143 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
144 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
146 /* Setup water-specific parameters */
147 inr
= nlist
->iinr
[0];
148 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
149 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
150 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
151 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
153 jq1
= _mm_set1_pd(charge
[inr
+1]);
154 jq2
= _mm_set1_pd(charge
[inr
+2]);
155 jq3
= _mm_set1_pd(charge
[inr
+3]);
156 vdwjidx0A
= 2*vdwtype
[inr
+0];
157 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
158 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
159 qq11
= _mm_mul_pd(iq1
,jq1
);
160 qq12
= _mm_mul_pd(iq1
,jq2
);
161 qq13
= _mm_mul_pd(iq1
,jq3
);
162 qq21
= _mm_mul_pd(iq2
,jq1
);
163 qq22
= _mm_mul_pd(iq2
,jq2
);
164 qq23
= _mm_mul_pd(iq2
,jq3
);
165 qq31
= _mm_mul_pd(iq3
,jq1
);
166 qq32
= _mm_mul_pd(iq3
,jq2
);
167 qq33
= _mm_mul_pd(iq3
,jq3
);
169 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
170 rcutoff_scalar
= fr
->rcoulomb
;
171 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
172 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
174 rswitch_scalar
= fr
->rcoulomb_switch
;
175 rswitch
= _mm_set1_pd(rswitch_scalar
);
176 /* Setup switch parameters */
177 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
178 d
= _mm_set1_pd(d_scalar
);
179 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
180 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
181 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
182 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
183 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
184 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
186 /* Avoid stupid compiler warnings */
194 /* Start outer loop over neighborlists */
195 for(iidx
=0; iidx
<nri
; iidx
++)
197 /* Load shift vector for this list */
198 i_shift_offset
= DIM
*shiftidx
[iidx
];
200 /* Load limits for loop over neighbors */
201 j_index_start
= jindex
[iidx
];
202 j_index_end
= jindex
[iidx
+1];
204 /* Get outer coordinate index */
206 i_coord_offset
= DIM
*inr
;
208 /* Load i particle coords and add shift vector */
209 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
210 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
212 fix0
= _mm_setzero_pd();
213 fiy0
= _mm_setzero_pd();
214 fiz0
= _mm_setzero_pd();
215 fix1
= _mm_setzero_pd();
216 fiy1
= _mm_setzero_pd();
217 fiz1
= _mm_setzero_pd();
218 fix2
= _mm_setzero_pd();
219 fiy2
= _mm_setzero_pd();
220 fiz2
= _mm_setzero_pd();
221 fix3
= _mm_setzero_pd();
222 fiy3
= _mm_setzero_pd();
223 fiz3
= _mm_setzero_pd();
225 /* Reset potential sums */
226 velecsum
= _mm_setzero_pd();
227 vvdwsum
= _mm_setzero_pd();
229 /* Start inner kernel loop */
230 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
233 /* Get j neighbor index, and coordinate index */
236 j_coord_offsetA
= DIM
*jnrA
;
237 j_coord_offsetB
= DIM
*jnrB
;
239 /* load j atom coordinates */
240 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
241 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
242 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
244 /* Calculate displacement vector */
245 dx00
= _mm_sub_pd(ix0
,jx0
);
246 dy00
= _mm_sub_pd(iy0
,jy0
);
247 dz00
= _mm_sub_pd(iz0
,jz0
);
248 dx11
= _mm_sub_pd(ix1
,jx1
);
249 dy11
= _mm_sub_pd(iy1
,jy1
);
250 dz11
= _mm_sub_pd(iz1
,jz1
);
251 dx12
= _mm_sub_pd(ix1
,jx2
);
252 dy12
= _mm_sub_pd(iy1
,jy2
);
253 dz12
= _mm_sub_pd(iz1
,jz2
);
254 dx13
= _mm_sub_pd(ix1
,jx3
);
255 dy13
= _mm_sub_pd(iy1
,jy3
);
256 dz13
= _mm_sub_pd(iz1
,jz3
);
257 dx21
= _mm_sub_pd(ix2
,jx1
);
258 dy21
= _mm_sub_pd(iy2
,jy1
);
259 dz21
= _mm_sub_pd(iz2
,jz1
);
260 dx22
= _mm_sub_pd(ix2
,jx2
);
261 dy22
= _mm_sub_pd(iy2
,jy2
);
262 dz22
= _mm_sub_pd(iz2
,jz2
);
263 dx23
= _mm_sub_pd(ix2
,jx3
);
264 dy23
= _mm_sub_pd(iy2
,jy3
);
265 dz23
= _mm_sub_pd(iz2
,jz3
);
266 dx31
= _mm_sub_pd(ix3
,jx1
);
267 dy31
= _mm_sub_pd(iy3
,jy1
);
268 dz31
= _mm_sub_pd(iz3
,jz1
);
269 dx32
= _mm_sub_pd(ix3
,jx2
);
270 dy32
= _mm_sub_pd(iy3
,jy2
);
271 dz32
= _mm_sub_pd(iz3
,jz2
);
272 dx33
= _mm_sub_pd(ix3
,jx3
);
273 dy33
= _mm_sub_pd(iy3
,jy3
);
274 dz33
= _mm_sub_pd(iz3
,jz3
);
276 /* Calculate squared distance and things based on it */
277 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
278 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
279 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
280 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
281 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
282 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
283 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
284 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
285 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
286 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
288 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
289 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
290 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
291 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
292 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
293 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
294 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
295 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
296 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
297 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
299 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
300 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
301 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
302 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
303 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
304 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
305 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
306 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
307 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
308 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
310 fjx0
= _mm_setzero_pd();
311 fjy0
= _mm_setzero_pd();
312 fjz0
= _mm_setzero_pd();
313 fjx1
= _mm_setzero_pd();
314 fjy1
= _mm_setzero_pd();
315 fjz1
= _mm_setzero_pd();
316 fjx2
= _mm_setzero_pd();
317 fjy2
= _mm_setzero_pd();
318 fjz2
= _mm_setzero_pd();
319 fjx3
= _mm_setzero_pd();
320 fjy3
= _mm_setzero_pd();
321 fjz3
= _mm_setzero_pd();
323 /**************************
324 * CALCULATE INTERACTIONS *
325 **************************/
327 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
330 r00
= _mm_mul_pd(rsq00
,rinv00
);
332 /* LENNARD-JONES DISPERSION/REPULSION */
334 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
335 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
336 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
337 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
338 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
340 d
= _mm_sub_pd(r00
,rswitch
);
341 d
= _mm_max_pd(d
,_mm_setzero_pd());
342 d2
= _mm_mul_pd(d
,d
);
343 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
345 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
347 /* Evaluate switch function */
348 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
349 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
350 vvdw
= _mm_mul_pd(vvdw
,sw
);
351 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
355 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
359 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
361 /* Calculate temporary vectorial force */
362 tx
= _mm_mul_pd(fscal
,dx00
);
363 ty
= _mm_mul_pd(fscal
,dy00
);
364 tz
= _mm_mul_pd(fscal
,dz00
);
366 /* Update vectorial force */
367 fix0
= _mm_add_pd(fix0
,tx
);
368 fiy0
= _mm_add_pd(fiy0
,ty
);
369 fiz0
= _mm_add_pd(fiz0
,tz
);
371 fjx0
= _mm_add_pd(fjx0
,tx
);
372 fjy0
= _mm_add_pd(fjy0
,ty
);
373 fjz0
= _mm_add_pd(fjz0
,tz
);
377 /**************************
378 * CALCULATE INTERACTIONS *
379 **************************/
381 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
384 r11
= _mm_mul_pd(rsq11
,rinv11
);
386 /* EWALD ELECTROSTATICS */
388 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
389 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
390 ewitab
= _mm_cvttpd_epi32(ewrt
);
391 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
392 ewitab
= _mm_slli_epi32(ewitab
,2);
393 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
394 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
395 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
396 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
397 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
398 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
399 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
400 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
401 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
402 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
404 d
= _mm_sub_pd(r11
,rswitch
);
405 d
= _mm_max_pd(d
,_mm_setzero_pd());
406 d2
= _mm_mul_pd(d
,d
);
407 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
409 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
411 /* Evaluate switch function */
412 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
413 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
414 velec
= _mm_mul_pd(velec
,sw
);
415 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec
= _mm_and_pd(velec
,cutoff_mask
);
419 velecsum
= _mm_add_pd(velecsum
,velec
);
423 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
425 /* Calculate temporary vectorial force */
426 tx
= _mm_mul_pd(fscal
,dx11
);
427 ty
= _mm_mul_pd(fscal
,dy11
);
428 tz
= _mm_mul_pd(fscal
,dz11
);
430 /* Update vectorial force */
431 fix1
= _mm_add_pd(fix1
,tx
);
432 fiy1
= _mm_add_pd(fiy1
,ty
);
433 fiz1
= _mm_add_pd(fiz1
,tz
);
435 fjx1
= _mm_add_pd(fjx1
,tx
);
436 fjy1
= _mm_add_pd(fjy1
,ty
);
437 fjz1
= _mm_add_pd(fjz1
,tz
);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
448 r12
= _mm_mul_pd(rsq12
,rinv12
);
450 /* EWALD ELECTROSTATICS */
452 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
453 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
454 ewitab
= _mm_cvttpd_epi32(ewrt
);
455 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
456 ewitab
= _mm_slli_epi32(ewitab
,2);
457 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
458 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
459 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
460 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
461 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
462 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
463 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
464 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
465 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
466 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
468 d
= _mm_sub_pd(r12
,rswitch
);
469 d
= _mm_max_pd(d
,_mm_setzero_pd());
470 d2
= _mm_mul_pd(d
,d
);
471 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
473 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
475 /* Evaluate switch function */
476 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
477 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
478 velec
= _mm_mul_pd(velec
,sw
);
479 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velec
= _mm_and_pd(velec
,cutoff_mask
);
483 velecsum
= _mm_add_pd(velecsum
,velec
);
487 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
489 /* Calculate temporary vectorial force */
490 tx
= _mm_mul_pd(fscal
,dx12
);
491 ty
= _mm_mul_pd(fscal
,dy12
);
492 tz
= _mm_mul_pd(fscal
,dz12
);
494 /* Update vectorial force */
495 fix1
= _mm_add_pd(fix1
,tx
);
496 fiy1
= _mm_add_pd(fiy1
,ty
);
497 fiz1
= _mm_add_pd(fiz1
,tz
);
499 fjx2
= _mm_add_pd(fjx2
,tx
);
500 fjy2
= _mm_add_pd(fjy2
,ty
);
501 fjz2
= _mm_add_pd(fjz2
,tz
);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
512 r13
= _mm_mul_pd(rsq13
,rinv13
);
514 /* EWALD ELECTROSTATICS */
516 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
517 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
518 ewitab
= _mm_cvttpd_epi32(ewrt
);
519 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
520 ewitab
= _mm_slli_epi32(ewitab
,2);
521 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
522 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
523 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
524 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
525 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
526 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
527 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
528 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
529 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
530 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
532 d
= _mm_sub_pd(r13
,rswitch
);
533 d
= _mm_max_pd(d
,_mm_setzero_pd());
534 d2
= _mm_mul_pd(d
,d
);
535 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
537 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
539 /* Evaluate switch function */
540 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
541 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
542 velec
= _mm_mul_pd(velec
,sw
);
543 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velec
= _mm_and_pd(velec
,cutoff_mask
);
547 velecsum
= _mm_add_pd(velecsum
,velec
);
551 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
553 /* Calculate temporary vectorial force */
554 tx
= _mm_mul_pd(fscal
,dx13
);
555 ty
= _mm_mul_pd(fscal
,dy13
);
556 tz
= _mm_mul_pd(fscal
,dz13
);
558 /* Update vectorial force */
559 fix1
= _mm_add_pd(fix1
,tx
);
560 fiy1
= _mm_add_pd(fiy1
,ty
);
561 fiz1
= _mm_add_pd(fiz1
,tz
);
563 fjx3
= _mm_add_pd(fjx3
,tx
);
564 fjy3
= _mm_add_pd(fjy3
,ty
);
565 fjz3
= _mm_add_pd(fjz3
,tz
);
569 /**************************
570 * CALCULATE INTERACTIONS *
571 **************************/
573 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
576 r21
= _mm_mul_pd(rsq21
,rinv21
);
578 /* EWALD ELECTROSTATICS */
580 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
581 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
582 ewitab
= _mm_cvttpd_epi32(ewrt
);
583 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
584 ewitab
= _mm_slli_epi32(ewitab
,2);
585 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
586 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
587 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
588 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
589 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
590 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
591 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
592 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
593 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
594 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
596 d
= _mm_sub_pd(r21
,rswitch
);
597 d
= _mm_max_pd(d
,_mm_setzero_pd());
598 d2
= _mm_mul_pd(d
,d
);
599 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
601 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
603 /* Evaluate switch function */
604 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
605 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
606 velec
= _mm_mul_pd(velec
,sw
);
607 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velec
= _mm_and_pd(velec
,cutoff_mask
);
611 velecsum
= _mm_add_pd(velecsum
,velec
);
615 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
617 /* Calculate temporary vectorial force */
618 tx
= _mm_mul_pd(fscal
,dx21
);
619 ty
= _mm_mul_pd(fscal
,dy21
);
620 tz
= _mm_mul_pd(fscal
,dz21
);
622 /* Update vectorial force */
623 fix2
= _mm_add_pd(fix2
,tx
);
624 fiy2
= _mm_add_pd(fiy2
,ty
);
625 fiz2
= _mm_add_pd(fiz2
,tz
);
627 fjx1
= _mm_add_pd(fjx1
,tx
);
628 fjy1
= _mm_add_pd(fjy1
,ty
);
629 fjz1
= _mm_add_pd(fjz1
,tz
);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
640 r22
= _mm_mul_pd(rsq22
,rinv22
);
642 /* EWALD ELECTROSTATICS */
644 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
645 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
646 ewitab
= _mm_cvttpd_epi32(ewrt
);
647 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
648 ewitab
= _mm_slli_epi32(ewitab
,2);
649 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
650 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
651 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
652 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
653 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
654 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
655 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
656 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
657 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
658 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
660 d
= _mm_sub_pd(r22
,rswitch
);
661 d
= _mm_max_pd(d
,_mm_setzero_pd());
662 d2
= _mm_mul_pd(d
,d
);
663 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
665 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
667 /* Evaluate switch function */
668 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
669 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
670 velec
= _mm_mul_pd(velec
,sw
);
671 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
673 /* Update potential sum for this i atom from the interaction with this j atom. */
674 velec
= _mm_and_pd(velec
,cutoff_mask
);
675 velecsum
= _mm_add_pd(velecsum
,velec
);
679 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
681 /* Calculate temporary vectorial force */
682 tx
= _mm_mul_pd(fscal
,dx22
);
683 ty
= _mm_mul_pd(fscal
,dy22
);
684 tz
= _mm_mul_pd(fscal
,dz22
);
686 /* Update vectorial force */
687 fix2
= _mm_add_pd(fix2
,tx
);
688 fiy2
= _mm_add_pd(fiy2
,ty
);
689 fiz2
= _mm_add_pd(fiz2
,tz
);
691 fjx2
= _mm_add_pd(fjx2
,tx
);
692 fjy2
= _mm_add_pd(fjy2
,ty
);
693 fjz2
= _mm_add_pd(fjz2
,tz
);
697 /**************************
698 * CALCULATE INTERACTIONS *
699 **************************/
701 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
704 r23
= _mm_mul_pd(rsq23
,rinv23
);
706 /* EWALD ELECTROSTATICS */
708 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
709 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
710 ewitab
= _mm_cvttpd_epi32(ewrt
);
711 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
712 ewitab
= _mm_slli_epi32(ewitab
,2);
713 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
714 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
715 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
716 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
717 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
718 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
719 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
720 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
721 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
722 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
724 d
= _mm_sub_pd(r23
,rswitch
);
725 d
= _mm_max_pd(d
,_mm_setzero_pd());
726 d2
= _mm_mul_pd(d
,d
);
727 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
729 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
731 /* Evaluate switch function */
732 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
733 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
734 velec
= _mm_mul_pd(velec
,sw
);
735 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velec
= _mm_and_pd(velec
,cutoff_mask
);
739 velecsum
= _mm_add_pd(velecsum
,velec
);
743 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
745 /* Calculate temporary vectorial force */
746 tx
= _mm_mul_pd(fscal
,dx23
);
747 ty
= _mm_mul_pd(fscal
,dy23
);
748 tz
= _mm_mul_pd(fscal
,dz23
);
750 /* Update vectorial force */
751 fix2
= _mm_add_pd(fix2
,tx
);
752 fiy2
= _mm_add_pd(fiy2
,ty
);
753 fiz2
= _mm_add_pd(fiz2
,tz
);
755 fjx3
= _mm_add_pd(fjx3
,tx
);
756 fjy3
= _mm_add_pd(fjy3
,ty
);
757 fjz3
= _mm_add_pd(fjz3
,tz
);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
768 r31
= _mm_mul_pd(rsq31
,rinv31
);
770 /* EWALD ELECTROSTATICS */
772 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
773 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
774 ewitab
= _mm_cvttpd_epi32(ewrt
);
775 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
776 ewitab
= _mm_slli_epi32(ewitab
,2);
777 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
778 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
779 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
780 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
781 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
782 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
783 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
784 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
785 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
786 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
788 d
= _mm_sub_pd(r31
,rswitch
);
789 d
= _mm_max_pd(d
,_mm_setzero_pd());
790 d2
= _mm_mul_pd(d
,d
);
791 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
793 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
795 /* Evaluate switch function */
796 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
797 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
798 velec
= _mm_mul_pd(velec
,sw
);
799 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
801 /* Update potential sum for this i atom from the interaction with this j atom. */
802 velec
= _mm_and_pd(velec
,cutoff_mask
);
803 velecsum
= _mm_add_pd(velecsum
,velec
);
807 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
809 /* Calculate temporary vectorial force */
810 tx
= _mm_mul_pd(fscal
,dx31
);
811 ty
= _mm_mul_pd(fscal
,dy31
);
812 tz
= _mm_mul_pd(fscal
,dz31
);
814 /* Update vectorial force */
815 fix3
= _mm_add_pd(fix3
,tx
);
816 fiy3
= _mm_add_pd(fiy3
,ty
);
817 fiz3
= _mm_add_pd(fiz3
,tz
);
819 fjx1
= _mm_add_pd(fjx1
,tx
);
820 fjy1
= _mm_add_pd(fjy1
,ty
);
821 fjz1
= _mm_add_pd(fjz1
,tz
);
825 /**************************
826 * CALCULATE INTERACTIONS *
827 **************************/
829 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
832 r32
= _mm_mul_pd(rsq32
,rinv32
);
834 /* EWALD ELECTROSTATICS */
836 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
837 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
838 ewitab
= _mm_cvttpd_epi32(ewrt
);
839 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
840 ewitab
= _mm_slli_epi32(ewitab
,2);
841 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
842 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
843 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
844 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
845 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
846 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
847 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
848 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
849 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
850 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
852 d
= _mm_sub_pd(r32
,rswitch
);
853 d
= _mm_max_pd(d
,_mm_setzero_pd());
854 d2
= _mm_mul_pd(d
,d
);
855 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
857 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
859 /* Evaluate switch function */
860 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
861 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
862 velec
= _mm_mul_pd(velec
,sw
);
863 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
865 /* Update potential sum for this i atom from the interaction with this j atom. */
866 velec
= _mm_and_pd(velec
,cutoff_mask
);
867 velecsum
= _mm_add_pd(velecsum
,velec
);
871 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
873 /* Calculate temporary vectorial force */
874 tx
= _mm_mul_pd(fscal
,dx32
);
875 ty
= _mm_mul_pd(fscal
,dy32
);
876 tz
= _mm_mul_pd(fscal
,dz32
);
878 /* Update vectorial force */
879 fix3
= _mm_add_pd(fix3
,tx
);
880 fiy3
= _mm_add_pd(fiy3
,ty
);
881 fiz3
= _mm_add_pd(fiz3
,tz
);
883 fjx2
= _mm_add_pd(fjx2
,tx
);
884 fjy2
= _mm_add_pd(fjy2
,ty
);
885 fjz2
= _mm_add_pd(fjz2
,tz
);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
896 r33
= _mm_mul_pd(rsq33
,rinv33
);
898 /* EWALD ELECTROSTATICS */
900 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
901 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
902 ewitab
= _mm_cvttpd_epi32(ewrt
);
903 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
904 ewitab
= _mm_slli_epi32(ewitab
,2);
905 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
906 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
907 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
908 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
909 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
910 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
911 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
912 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
913 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
914 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
916 d
= _mm_sub_pd(r33
,rswitch
);
917 d
= _mm_max_pd(d
,_mm_setzero_pd());
918 d2
= _mm_mul_pd(d
,d
);
919 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
921 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
923 /* Evaluate switch function */
924 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
925 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
926 velec
= _mm_mul_pd(velec
,sw
);
927 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec
= _mm_and_pd(velec
,cutoff_mask
);
931 velecsum
= _mm_add_pd(velecsum
,velec
);
935 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
937 /* Calculate temporary vectorial force */
938 tx
= _mm_mul_pd(fscal
,dx33
);
939 ty
= _mm_mul_pd(fscal
,dy33
);
940 tz
= _mm_mul_pd(fscal
,dz33
);
942 /* Update vectorial force */
943 fix3
= _mm_add_pd(fix3
,tx
);
944 fiy3
= _mm_add_pd(fiy3
,ty
);
945 fiz3
= _mm_add_pd(fiz3
,tz
);
947 fjx3
= _mm_add_pd(fjx3
,tx
);
948 fjy3
= _mm_add_pd(fjy3
,ty
);
949 fjz3
= _mm_add_pd(fjz3
,tz
);
953 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
955 /* Inner loop uses 647 flops */
962 j_coord_offsetA
= DIM
*jnrA
;
964 /* load j atom coordinates */
965 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
966 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
967 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
969 /* Calculate displacement vector */
970 dx00
= _mm_sub_pd(ix0
,jx0
);
971 dy00
= _mm_sub_pd(iy0
,jy0
);
972 dz00
= _mm_sub_pd(iz0
,jz0
);
973 dx11
= _mm_sub_pd(ix1
,jx1
);
974 dy11
= _mm_sub_pd(iy1
,jy1
);
975 dz11
= _mm_sub_pd(iz1
,jz1
);
976 dx12
= _mm_sub_pd(ix1
,jx2
);
977 dy12
= _mm_sub_pd(iy1
,jy2
);
978 dz12
= _mm_sub_pd(iz1
,jz2
);
979 dx13
= _mm_sub_pd(ix1
,jx3
);
980 dy13
= _mm_sub_pd(iy1
,jy3
);
981 dz13
= _mm_sub_pd(iz1
,jz3
);
982 dx21
= _mm_sub_pd(ix2
,jx1
);
983 dy21
= _mm_sub_pd(iy2
,jy1
);
984 dz21
= _mm_sub_pd(iz2
,jz1
);
985 dx22
= _mm_sub_pd(ix2
,jx2
);
986 dy22
= _mm_sub_pd(iy2
,jy2
);
987 dz22
= _mm_sub_pd(iz2
,jz2
);
988 dx23
= _mm_sub_pd(ix2
,jx3
);
989 dy23
= _mm_sub_pd(iy2
,jy3
);
990 dz23
= _mm_sub_pd(iz2
,jz3
);
991 dx31
= _mm_sub_pd(ix3
,jx1
);
992 dy31
= _mm_sub_pd(iy3
,jy1
);
993 dz31
= _mm_sub_pd(iz3
,jz1
);
994 dx32
= _mm_sub_pd(ix3
,jx2
);
995 dy32
= _mm_sub_pd(iy3
,jy2
);
996 dz32
= _mm_sub_pd(iz3
,jz2
);
997 dx33
= _mm_sub_pd(ix3
,jx3
);
998 dy33
= _mm_sub_pd(iy3
,jy3
);
999 dz33
= _mm_sub_pd(iz3
,jz3
);
1001 /* Calculate squared distance and things based on it */
1002 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1003 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1004 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1005 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1006 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1007 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1008 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1009 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1010 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1011 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1013 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1014 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1015 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1016 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1017 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1018 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1019 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1020 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1021 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1022 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1024 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1025 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1026 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1027 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1028 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1029 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1030 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1031 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1032 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1033 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1035 fjx0
= _mm_setzero_pd();
1036 fjy0
= _mm_setzero_pd();
1037 fjz0
= _mm_setzero_pd();
1038 fjx1
= _mm_setzero_pd();
1039 fjy1
= _mm_setzero_pd();
1040 fjz1
= _mm_setzero_pd();
1041 fjx2
= _mm_setzero_pd();
1042 fjy2
= _mm_setzero_pd();
1043 fjz2
= _mm_setzero_pd();
1044 fjx3
= _mm_setzero_pd();
1045 fjy3
= _mm_setzero_pd();
1046 fjz3
= _mm_setzero_pd();
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1055 r00
= _mm_mul_pd(rsq00
,rinv00
);
1057 /* LENNARD-JONES DISPERSION/REPULSION */
1059 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1060 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1061 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1062 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1063 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1065 d
= _mm_sub_pd(r00
,rswitch
);
1066 d
= _mm_max_pd(d
,_mm_setzero_pd());
1067 d2
= _mm_mul_pd(d
,d
);
1068 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1070 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1072 /* Evaluate switch function */
1073 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1074 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1075 vvdw
= _mm_mul_pd(vvdw
,sw
);
1076 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
1080 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
1081 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
1085 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1087 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1089 /* Calculate temporary vectorial force */
1090 tx
= _mm_mul_pd(fscal
,dx00
);
1091 ty
= _mm_mul_pd(fscal
,dy00
);
1092 tz
= _mm_mul_pd(fscal
,dz00
);
1094 /* Update vectorial force */
1095 fix0
= _mm_add_pd(fix0
,tx
);
1096 fiy0
= _mm_add_pd(fiy0
,ty
);
1097 fiz0
= _mm_add_pd(fiz0
,tz
);
1099 fjx0
= _mm_add_pd(fjx0
,tx
);
1100 fjy0
= _mm_add_pd(fjy0
,ty
);
1101 fjz0
= _mm_add_pd(fjz0
,tz
);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1112 r11
= _mm_mul_pd(rsq11
,rinv11
);
1114 /* EWALD ELECTROSTATICS */
1116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1117 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1118 ewitab
= _mm_cvttpd_epi32(ewrt
);
1119 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1120 ewitab
= _mm_slli_epi32(ewitab
,2);
1121 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1122 ewtabD
= _mm_setzero_pd();
1123 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1124 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1125 ewtabFn
= _mm_setzero_pd();
1126 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1127 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1128 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1129 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
1130 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1132 d
= _mm_sub_pd(r11
,rswitch
);
1133 d
= _mm_max_pd(d
,_mm_setzero_pd());
1134 d2
= _mm_mul_pd(d
,d
);
1135 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1137 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1139 /* Evaluate switch function */
1140 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1141 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
1142 velec
= _mm_mul_pd(velec
,sw
);
1143 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1145 /* Update potential sum for this i atom from the interaction with this j atom. */
1146 velec
= _mm_and_pd(velec
,cutoff_mask
);
1147 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1148 velecsum
= _mm_add_pd(velecsum
,velec
);
1152 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1154 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1156 /* Calculate temporary vectorial force */
1157 tx
= _mm_mul_pd(fscal
,dx11
);
1158 ty
= _mm_mul_pd(fscal
,dy11
);
1159 tz
= _mm_mul_pd(fscal
,dz11
);
1161 /* Update vectorial force */
1162 fix1
= _mm_add_pd(fix1
,tx
);
1163 fiy1
= _mm_add_pd(fiy1
,ty
);
1164 fiz1
= _mm_add_pd(fiz1
,tz
);
1166 fjx1
= _mm_add_pd(fjx1
,tx
);
1167 fjy1
= _mm_add_pd(fjy1
,ty
);
1168 fjz1
= _mm_add_pd(fjz1
,tz
);
1172 /**************************
1173 * CALCULATE INTERACTIONS *
1174 **************************/
1176 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1179 r12
= _mm_mul_pd(rsq12
,rinv12
);
1181 /* EWALD ELECTROSTATICS */
1183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1185 ewitab
= _mm_cvttpd_epi32(ewrt
);
1186 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1187 ewitab
= _mm_slli_epi32(ewitab
,2);
1188 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1189 ewtabD
= _mm_setzero_pd();
1190 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1191 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1192 ewtabFn
= _mm_setzero_pd();
1193 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1194 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1195 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1196 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
1197 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1199 d
= _mm_sub_pd(r12
,rswitch
);
1200 d
= _mm_max_pd(d
,_mm_setzero_pd());
1201 d2
= _mm_mul_pd(d
,d
);
1202 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1204 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1206 /* Evaluate switch function */
1207 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
1209 velec
= _mm_mul_pd(velec
,sw
);
1210 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1212 /* Update potential sum for this i atom from the interaction with this j atom. */
1213 velec
= _mm_and_pd(velec
,cutoff_mask
);
1214 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1215 velecsum
= _mm_add_pd(velecsum
,velec
);
1219 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1221 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1223 /* Calculate temporary vectorial force */
1224 tx
= _mm_mul_pd(fscal
,dx12
);
1225 ty
= _mm_mul_pd(fscal
,dy12
);
1226 tz
= _mm_mul_pd(fscal
,dz12
);
1228 /* Update vectorial force */
1229 fix1
= _mm_add_pd(fix1
,tx
);
1230 fiy1
= _mm_add_pd(fiy1
,ty
);
1231 fiz1
= _mm_add_pd(fiz1
,tz
);
1233 fjx2
= _mm_add_pd(fjx2
,tx
);
1234 fjy2
= _mm_add_pd(fjy2
,ty
);
1235 fjz2
= _mm_add_pd(fjz2
,tz
);
1239 /**************************
1240 * CALCULATE INTERACTIONS *
1241 **************************/
1243 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1246 r13
= _mm_mul_pd(rsq13
,rinv13
);
1248 /* EWALD ELECTROSTATICS */
1250 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1251 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1252 ewitab
= _mm_cvttpd_epi32(ewrt
);
1253 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1254 ewitab
= _mm_slli_epi32(ewitab
,2);
1255 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1256 ewtabD
= _mm_setzero_pd();
1257 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1258 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1259 ewtabFn
= _mm_setzero_pd();
1260 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1261 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1262 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1263 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
1264 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1266 d
= _mm_sub_pd(r13
,rswitch
);
1267 d
= _mm_max_pd(d
,_mm_setzero_pd());
1268 d2
= _mm_mul_pd(d
,d
);
1269 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1271 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1273 /* Evaluate switch function */
1274 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1275 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
1276 velec
= _mm_mul_pd(velec
,sw
);
1277 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1279 /* Update potential sum for this i atom from the interaction with this j atom. */
1280 velec
= _mm_and_pd(velec
,cutoff_mask
);
1281 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1282 velecsum
= _mm_add_pd(velecsum
,velec
);
1286 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1288 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1290 /* Calculate temporary vectorial force */
1291 tx
= _mm_mul_pd(fscal
,dx13
);
1292 ty
= _mm_mul_pd(fscal
,dy13
);
1293 tz
= _mm_mul_pd(fscal
,dz13
);
1295 /* Update vectorial force */
1296 fix1
= _mm_add_pd(fix1
,tx
);
1297 fiy1
= _mm_add_pd(fiy1
,ty
);
1298 fiz1
= _mm_add_pd(fiz1
,tz
);
1300 fjx3
= _mm_add_pd(fjx3
,tx
);
1301 fjy3
= _mm_add_pd(fjy3
,ty
);
1302 fjz3
= _mm_add_pd(fjz3
,tz
);
1306 /**************************
1307 * CALCULATE INTERACTIONS *
1308 **************************/
1310 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1313 r21
= _mm_mul_pd(rsq21
,rinv21
);
1315 /* EWALD ELECTROSTATICS */
1317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1318 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1319 ewitab
= _mm_cvttpd_epi32(ewrt
);
1320 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1321 ewitab
= _mm_slli_epi32(ewitab
,2);
1322 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1323 ewtabD
= _mm_setzero_pd();
1324 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1325 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1326 ewtabFn
= _mm_setzero_pd();
1327 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1328 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1329 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1330 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
1331 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1333 d
= _mm_sub_pd(r21
,rswitch
);
1334 d
= _mm_max_pd(d
,_mm_setzero_pd());
1335 d2
= _mm_mul_pd(d
,d
);
1336 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1338 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1340 /* Evaluate switch function */
1341 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1342 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
1343 velec
= _mm_mul_pd(velec
,sw
);
1344 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1346 /* Update potential sum for this i atom from the interaction with this j atom. */
1347 velec
= _mm_and_pd(velec
,cutoff_mask
);
1348 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1349 velecsum
= _mm_add_pd(velecsum
,velec
);
1353 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1355 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1357 /* Calculate temporary vectorial force */
1358 tx
= _mm_mul_pd(fscal
,dx21
);
1359 ty
= _mm_mul_pd(fscal
,dy21
);
1360 tz
= _mm_mul_pd(fscal
,dz21
);
1362 /* Update vectorial force */
1363 fix2
= _mm_add_pd(fix2
,tx
);
1364 fiy2
= _mm_add_pd(fiy2
,ty
);
1365 fiz2
= _mm_add_pd(fiz2
,tz
);
1367 fjx1
= _mm_add_pd(fjx1
,tx
);
1368 fjy1
= _mm_add_pd(fjy1
,ty
);
1369 fjz1
= _mm_add_pd(fjz1
,tz
);
1373 /**************************
1374 * CALCULATE INTERACTIONS *
1375 **************************/
1377 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1380 r22
= _mm_mul_pd(rsq22
,rinv22
);
1382 /* EWALD ELECTROSTATICS */
1384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1385 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1386 ewitab
= _mm_cvttpd_epi32(ewrt
);
1387 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1388 ewitab
= _mm_slli_epi32(ewitab
,2);
1389 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1390 ewtabD
= _mm_setzero_pd();
1391 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1392 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1393 ewtabFn
= _mm_setzero_pd();
1394 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1395 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1396 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1397 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
1398 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1400 d
= _mm_sub_pd(r22
,rswitch
);
1401 d
= _mm_max_pd(d
,_mm_setzero_pd());
1402 d2
= _mm_mul_pd(d
,d
);
1403 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1405 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1407 /* Evaluate switch function */
1408 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1409 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
1410 velec
= _mm_mul_pd(velec
,sw
);
1411 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1413 /* Update potential sum for this i atom from the interaction with this j atom. */
1414 velec
= _mm_and_pd(velec
,cutoff_mask
);
1415 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1416 velecsum
= _mm_add_pd(velecsum
,velec
);
1420 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1422 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1424 /* Calculate temporary vectorial force */
1425 tx
= _mm_mul_pd(fscal
,dx22
);
1426 ty
= _mm_mul_pd(fscal
,dy22
);
1427 tz
= _mm_mul_pd(fscal
,dz22
);
1429 /* Update vectorial force */
1430 fix2
= _mm_add_pd(fix2
,tx
);
1431 fiy2
= _mm_add_pd(fiy2
,ty
);
1432 fiz2
= _mm_add_pd(fiz2
,tz
);
1434 fjx2
= _mm_add_pd(fjx2
,tx
);
1435 fjy2
= _mm_add_pd(fjy2
,ty
);
1436 fjz2
= _mm_add_pd(fjz2
,tz
);
1440 /**************************
1441 * CALCULATE INTERACTIONS *
1442 **************************/
1444 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1447 r23
= _mm_mul_pd(rsq23
,rinv23
);
1449 /* EWALD ELECTROSTATICS */
1451 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1452 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1453 ewitab
= _mm_cvttpd_epi32(ewrt
);
1454 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1455 ewitab
= _mm_slli_epi32(ewitab
,2);
1456 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1457 ewtabD
= _mm_setzero_pd();
1458 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1459 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1460 ewtabFn
= _mm_setzero_pd();
1461 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1462 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1463 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1464 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
1465 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1467 d
= _mm_sub_pd(r23
,rswitch
);
1468 d
= _mm_max_pd(d
,_mm_setzero_pd());
1469 d2
= _mm_mul_pd(d
,d
);
1470 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1472 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1474 /* Evaluate switch function */
1475 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1476 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
1477 velec
= _mm_mul_pd(velec
,sw
);
1478 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1480 /* Update potential sum for this i atom from the interaction with this j atom. */
1481 velec
= _mm_and_pd(velec
,cutoff_mask
);
1482 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1483 velecsum
= _mm_add_pd(velecsum
,velec
);
1487 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1489 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1491 /* Calculate temporary vectorial force */
1492 tx
= _mm_mul_pd(fscal
,dx23
);
1493 ty
= _mm_mul_pd(fscal
,dy23
);
1494 tz
= _mm_mul_pd(fscal
,dz23
);
1496 /* Update vectorial force */
1497 fix2
= _mm_add_pd(fix2
,tx
);
1498 fiy2
= _mm_add_pd(fiy2
,ty
);
1499 fiz2
= _mm_add_pd(fiz2
,tz
);
1501 fjx3
= _mm_add_pd(fjx3
,tx
);
1502 fjy3
= _mm_add_pd(fjy3
,ty
);
1503 fjz3
= _mm_add_pd(fjz3
,tz
);
1507 /**************************
1508 * CALCULATE INTERACTIONS *
1509 **************************/
1511 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1514 r31
= _mm_mul_pd(rsq31
,rinv31
);
1516 /* EWALD ELECTROSTATICS */
1518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1519 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1520 ewitab
= _mm_cvttpd_epi32(ewrt
);
1521 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1522 ewitab
= _mm_slli_epi32(ewitab
,2);
1523 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1524 ewtabD
= _mm_setzero_pd();
1525 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1526 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1527 ewtabFn
= _mm_setzero_pd();
1528 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1529 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1530 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1531 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
1532 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1534 d
= _mm_sub_pd(r31
,rswitch
);
1535 d
= _mm_max_pd(d
,_mm_setzero_pd());
1536 d2
= _mm_mul_pd(d
,d
);
1537 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1539 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1541 /* Evaluate switch function */
1542 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1543 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
1544 velec
= _mm_mul_pd(velec
,sw
);
1545 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1547 /* Update potential sum for this i atom from the interaction with this j atom. */
1548 velec
= _mm_and_pd(velec
,cutoff_mask
);
1549 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1550 velecsum
= _mm_add_pd(velecsum
,velec
);
1554 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1556 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1558 /* Calculate temporary vectorial force */
1559 tx
= _mm_mul_pd(fscal
,dx31
);
1560 ty
= _mm_mul_pd(fscal
,dy31
);
1561 tz
= _mm_mul_pd(fscal
,dz31
);
1563 /* Update vectorial force */
1564 fix3
= _mm_add_pd(fix3
,tx
);
1565 fiy3
= _mm_add_pd(fiy3
,ty
);
1566 fiz3
= _mm_add_pd(fiz3
,tz
);
1568 fjx1
= _mm_add_pd(fjx1
,tx
);
1569 fjy1
= _mm_add_pd(fjy1
,ty
);
1570 fjz1
= _mm_add_pd(fjz1
,tz
);
1574 /**************************
1575 * CALCULATE INTERACTIONS *
1576 **************************/
1578 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1581 r32
= _mm_mul_pd(rsq32
,rinv32
);
1583 /* EWALD ELECTROSTATICS */
1585 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1586 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1587 ewitab
= _mm_cvttpd_epi32(ewrt
);
1588 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1589 ewitab
= _mm_slli_epi32(ewitab
,2);
1590 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1591 ewtabD
= _mm_setzero_pd();
1592 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1593 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1594 ewtabFn
= _mm_setzero_pd();
1595 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1596 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1597 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1598 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
1599 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1601 d
= _mm_sub_pd(r32
,rswitch
);
1602 d
= _mm_max_pd(d
,_mm_setzero_pd());
1603 d2
= _mm_mul_pd(d
,d
);
1604 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1606 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1608 /* Evaluate switch function */
1609 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1610 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
1611 velec
= _mm_mul_pd(velec
,sw
);
1612 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1614 /* Update potential sum for this i atom from the interaction with this j atom. */
1615 velec
= _mm_and_pd(velec
,cutoff_mask
);
1616 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1617 velecsum
= _mm_add_pd(velecsum
,velec
);
1621 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1623 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1625 /* Calculate temporary vectorial force */
1626 tx
= _mm_mul_pd(fscal
,dx32
);
1627 ty
= _mm_mul_pd(fscal
,dy32
);
1628 tz
= _mm_mul_pd(fscal
,dz32
);
1630 /* Update vectorial force */
1631 fix3
= _mm_add_pd(fix3
,tx
);
1632 fiy3
= _mm_add_pd(fiy3
,ty
);
1633 fiz3
= _mm_add_pd(fiz3
,tz
);
1635 fjx2
= _mm_add_pd(fjx2
,tx
);
1636 fjy2
= _mm_add_pd(fjy2
,ty
);
1637 fjz2
= _mm_add_pd(fjz2
,tz
);
1641 /**************************
1642 * CALCULATE INTERACTIONS *
1643 **************************/
1645 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1648 r33
= _mm_mul_pd(rsq33
,rinv33
);
1650 /* EWALD ELECTROSTATICS */
1652 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1653 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1654 ewitab
= _mm_cvttpd_epi32(ewrt
);
1655 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1656 ewitab
= _mm_slli_epi32(ewitab
,2);
1657 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1658 ewtabD
= _mm_setzero_pd();
1659 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1660 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1661 ewtabFn
= _mm_setzero_pd();
1662 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1663 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1664 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1665 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
1666 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1668 d
= _mm_sub_pd(r33
,rswitch
);
1669 d
= _mm_max_pd(d
,_mm_setzero_pd());
1670 d2
= _mm_mul_pd(d
,d
);
1671 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1673 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1675 /* Evaluate switch function */
1676 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1677 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
1678 velec
= _mm_mul_pd(velec
,sw
);
1679 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1681 /* Update potential sum for this i atom from the interaction with this j atom. */
1682 velec
= _mm_and_pd(velec
,cutoff_mask
);
1683 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1684 velecsum
= _mm_add_pd(velecsum
,velec
);
1688 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1690 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1692 /* Calculate temporary vectorial force */
1693 tx
= _mm_mul_pd(fscal
,dx33
);
1694 ty
= _mm_mul_pd(fscal
,dy33
);
1695 tz
= _mm_mul_pd(fscal
,dz33
);
1697 /* Update vectorial force */
1698 fix3
= _mm_add_pd(fix3
,tx
);
1699 fiy3
= _mm_add_pd(fiy3
,ty
);
1700 fiz3
= _mm_add_pd(fiz3
,tz
);
1702 fjx3
= _mm_add_pd(fjx3
,tx
);
1703 fjy3
= _mm_add_pd(fjy3
,ty
);
1704 fjz3
= _mm_add_pd(fjz3
,tz
);
1708 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1710 /* Inner loop uses 647 flops */
1713 /* End of innermost loop */
1715 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1716 f
+i_coord_offset
,fshift
+i_shift_offset
);
1719 /* Update potential energies */
1720 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1721 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1723 /* Increment number of inner iterations */
1724 inneriter
+= j_index_end
- j_index_start
;
1726 /* Outer loop uses 26 flops */
1729 /* Increment number of outer iterations */
1732 /* Update outer/inner flops */
1734 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*647);
1737 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1738 * Electrostatics interaction: Ewald
1739 * VdW interaction: LennardJones
1740 * Geometry: Water4-Water4
1741 * Calculate force/pot: Force
1744 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_double
1745 (t_nblist
* gmx_restrict nlist
,
1746 rvec
* gmx_restrict xx
,
1747 rvec
* gmx_restrict ff
,
1748 t_forcerec
* gmx_restrict fr
,
1749 t_mdatoms
* gmx_restrict mdatoms
,
1750 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1751 t_nrnb
* gmx_restrict nrnb
)
1753 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1754 * just 0 for non-waters.
1755 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1756 * jnr indices corresponding to data put in the four positions in the SIMD register.
1758 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1759 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1761 int j_coord_offsetA
,j_coord_offsetB
;
1762 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1763 real rcutoff_scalar
;
1764 real
*shiftvec
,*fshift
,*x
,*f
;
1765 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1767 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1769 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1771 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1773 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1774 int vdwjidx0A
,vdwjidx0B
;
1775 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1776 int vdwjidx1A
,vdwjidx1B
;
1777 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1778 int vdwjidx2A
,vdwjidx2B
;
1779 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1780 int vdwjidx3A
,vdwjidx3B
;
1781 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1782 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1783 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1784 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1785 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1786 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1787 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1788 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1789 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1790 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1791 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1792 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1795 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1798 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1799 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1801 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1803 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
1804 real rswitch_scalar
,d_scalar
;
1805 __m128d dummy_mask
,cutoff_mask
;
1806 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1807 __m128d one
= _mm_set1_pd(1.0);
1808 __m128d two
= _mm_set1_pd(2.0);
1814 jindex
= nlist
->jindex
;
1816 shiftidx
= nlist
->shift
;
1818 shiftvec
= fr
->shift_vec
[0];
1819 fshift
= fr
->fshift
[0];
1820 facel
= _mm_set1_pd(fr
->epsfac
);
1821 charge
= mdatoms
->chargeA
;
1822 nvdwtype
= fr
->ntype
;
1823 vdwparam
= fr
->nbfp
;
1824 vdwtype
= mdatoms
->typeA
;
1826 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1827 ewtab
= fr
->ic
->tabq_coul_FDV0
;
1828 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1829 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1831 /* Setup water-specific parameters */
1832 inr
= nlist
->iinr
[0];
1833 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1834 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1835 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1836 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1838 jq1
= _mm_set1_pd(charge
[inr
+1]);
1839 jq2
= _mm_set1_pd(charge
[inr
+2]);
1840 jq3
= _mm_set1_pd(charge
[inr
+3]);
1841 vdwjidx0A
= 2*vdwtype
[inr
+0];
1842 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1843 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1844 qq11
= _mm_mul_pd(iq1
,jq1
);
1845 qq12
= _mm_mul_pd(iq1
,jq2
);
1846 qq13
= _mm_mul_pd(iq1
,jq3
);
1847 qq21
= _mm_mul_pd(iq2
,jq1
);
1848 qq22
= _mm_mul_pd(iq2
,jq2
);
1849 qq23
= _mm_mul_pd(iq2
,jq3
);
1850 qq31
= _mm_mul_pd(iq3
,jq1
);
1851 qq32
= _mm_mul_pd(iq3
,jq2
);
1852 qq33
= _mm_mul_pd(iq3
,jq3
);
1854 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1855 rcutoff_scalar
= fr
->rcoulomb
;
1856 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1857 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1859 rswitch_scalar
= fr
->rcoulomb_switch
;
1860 rswitch
= _mm_set1_pd(rswitch_scalar
);
1861 /* Setup switch parameters */
1862 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
1863 d
= _mm_set1_pd(d_scalar
);
1864 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
1865 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1866 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1867 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
1868 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1869 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1871 /* Avoid stupid compiler warnings */
1873 j_coord_offsetA
= 0;
1874 j_coord_offsetB
= 0;
1879 /* Start outer loop over neighborlists */
1880 for(iidx
=0; iidx
<nri
; iidx
++)
1882 /* Load shift vector for this list */
1883 i_shift_offset
= DIM
*shiftidx
[iidx
];
1885 /* Load limits for loop over neighbors */
1886 j_index_start
= jindex
[iidx
];
1887 j_index_end
= jindex
[iidx
+1];
1889 /* Get outer coordinate index */
1891 i_coord_offset
= DIM
*inr
;
1893 /* Load i particle coords and add shift vector */
1894 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1895 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1897 fix0
= _mm_setzero_pd();
1898 fiy0
= _mm_setzero_pd();
1899 fiz0
= _mm_setzero_pd();
1900 fix1
= _mm_setzero_pd();
1901 fiy1
= _mm_setzero_pd();
1902 fiz1
= _mm_setzero_pd();
1903 fix2
= _mm_setzero_pd();
1904 fiy2
= _mm_setzero_pd();
1905 fiz2
= _mm_setzero_pd();
1906 fix3
= _mm_setzero_pd();
1907 fiy3
= _mm_setzero_pd();
1908 fiz3
= _mm_setzero_pd();
1910 /* Start inner kernel loop */
1911 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1914 /* Get j neighbor index, and coordinate index */
1916 jnrB
= jjnr
[jidx
+1];
1917 j_coord_offsetA
= DIM
*jnrA
;
1918 j_coord_offsetB
= DIM
*jnrB
;
1920 /* load j atom coordinates */
1921 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1922 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1923 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1925 /* Calculate displacement vector */
1926 dx00
= _mm_sub_pd(ix0
,jx0
);
1927 dy00
= _mm_sub_pd(iy0
,jy0
);
1928 dz00
= _mm_sub_pd(iz0
,jz0
);
1929 dx11
= _mm_sub_pd(ix1
,jx1
);
1930 dy11
= _mm_sub_pd(iy1
,jy1
);
1931 dz11
= _mm_sub_pd(iz1
,jz1
);
1932 dx12
= _mm_sub_pd(ix1
,jx2
);
1933 dy12
= _mm_sub_pd(iy1
,jy2
);
1934 dz12
= _mm_sub_pd(iz1
,jz2
);
1935 dx13
= _mm_sub_pd(ix1
,jx3
);
1936 dy13
= _mm_sub_pd(iy1
,jy3
);
1937 dz13
= _mm_sub_pd(iz1
,jz3
);
1938 dx21
= _mm_sub_pd(ix2
,jx1
);
1939 dy21
= _mm_sub_pd(iy2
,jy1
);
1940 dz21
= _mm_sub_pd(iz2
,jz1
);
1941 dx22
= _mm_sub_pd(ix2
,jx2
);
1942 dy22
= _mm_sub_pd(iy2
,jy2
);
1943 dz22
= _mm_sub_pd(iz2
,jz2
);
1944 dx23
= _mm_sub_pd(ix2
,jx3
);
1945 dy23
= _mm_sub_pd(iy2
,jy3
);
1946 dz23
= _mm_sub_pd(iz2
,jz3
);
1947 dx31
= _mm_sub_pd(ix3
,jx1
);
1948 dy31
= _mm_sub_pd(iy3
,jy1
);
1949 dz31
= _mm_sub_pd(iz3
,jz1
);
1950 dx32
= _mm_sub_pd(ix3
,jx2
);
1951 dy32
= _mm_sub_pd(iy3
,jy2
);
1952 dz32
= _mm_sub_pd(iz3
,jz2
);
1953 dx33
= _mm_sub_pd(ix3
,jx3
);
1954 dy33
= _mm_sub_pd(iy3
,jy3
);
1955 dz33
= _mm_sub_pd(iz3
,jz3
);
1957 /* Calculate squared distance and things based on it */
1958 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1959 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1960 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1961 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1962 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1963 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1964 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1965 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1966 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1967 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1969 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1970 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1971 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1972 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1973 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1974 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1975 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1976 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1977 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1978 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1980 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1981 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1982 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1983 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1984 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1985 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1986 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1987 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1988 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1989 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1991 fjx0
= _mm_setzero_pd();
1992 fjy0
= _mm_setzero_pd();
1993 fjz0
= _mm_setzero_pd();
1994 fjx1
= _mm_setzero_pd();
1995 fjy1
= _mm_setzero_pd();
1996 fjz1
= _mm_setzero_pd();
1997 fjx2
= _mm_setzero_pd();
1998 fjy2
= _mm_setzero_pd();
1999 fjz2
= _mm_setzero_pd();
2000 fjx3
= _mm_setzero_pd();
2001 fjy3
= _mm_setzero_pd();
2002 fjz3
= _mm_setzero_pd();
2004 /**************************
2005 * CALCULATE INTERACTIONS *
2006 **************************/
2008 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2011 r00
= _mm_mul_pd(rsq00
,rinv00
);
2013 /* LENNARD-JONES DISPERSION/REPULSION */
2015 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2016 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
2017 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
2018 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
2019 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
2021 d
= _mm_sub_pd(r00
,rswitch
);
2022 d
= _mm_max_pd(d
,_mm_setzero_pd());
2023 d2
= _mm_mul_pd(d
,d
);
2024 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2026 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2028 /* Evaluate switch function */
2029 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2030 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
2031 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2035 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2037 /* Calculate temporary vectorial force */
2038 tx
= _mm_mul_pd(fscal
,dx00
);
2039 ty
= _mm_mul_pd(fscal
,dy00
);
2040 tz
= _mm_mul_pd(fscal
,dz00
);
2042 /* Update vectorial force */
2043 fix0
= _mm_add_pd(fix0
,tx
);
2044 fiy0
= _mm_add_pd(fiy0
,ty
);
2045 fiz0
= _mm_add_pd(fiz0
,tz
);
2047 fjx0
= _mm_add_pd(fjx0
,tx
);
2048 fjy0
= _mm_add_pd(fjy0
,ty
);
2049 fjz0
= _mm_add_pd(fjz0
,tz
);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2060 r11
= _mm_mul_pd(rsq11
,rinv11
);
2062 /* EWALD ELECTROSTATICS */
2064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2066 ewitab
= _mm_cvttpd_epi32(ewrt
);
2067 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2068 ewitab
= _mm_slli_epi32(ewitab
,2);
2069 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2070 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2071 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2072 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2073 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2074 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2075 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2076 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2077 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
2078 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2080 d
= _mm_sub_pd(r11
,rswitch
);
2081 d
= _mm_max_pd(d
,_mm_setzero_pd());
2082 d2
= _mm_mul_pd(d
,d
);
2083 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2085 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2087 /* Evaluate switch function */
2088 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2089 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
2090 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2094 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2096 /* Calculate temporary vectorial force */
2097 tx
= _mm_mul_pd(fscal
,dx11
);
2098 ty
= _mm_mul_pd(fscal
,dy11
);
2099 tz
= _mm_mul_pd(fscal
,dz11
);
2101 /* Update vectorial force */
2102 fix1
= _mm_add_pd(fix1
,tx
);
2103 fiy1
= _mm_add_pd(fiy1
,ty
);
2104 fiz1
= _mm_add_pd(fiz1
,tz
);
2106 fjx1
= _mm_add_pd(fjx1
,tx
);
2107 fjy1
= _mm_add_pd(fjy1
,ty
);
2108 fjz1
= _mm_add_pd(fjz1
,tz
);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2119 r12
= _mm_mul_pd(rsq12
,rinv12
);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2125 ewitab
= _mm_cvttpd_epi32(ewrt
);
2126 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2127 ewitab
= _mm_slli_epi32(ewitab
,2);
2128 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2129 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2130 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2131 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2132 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2133 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2134 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2135 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2136 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
2137 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2139 d
= _mm_sub_pd(r12
,rswitch
);
2140 d
= _mm_max_pd(d
,_mm_setzero_pd());
2141 d2
= _mm_mul_pd(d
,d
);
2142 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2144 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2146 /* Evaluate switch function */
2147 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2148 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
2149 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2153 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2155 /* Calculate temporary vectorial force */
2156 tx
= _mm_mul_pd(fscal
,dx12
);
2157 ty
= _mm_mul_pd(fscal
,dy12
);
2158 tz
= _mm_mul_pd(fscal
,dz12
);
2160 /* Update vectorial force */
2161 fix1
= _mm_add_pd(fix1
,tx
);
2162 fiy1
= _mm_add_pd(fiy1
,ty
);
2163 fiz1
= _mm_add_pd(fiz1
,tz
);
2165 fjx2
= _mm_add_pd(fjx2
,tx
);
2166 fjy2
= _mm_add_pd(fjy2
,ty
);
2167 fjz2
= _mm_add_pd(fjz2
,tz
);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2178 r13
= _mm_mul_pd(rsq13
,rinv13
);
2180 /* EWALD ELECTROSTATICS */
2182 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2183 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2184 ewitab
= _mm_cvttpd_epi32(ewrt
);
2185 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2186 ewitab
= _mm_slli_epi32(ewitab
,2);
2187 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2188 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2189 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2190 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2191 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2192 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2193 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2194 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2195 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
2196 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2198 d
= _mm_sub_pd(r13
,rswitch
);
2199 d
= _mm_max_pd(d
,_mm_setzero_pd());
2200 d2
= _mm_mul_pd(d
,d
);
2201 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2203 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2205 /* Evaluate switch function */
2206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2207 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
2208 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
2212 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2214 /* Calculate temporary vectorial force */
2215 tx
= _mm_mul_pd(fscal
,dx13
);
2216 ty
= _mm_mul_pd(fscal
,dy13
);
2217 tz
= _mm_mul_pd(fscal
,dz13
);
2219 /* Update vectorial force */
2220 fix1
= _mm_add_pd(fix1
,tx
);
2221 fiy1
= _mm_add_pd(fiy1
,ty
);
2222 fiz1
= _mm_add_pd(fiz1
,tz
);
2224 fjx3
= _mm_add_pd(fjx3
,tx
);
2225 fjy3
= _mm_add_pd(fjy3
,ty
);
2226 fjz3
= _mm_add_pd(fjz3
,tz
);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2237 r21
= _mm_mul_pd(rsq21
,rinv21
);
2239 /* EWALD ELECTROSTATICS */
2241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2242 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2243 ewitab
= _mm_cvttpd_epi32(ewrt
);
2244 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2245 ewitab
= _mm_slli_epi32(ewitab
,2);
2246 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2247 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2248 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2249 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2250 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2251 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2252 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2253 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2254 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
2255 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2257 d
= _mm_sub_pd(r21
,rswitch
);
2258 d
= _mm_max_pd(d
,_mm_setzero_pd());
2259 d2
= _mm_mul_pd(d
,d
);
2260 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2262 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2264 /* Evaluate switch function */
2265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2266 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
2267 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2271 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2273 /* Calculate temporary vectorial force */
2274 tx
= _mm_mul_pd(fscal
,dx21
);
2275 ty
= _mm_mul_pd(fscal
,dy21
);
2276 tz
= _mm_mul_pd(fscal
,dz21
);
2278 /* Update vectorial force */
2279 fix2
= _mm_add_pd(fix2
,tx
);
2280 fiy2
= _mm_add_pd(fiy2
,ty
);
2281 fiz2
= _mm_add_pd(fiz2
,tz
);
2283 fjx1
= _mm_add_pd(fjx1
,tx
);
2284 fjy1
= _mm_add_pd(fjy1
,ty
);
2285 fjz1
= _mm_add_pd(fjz1
,tz
);
2289 /**************************
2290 * CALCULATE INTERACTIONS *
2291 **************************/
2293 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2296 r22
= _mm_mul_pd(rsq22
,rinv22
);
2298 /* EWALD ELECTROSTATICS */
2300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2301 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2302 ewitab
= _mm_cvttpd_epi32(ewrt
);
2303 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2304 ewitab
= _mm_slli_epi32(ewitab
,2);
2305 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2306 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2307 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2308 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2309 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2310 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2311 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2312 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2313 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
2314 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2316 d
= _mm_sub_pd(r22
,rswitch
);
2317 d
= _mm_max_pd(d
,_mm_setzero_pd());
2318 d2
= _mm_mul_pd(d
,d
);
2319 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2321 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2323 /* Evaluate switch function */
2324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2325 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
2326 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2330 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2332 /* Calculate temporary vectorial force */
2333 tx
= _mm_mul_pd(fscal
,dx22
);
2334 ty
= _mm_mul_pd(fscal
,dy22
);
2335 tz
= _mm_mul_pd(fscal
,dz22
);
2337 /* Update vectorial force */
2338 fix2
= _mm_add_pd(fix2
,tx
);
2339 fiy2
= _mm_add_pd(fiy2
,ty
);
2340 fiz2
= _mm_add_pd(fiz2
,tz
);
2342 fjx2
= _mm_add_pd(fjx2
,tx
);
2343 fjy2
= _mm_add_pd(fjy2
,ty
);
2344 fjz2
= _mm_add_pd(fjz2
,tz
);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2355 r23
= _mm_mul_pd(rsq23
,rinv23
);
2357 /* EWALD ELECTROSTATICS */
2359 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2360 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2361 ewitab
= _mm_cvttpd_epi32(ewrt
);
2362 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2363 ewitab
= _mm_slli_epi32(ewitab
,2);
2364 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2365 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2366 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2367 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2368 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2369 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2370 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2371 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2372 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
2373 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2375 d
= _mm_sub_pd(r23
,rswitch
);
2376 d
= _mm_max_pd(d
,_mm_setzero_pd());
2377 d2
= _mm_mul_pd(d
,d
);
2378 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2380 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2382 /* Evaluate switch function */
2383 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2384 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
2385 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2389 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2391 /* Calculate temporary vectorial force */
2392 tx
= _mm_mul_pd(fscal
,dx23
);
2393 ty
= _mm_mul_pd(fscal
,dy23
);
2394 tz
= _mm_mul_pd(fscal
,dz23
);
2396 /* Update vectorial force */
2397 fix2
= _mm_add_pd(fix2
,tx
);
2398 fiy2
= _mm_add_pd(fiy2
,ty
);
2399 fiz2
= _mm_add_pd(fiz2
,tz
);
2401 fjx3
= _mm_add_pd(fjx3
,tx
);
2402 fjy3
= _mm_add_pd(fjy3
,ty
);
2403 fjz3
= _mm_add_pd(fjz3
,tz
);
2407 /**************************
2408 * CALCULATE INTERACTIONS *
2409 **************************/
2411 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2414 r31
= _mm_mul_pd(rsq31
,rinv31
);
2416 /* EWALD ELECTROSTATICS */
2418 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2419 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2420 ewitab
= _mm_cvttpd_epi32(ewrt
);
2421 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2422 ewitab
= _mm_slli_epi32(ewitab
,2);
2423 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2424 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2425 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2426 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2427 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2428 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2429 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2430 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2431 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
2432 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2434 d
= _mm_sub_pd(r31
,rswitch
);
2435 d
= _mm_max_pd(d
,_mm_setzero_pd());
2436 d2
= _mm_mul_pd(d
,d
);
2437 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2439 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2441 /* Evaluate switch function */
2442 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2443 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
2444 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2448 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2450 /* Calculate temporary vectorial force */
2451 tx
= _mm_mul_pd(fscal
,dx31
);
2452 ty
= _mm_mul_pd(fscal
,dy31
);
2453 tz
= _mm_mul_pd(fscal
,dz31
);
2455 /* Update vectorial force */
2456 fix3
= _mm_add_pd(fix3
,tx
);
2457 fiy3
= _mm_add_pd(fiy3
,ty
);
2458 fiz3
= _mm_add_pd(fiz3
,tz
);
2460 fjx1
= _mm_add_pd(fjx1
,tx
);
2461 fjy1
= _mm_add_pd(fjy1
,ty
);
2462 fjz1
= _mm_add_pd(fjz1
,tz
);
2466 /**************************
2467 * CALCULATE INTERACTIONS *
2468 **************************/
2470 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2473 r32
= _mm_mul_pd(rsq32
,rinv32
);
2475 /* EWALD ELECTROSTATICS */
2477 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2478 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2479 ewitab
= _mm_cvttpd_epi32(ewrt
);
2480 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2481 ewitab
= _mm_slli_epi32(ewitab
,2);
2482 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2483 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2484 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2485 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2486 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2487 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2488 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2489 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2490 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
2491 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2493 d
= _mm_sub_pd(r32
,rswitch
);
2494 d
= _mm_max_pd(d
,_mm_setzero_pd());
2495 d2
= _mm_mul_pd(d
,d
);
2496 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2498 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2500 /* Evaluate switch function */
2501 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2502 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
2503 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2507 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2509 /* Calculate temporary vectorial force */
2510 tx
= _mm_mul_pd(fscal
,dx32
);
2511 ty
= _mm_mul_pd(fscal
,dy32
);
2512 tz
= _mm_mul_pd(fscal
,dz32
);
2514 /* Update vectorial force */
2515 fix3
= _mm_add_pd(fix3
,tx
);
2516 fiy3
= _mm_add_pd(fiy3
,ty
);
2517 fiz3
= _mm_add_pd(fiz3
,tz
);
2519 fjx2
= _mm_add_pd(fjx2
,tx
);
2520 fjy2
= _mm_add_pd(fjy2
,ty
);
2521 fjz2
= _mm_add_pd(fjz2
,tz
);
2525 /**************************
2526 * CALCULATE INTERACTIONS *
2527 **************************/
2529 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2532 r33
= _mm_mul_pd(rsq33
,rinv33
);
2534 /* EWALD ELECTROSTATICS */
2536 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2537 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2538 ewitab
= _mm_cvttpd_epi32(ewrt
);
2539 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2540 ewitab
= _mm_slli_epi32(ewitab
,2);
2541 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2542 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2543 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2544 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2545 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2546 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2547 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2548 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2549 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
2550 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2552 d
= _mm_sub_pd(r33
,rswitch
);
2553 d
= _mm_max_pd(d
,_mm_setzero_pd());
2554 d2
= _mm_mul_pd(d
,d
);
2555 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2557 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2559 /* Evaluate switch function */
2560 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2561 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
2562 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2566 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2568 /* Calculate temporary vectorial force */
2569 tx
= _mm_mul_pd(fscal
,dx33
);
2570 ty
= _mm_mul_pd(fscal
,dy33
);
2571 tz
= _mm_mul_pd(fscal
,dz33
);
2573 /* Update vectorial force */
2574 fix3
= _mm_add_pd(fix3
,tx
);
2575 fiy3
= _mm_add_pd(fiy3
,ty
);
2576 fiz3
= _mm_add_pd(fiz3
,tz
);
2578 fjx3
= _mm_add_pd(fjx3
,tx
);
2579 fjy3
= _mm_add_pd(fjy3
,ty
);
2580 fjz3
= _mm_add_pd(fjz3
,tz
);
2584 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2586 /* Inner loop uses 617 flops */
2589 if(jidx
<j_index_end
)
2593 j_coord_offsetA
= DIM
*jnrA
;
2595 /* load j atom coordinates */
2596 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
2597 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2598 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2600 /* Calculate displacement vector */
2601 dx00
= _mm_sub_pd(ix0
,jx0
);
2602 dy00
= _mm_sub_pd(iy0
,jy0
);
2603 dz00
= _mm_sub_pd(iz0
,jz0
);
2604 dx11
= _mm_sub_pd(ix1
,jx1
);
2605 dy11
= _mm_sub_pd(iy1
,jy1
);
2606 dz11
= _mm_sub_pd(iz1
,jz1
);
2607 dx12
= _mm_sub_pd(ix1
,jx2
);
2608 dy12
= _mm_sub_pd(iy1
,jy2
);
2609 dz12
= _mm_sub_pd(iz1
,jz2
);
2610 dx13
= _mm_sub_pd(ix1
,jx3
);
2611 dy13
= _mm_sub_pd(iy1
,jy3
);
2612 dz13
= _mm_sub_pd(iz1
,jz3
);
2613 dx21
= _mm_sub_pd(ix2
,jx1
);
2614 dy21
= _mm_sub_pd(iy2
,jy1
);
2615 dz21
= _mm_sub_pd(iz2
,jz1
);
2616 dx22
= _mm_sub_pd(ix2
,jx2
);
2617 dy22
= _mm_sub_pd(iy2
,jy2
);
2618 dz22
= _mm_sub_pd(iz2
,jz2
);
2619 dx23
= _mm_sub_pd(ix2
,jx3
);
2620 dy23
= _mm_sub_pd(iy2
,jy3
);
2621 dz23
= _mm_sub_pd(iz2
,jz3
);
2622 dx31
= _mm_sub_pd(ix3
,jx1
);
2623 dy31
= _mm_sub_pd(iy3
,jy1
);
2624 dz31
= _mm_sub_pd(iz3
,jz1
);
2625 dx32
= _mm_sub_pd(ix3
,jx2
);
2626 dy32
= _mm_sub_pd(iy3
,jy2
);
2627 dz32
= _mm_sub_pd(iz3
,jz2
);
2628 dx33
= _mm_sub_pd(ix3
,jx3
);
2629 dy33
= _mm_sub_pd(iy3
,jy3
);
2630 dz33
= _mm_sub_pd(iz3
,jz3
);
2632 /* Calculate squared distance and things based on it */
2633 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2634 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2635 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2636 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
2637 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2638 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2639 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
2640 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
2641 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
2642 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
2644 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
2645 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
2646 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
2647 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
2648 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
2649 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
2650 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
2651 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
2652 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
2653 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
2655 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
2656 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2657 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2658 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
2659 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2660 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2661 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
2662 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
2663 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
2664 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
2666 fjx0
= _mm_setzero_pd();
2667 fjy0
= _mm_setzero_pd();
2668 fjz0
= _mm_setzero_pd();
2669 fjx1
= _mm_setzero_pd();
2670 fjy1
= _mm_setzero_pd();
2671 fjz1
= _mm_setzero_pd();
2672 fjx2
= _mm_setzero_pd();
2673 fjy2
= _mm_setzero_pd();
2674 fjz2
= _mm_setzero_pd();
2675 fjx3
= _mm_setzero_pd();
2676 fjy3
= _mm_setzero_pd();
2677 fjz3
= _mm_setzero_pd();
2679 /**************************
2680 * CALCULATE INTERACTIONS *
2681 **************************/
2683 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2686 r00
= _mm_mul_pd(rsq00
,rinv00
);
2688 /* LENNARD-JONES DISPERSION/REPULSION */
2690 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2691 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
2692 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
2693 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
2694 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
2696 d
= _mm_sub_pd(r00
,rswitch
);
2697 d
= _mm_max_pd(d
,_mm_setzero_pd());
2698 d2
= _mm_mul_pd(d
,d
);
2699 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2701 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2703 /* Evaluate switch function */
2704 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2705 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
2706 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2710 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2712 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2714 /* Calculate temporary vectorial force */
2715 tx
= _mm_mul_pd(fscal
,dx00
);
2716 ty
= _mm_mul_pd(fscal
,dy00
);
2717 tz
= _mm_mul_pd(fscal
,dz00
);
2719 /* Update vectorial force */
2720 fix0
= _mm_add_pd(fix0
,tx
);
2721 fiy0
= _mm_add_pd(fiy0
,ty
);
2722 fiz0
= _mm_add_pd(fiz0
,tz
);
2724 fjx0
= _mm_add_pd(fjx0
,tx
);
2725 fjy0
= _mm_add_pd(fjy0
,ty
);
2726 fjz0
= _mm_add_pd(fjz0
,tz
);
2730 /**************************
2731 * CALCULATE INTERACTIONS *
2732 **************************/
2734 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2737 r11
= _mm_mul_pd(rsq11
,rinv11
);
2739 /* EWALD ELECTROSTATICS */
2741 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2742 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2743 ewitab
= _mm_cvttpd_epi32(ewrt
);
2744 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2745 ewitab
= _mm_slli_epi32(ewitab
,2);
2746 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2747 ewtabD
= _mm_setzero_pd();
2748 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2749 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2750 ewtabFn
= _mm_setzero_pd();
2751 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2752 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2753 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2754 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
2755 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2757 d
= _mm_sub_pd(r11
,rswitch
);
2758 d
= _mm_max_pd(d
,_mm_setzero_pd());
2759 d2
= _mm_mul_pd(d
,d
);
2760 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2762 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2764 /* Evaluate switch function */
2765 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2766 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
2767 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2771 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2773 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2775 /* Calculate temporary vectorial force */
2776 tx
= _mm_mul_pd(fscal
,dx11
);
2777 ty
= _mm_mul_pd(fscal
,dy11
);
2778 tz
= _mm_mul_pd(fscal
,dz11
);
2780 /* Update vectorial force */
2781 fix1
= _mm_add_pd(fix1
,tx
);
2782 fiy1
= _mm_add_pd(fiy1
,ty
);
2783 fiz1
= _mm_add_pd(fiz1
,tz
);
2785 fjx1
= _mm_add_pd(fjx1
,tx
);
2786 fjy1
= _mm_add_pd(fjy1
,ty
);
2787 fjz1
= _mm_add_pd(fjz1
,tz
);
2791 /**************************
2792 * CALCULATE INTERACTIONS *
2793 **************************/
2795 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2798 r12
= _mm_mul_pd(rsq12
,rinv12
);
2800 /* EWALD ELECTROSTATICS */
2802 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2803 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2804 ewitab
= _mm_cvttpd_epi32(ewrt
);
2805 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2806 ewitab
= _mm_slli_epi32(ewitab
,2);
2807 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2808 ewtabD
= _mm_setzero_pd();
2809 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2810 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2811 ewtabFn
= _mm_setzero_pd();
2812 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2813 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2814 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2815 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
2816 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2818 d
= _mm_sub_pd(r12
,rswitch
);
2819 d
= _mm_max_pd(d
,_mm_setzero_pd());
2820 d2
= _mm_mul_pd(d
,d
);
2821 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2823 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2825 /* Evaluate switch function */
2826 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2827 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
2828 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2832 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2834 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2836 /* Calculate temporary vectorial force */
2837 tx
= _mm_mul_pd(fscal
,dx12
);
2838 ty
= _mm_mul_pd(fscal
,dy12
);
2839 tz
= _mm_mul_pd(fscal
,dz12
);
2841 /* Update vectorial force */
2842 fix1
= _mm_add_pd(fix1
,tx
);
2843 fiy1
= _mm_add_pd(fiy1
,ty
);
2844 fiz1
= _mm_add_pd(fiz1
,tz
);
2846 fjx2
= _mm_add_pd(fjx2
,tx
);
2847 fjy2
= _mm_add_pd(fjy2
,ty
);
2848 fjz2
= _mm_add_pd(fjz2
,tz
);
2852 /**************************
2853 * CALCULATE INTERACTIONS *
2854 **************************/
2856 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2859 r13
= _mm_mul_pd(rsq13
,rinv13
);
2861 /* EWALD ELECTROSTATICS */
2863 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2864 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2865 ewitab
= _mm_cvttpd_epi32(ewrt
);
2866 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2867 ewitab
= _mm_slli_epi32(ewitab
,2);
2868 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2869 ewtabD
= _mm_setzero_pd();
2870 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2871 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2872 ewtabFn
= _mm_setzero_pd();
2873 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2874 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2875 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2876 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
2877 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2879 d
= _mm_sub_pd(r13
,rswitch
);
2880 d
= _mm_max_pd(d
,_mm_setzero_pd());
2881 d2
= _mm_mul_pd(d
,d
);
2882 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2884 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2886 /* Evaluate switch function */
2887 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2888 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
2889 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
2893 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2895 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2897 /* Calculate temporary vectorial force */
2898 tx
= _mm_mul_pd(fscal
,dx13
);
2899 ty
= _mm_mul_pd(fscal
,dy13
);
2900 tz
= _mm_mul_pd(fscal
,dz13
);
2902 /* Update vectorial force */
2903 fix1
= _mm_add_pd(fix1
,tx
);
2904 fiy1
= _mm_add_pd(fiy1
,ty
);
2905 fiz1
= _mm_add_pd(fiz1
,tz
);
2907 fjx3
= _mm_add_pd(fjx3
,tx
);
2908 fjy3
= _mm_add_pd(fjy3
,ty
);
2909 fjz3
= _mm_add_pd(fjz3
,tz
);
2913 /**************************
2914 * CALCULATE INTERACTIONS *
2915 **************************/
2917 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2920 r21
= _mm_mul_pd(rsq21
,rinv21
);
2922 /* EWALD ELECTROSTATICS */
2924 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2925 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2926 ewitab
= _mm_cvttpd_epi32(ewrt
);
2927 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2928 ewitab
= _mm_slli_epi32(ewitab
,2);
2929 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2930 ewtabD
= _mm_setzero_pd();
2931 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2932 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2933 ewtabFn
= _mm_setzero_pd();
2934 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2935 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2936 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2937 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
2938 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2940 d
= _mm_sub_pd(r21
,rswitch
);
2941 d
= _mm_max_pd(d
,_mm_setzero_pd());
2942 d2
= _mm_mul_pd(d
,d
);
2943 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2945 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2947 /* Evaluate switch function */
2948 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2949 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
2950 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2954 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2956 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2958 /* Calculate temporary vectorial force */
2959 tx
= _mm_mul_pd(fscal
,dx21
);
2960 ty
= _mm_mul_pd(fscal
,dy21
);
2961 tz
= _mm_mul_pd(fscal
,dz21
);
2963 /* Update vectorial force */
2964 fix2
= _mm_add_pd(fix2
,tx
);
2965 fiy2
= _mm_add_pd(fiy2
,ty
);
2966 fiz2
= _mm_add_pd(fiz2
,tz
);
2968 fjx1
= _mm_add_pd(fjx1
,tx
);
2969 fjy1
= _mm_add_pd(fjy1
,ty
);
2970 fjz1
= _mm_add_pd(fjz1
,tz
);
2974 /**************************
2975 * CALCULATE INTERACTIONS *
2976 **************************/
2978 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2981 r22
= _mm_mul_pd(rsq22
,rinv22
);
2983 /* EWALD ELECTROSTATICS */
2985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2986 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2987 ewitab
= _mm_cvttpd_epi32(ewrt
);
2988 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2989 ewitab
= _mm_slli_epi32(ewitab
,2);
2990 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2991 ewtabD
= _mm_setzero_pd();
2992 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2993 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2994 ewtabFn
= _mm_setzero_pd();
2995 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2996 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2997 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2998 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
2999 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
3001 d
= _mm_sub_pd(r22
,rswitch
);
3002 d
= _mm_max_pd(d
,_mm_setzero_pd());
3003 d2
= _mm_mul_pd(d
,d
);
3004 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
3006 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
3008 /* Evaluate switch function */
3009 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3010 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
3011 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
3015 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
3017 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
3019 /* Calculate temporary vectorial force */
3020 tx
= _mm_mul_pd(fscal
,dx22
);
3021 ty
= _mm_mul_pd(fscal
,dy22
);
3022 tz
= _mm_mul_pd(fscal
,dz22
);
3024 /* Update vectorial force */
3025 fix2
= _mm_add_pd(fix2
,tx
);
3026 fiy2
= _mm_add_pd(fiy2
,ty
);
3027 fiz2
= _mm_add_pd(fiz2
,tz
);
3029 fjx2
= _mm_add_pd(fjx2
,tx
);
3030 fjy2
= _mm_add_pd(fjy2
,ty
);
3031 fjz2
= _mm_add_pd(fjz2
,tz
);
3035 /**************************
3036 * CALCULATE INTERACTIONS *
3037 **************************/
3039 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
3042 r23
= _mm_mul_pd(rsq23
,rinv23
);
3044 /* EWALD ELECTROSTATICS */
3046 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3047 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
3048 ewitab
= _mm_cvttpd_epi32(ewrt
);
3049 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
3050 ewitab
= _mm_slli_epi32(ewitab
,2);
3051 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3052 ewtabD
= _mm_setzero_pd();
3053 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
3054 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
3055 ewtabFn
= _mm_setzero_pd();
3056 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
3057 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
3058 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
3059 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
3060 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
3062 d
= _mm_sub_pd(r23
,rswitch
);
3063 d
= _mm_max_pd(d
,_mm_setzero_pd());
3064 d2
= _mm_mul_pd(d
,d
);
3065 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
3067 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
3069 /* Evaluate switch function */
3070 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3071 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
3072 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
3076 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
3078 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
3080 /* Calculate temporary vectorial force */
3081 tx
= _mm_mul_pd(fscal
,dx23
);
3082 ty
= _mm_mul_pd(fscal
,dy23
);
3083 tz
= _mm_mul_pd(fscal
,dz23
);
3085 /* Update vectorial force */
3086 fix2
= _mm_add_pd(fix2
,tx
);
3087 fiy2
= _mm_add_pd(fiy2
,ty
);
3088 fiz2
= _mm_add_pd(fiz2
,tz
);
3090 fjx3
= _mm_add_pd(fjx3
,tx
);
3091 fjy3
= _mm_add_pd(fjy3
,ty
);
3092 fjz3
= _mm_add_pd(fjz3
,tz
);
3096 /**************************
3097 * CALCULATE INTERACTIONS *
3098 **************************/
3100 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
3103 r31
= _mm_mul_pd(rsq31
,rinv31
);
3105 /* EWALD ELECTROSTATICS */
3107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3108 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
3109 ewitab
= _mm_cvttpd_epi32(ewrt
);
3110 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
3111 ewitab
= _mm_slli_epi32(ewitab
,2);
3112 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3113 ewtabD
= _mm_setzero_pd();
3114 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
3115 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
3116 ewtabFn
= _mm_setzero_pd();
3117 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
3118 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
3119 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
3120 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
3121 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
3123 d
= _mm_sub_pd(r31
,rswitch
);
3124 d
= _mm_max_pd(d
,_mm_setzero_pd());
3125 d2
= _mm_mul_pd(d
,d
);
3126 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
3128 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
3130 /* Evaluate switch function */
3131 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3132 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
3133 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
3137 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
3139 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
3141 /* Calculate temporary vectorial force */
3142 tx
= _mm_mul_pd(fscal
,dx31
);
3143 ty
= _mm_mul_pd(fscal
,dy31
);
3144 tz
= _mm_mul_pd(fscal
,dz31
);
3146 /* Update vectorial force */
3147 fix3
= _mm_add_pd(fix3
,tx
);
3148 fiy3
= _mm_add_pd(fiy3
,ty
);
3149 fiz3
= _mm_add_pd(fiz3
,tz
);
3151 fjx1
= _mm_add_pd(fjx1
,tx
);
3152 fjy1
= _mm_add_pd(fjy1
,ty
);
3153 fjz1
= _mm_add_pd(fjz1
,tz
);
3157 /**************************
3158 * CALCULATE INTERACTIONS *
3159 **************************/
3161 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
3164 r32
= _mm_mul_pd(rsq32
,rinv32
);
3166 /* EWALD ELECTROSTATICS */
3168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3169 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
3170 ewitab
= _mm_cvttpd_epi32(ewrt
);
3171 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
3172 ewitab
= _mm_slli_epi32(ewitab
,2);
3173 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3174 ewtabD
= _mm_setzero_pd();
3175 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
3176 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
3177 ewtabFn
= _mm_setzero_pd();
3178 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
3179 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
3180 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
3181 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
3182 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
3184 d
= _mm_sub_pd(r32
,rswitch
);
3185 d
= _mm_max_pd(d
,_mm_setzero_pd());
3186 d2
= _mm_mul_pd(d
,d
);
3187 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
3189 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
3191 /* Evaluate switch function */
3192 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3193 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
3194 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
3198 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
3200 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
3202 /* Calculate temporary vectorial force */
3203 tx
= _mm_mul_pd(fscal
,dx32
);
3204 ty
= _mm_mul_pd(fscal
,dy32
);
3205 tz
= _mm_mul_pd(fscal
,dz32
);
3207 /* Update vectorial force */
3208 fix3
= _mm_add_pd(fix3
,tx
);
3209 fiy3
= _mm_add_pd(fiy3
,ty
);
3210 fiz3
= _mm_add_pd(fiz3
,tz
);
3212 fjx2
= _mm_add_pd(fjx2
,tx
);
3213 fjy2
= _mm_add_pd(fjy2
,ty
);
3214 fjz2
= _mm_add_pd(fjz2
,tz
);
3218 /**************************
3219 * CALCULATE INTERACTIONS *
3220 **************************/
3222 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
3225 r33
= _mm_mul_pd(rsq33
,rinv33
);
3227 /* EWALD ELECTROSTATICS */
3229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3230 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
3231 ewitab
= _mm_cvttpd_epi32(ewrt
);
3232 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
3233 ewitab
= _mm_slli_epi32(ewitab
,2);
3234 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3235 ewtabD
= _mm_setzero_pd();
3236 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
3237 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
3238 ewtabFn
= _mm_setzero_pd();
3239 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
3240 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
3241 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
3242 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
3243 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
3245 d
= _mm_sub_pd(r33
,rswitch
);
3246 d
= _mm_max_pd(d
,_mm_setzero_pd());
3247 d2
= _mm_mul_pd(d
,d
);
3248 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
3250 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
3252 /* Evaluate switch function */
3253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3254 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
3255 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
3259 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
3261 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
3263 /* Calculate temporary vectorial force */
3264 tx
= _mm_mul_pd(fscal
,dx33
);
3265 ty
= _mm_mul_pd(fscal
,dy33
);
3266 tz
= _mm_mul_pd(fscal
,dz33
);
3268 /* Update vectorial force */
3269 fix3
= _mm_add_pd(fix3
,tx
);
3270 fiy3
= _mm_add_pd(fiy3
,ty
);
3271 fiz3
= _mm_add_pd(fiz3
,tz
);
3273 fjx3
= _mm_add_pd(fjx3
,tx
);
3274 fjy3
= _mm_add_pd(fjy3
,ty
);
3275 fjz3
= _mm_add_pd(fjz3
,tz
);
3279 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
3281 /* Inner loop uses 617 flops */
3284 /* End of innermost loop */
3286 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
3287 f
+i_coord_offset
,fshift
+i_shift_offset
);
3289 /* Increment number of inner iterations */
3290 inneriter
+= j_index_end
- j_index_start
;
3292 /* Outer loop uses 24 flops */
3295 /* Increment number of outer iterations */
3298 /* Update outer/inner flops */
3300 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*617);