2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
85 int vdwjidx0A
,vdwjidx0B
;
86 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
87 int vdwjidx1A
,vdwjidx1B
;
88 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
89 int vdwjidx2A
,vdwjidx2B
;
90 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
91 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
92 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
93 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
94 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
95 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
96 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
97 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
98 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
99 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
100 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
103 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
106 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
107 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
109 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
111 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
112 real rswitch_scalar
,d_scalar
;
113 __m128d dummy_mask
,cutoff_mask
;
114 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one
= _mm_set1_pd(1.0);
116 __m128d two
= _mm_set1_pd(2.0);
122 jindex
= nlist
->jindex
;
124 shiftidx
= nlist
->shift
;
126 shiftvec
= fr
->shift_vec
[0];
127 fshift
= fr
->fshift
[0];
128 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
129 charge
= mdatoms
->chargeA
;
130 nvdwtype
= fr
->ntype
;
132 vdwtype
= mdatoms
->typeA
;
134 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
135 ewtab
= fr
->ic
->tabq_coul_FDV0
;
136 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
137 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
139 /* Setup water-specific parameters */
140 inr
= nlist
->iinr
[0];
141 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
142 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
143 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
144 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
146 jq0
= _mm_set1_pd(charge
[inr
+0]);
147 jq1
= _mm_set1_pd(charge
[inr
+1]);
148 jq2
= _mm_set1_pd(charge
[inr
+2]);
149 vdwjidx0A
= 2*vdwtype
[inr
+0];
150 qq00
= _mm_mul_pd(iq0
,jq0
);
151 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
152 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
153 qq01
= _mm_mul_pd(iq0
,jq1
);
154 qq02
= _mm_mul_pd(iq0
,jq2
);
155 qq10
= _mm_mul_pd(iq1
,jq0
);
156 qq11
= _mm_mul_pd(iq1
,jq1
);
157 qq12
= _mm_mul_pd(iq1
,jq2
);
158 qq20
= _mm_mul_pd(iq2
,jq0
);
159 qq21
= _mm_mul_pd(iq2
,jq1
);
160 qq22
= _mm_mul_pd(iq2
,jq2
);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar
= fr
->ic
->rcoulomb
;
164 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
165 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
167 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
168 rswitch
= _mm_set1_pd(rswitch_scalar
);
169 /* Setup switch parameters */
170 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
171 d
= _mm_set1_pd(d_scalar
);
172 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
173 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
174 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
175 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
176 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
177 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
179 /* Avoid stupid compiler warnings */
187 /* Start outer loop over neighborlists */
188 for(iidx
=0; iidx
<nri
; iidx
++)
190 /* Load shift vector for this list */
191 i_shift_offset
= DIM
*shiftidx
[iidx
];
193 /* Load limits for loop over neighbors */
194 j_index_start
= jindex
[iidx
];
195 j_index_end
= jindex
[iidx
+1];
197 /* Get outer coordinate index */
199 i_coord_offset
= DIM
*inr
;
201 /* Load i particle coords and add shift vector */
202 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
203 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
205 fix0
= _mm_setzero_pd();
206 fiy0
= _mm_setzero_pd();
207 fiz0
= _mm_setzero_pd();
208 fix1
= _mm_setzero_pd();
209 fiy1
= _mm_setzero_pd();
210 fiz1
= _mm_setzero_pd();
211 fix2
= _mm_setzero_pd();
212 fiy2
= _mm_setzero_pd();
213 fiz2
= _mm_setzero_pd();
215 /* Reset potential sums */
216 velecsum
= _mm_setzero_pd();
217 vvdwsum
= _mm_setzero_pd();
219 /* Start inner kernel loop */
220 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
223 /* Get j neighbor index, and coordinate index */
226 j_coord_offsetA
= DIM
*jnrA
;
227 j_coord_offsetB
= DIM
*jnrB
;
229 /* load j atom coordinates */
230 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
231 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
233 /* Calculate displacement vector */
234 dx00
= _mm_sub_pd(ix0
,jx0
);
235 dy00
= _mm_sub_pd(iy0
,jy0
);
236 dz00
= _mm_sub_pd(iz0
,jz0
);
237 dx01
= _mm_sub_pd(ix0
,jx1
);
238 dy01
= _mm_sub_pd(iy0
,jy1
);
239 dz01
= _mm_sub_pd(iz0
,jz1
);
240 dx02
= _mm_sub_pd(ix0
,jx2
);
241 dy02
= _mm_sub_pd(iy0
,jy2
);
242 dz02
= _mm_sub_pd(iz0
,jz2
);
243 dx10
= _mm_sub_pd(ix1
,jx0
);
244 dy10
= _mm_sub_pd(iy1
,jy0
);
245 dz10
= _mm_sub_pd(iz1
,jz0
);
246 dx11
= _mm_sub_pd(ix1
,jx1
);
247 dy11
= _mm_sub_pd(iy1
,jy1
);
248 dz11
= _mm_sub_pd(iz1
,jz1
);
249 dx12
= _mm_sub_pd(ix1
,jx2
);
250 dy12
= _mm_sub_pd(iy1
,jy2
);
251 dz12
= _mm_sub_pd(iz1
,jz2
);
252 dx20
= _mm_sub_pd(ix2
,jx0
);
253 dy20
= _mm_sub_pd(iy2
,jy0
);
254 dz20
= _mm_sub_pd(iz2
,jz0
);
255 dx21
= _mm_sub_pd(ix2
,jx1
);
256 dy21
= _mm_sub_pd(iy2
,jy1
);
257 dz21
= _mm_sub_pd(iz2
,jz1
);
258 dx22
= _mm_sub_pd(ix2
,jx2
);
259 dy22
= _mm_sub_pd(iy2
,jy2
);
260 dz22
= _mm_sub_pd(iz2
,jz2
);
262 /* Calculate squared distance and things based on it */
263 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
264 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
265 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
266 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
267 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
268 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
269 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
270 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
271 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
273 rinv00
= sse41_invsqrt_d(rsq00
);
274 rinv01
= sse41_invsqrt_d(rsq01
);
275 rinv02
= sse41_invsqrt_d(rsq02
);
276 rinv10
= sse41_invsqrt_d(rsq10
);
277 rinv11
= sse41_invsqrt_d(rsq11
);
278 rinv12
= sse41_invsqrt_d(rsq12
);
279 rinv20
= sse41_invsqrt_d(rsq20
);
280 rinv21
= sse41_invsqrt_d(rsq21
);
281 rinv22
= sse41_invsqrt_d(rsq22
);
283 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
284 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
285 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
286 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
287 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
288 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
289 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
290 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
291 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
293 fjx0
= _mm_setzero_pd();
294 fjy0
= _mm_setzero_pd();
295 fjz0
= _mm_setzero_pd();
296 fjx1
= _mm_setzero_pd();
297 fjy1
= _mm_setzero_pd();
298 fjz1
= _mm_setzero_pd();
299 fjx2
= _mm_setzero_pd();
300 fjy2
= _mm_setzero_pd();
301 fjz2
= _mm_setzero_pd();
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
310 r00
= _mm_mul_pd(rsq00
,rinv00
);
312 /* EWALD ELECTROSTATICS */
314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
315 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
316 ewitab
= _mm_cvttpd_epi32(ewrt
);
317 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
318 ewitab
= _mm_slli_epi32(ewitab
,2);
319 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
320 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
321 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
322 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
323 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
324 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
325 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
326 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
327 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
328 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
330 /* LENNARD-JONES DISPERSION/REPULSION */
332 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
333 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
334 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
335 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
336 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
338 d
= _mm_sub_pd(r00
,rswitch
);
339 d
= _mm_max_pd(d
,_mm_setzero_pd());
340 d2
= _mm_mul_pd(d
,d
);
341 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
343 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
345 /* Evaluate switch function */
346 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
348 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
349 velec
= _mm_mul_pd(velec
,sw
);
350 vvdw
= _mm_mul_pd(vvdw
,sw
);
351 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velec
= _mm_and_pd(velec
,cutoff_mask
);
355 velecsum
= _mm_add_pd(velecsum
,velec
);
356 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
357 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
359 fscal
= _mm_add_pd(felec
,fvdw
);
361 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
363 /* Calculate temporary vectorial force */
364 tx
= _mm_mul_pd(fscal
,dx00
);
365 ty
= _mm_mul_pd(fscal
,dy00
);
366 tz
= _mm_mul_pd(fscal
,dz00
);
368 /* Update vectorial force */
369 fix0
= _mm_add_pd(fix0
,tx
);
370 fiy0
= _mm_add_pd(fiy0
,ty
);
371 fiz0
= _mm_add_pd(fiz0
,tz
);
373 fjx0
= _mm_add_pd(fjx0
,tx
);
374 fjy0
= _mm_add_pd(fjy0
,ty
);
375 fjz0
= _mm_add_pd(fjz0
,tz
);
379 /**************************
380 * CALCULATE INTERACTIONS *
381 **************************/
383 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
386 r01
= _mm_mul_pd(rsq01
,rinv01
);
388 /* EWALD ELECTROSTATICS */
390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
391 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
392 ewitab
= _mm_cvttpd_epi32(ewrt
);
393 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
394 ewitab
= _mm_slli_epi32(ewitab
,2);
395 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
396 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
397 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
398 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
399 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
400 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
401 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
402 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
403 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(rinv01
,velec
));
404 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
406 d
= _mm_sub_pd(r01
,rswitch
);
407 d
= _mm_max_pd(d
,_mm_setzero_pd());
408 d2
= _mm_mul_pd(d
,d
);
409 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
411 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
413 /* Evaluate switch function */
414 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
415 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv01
,_mm_mul_pd(velec
,dsw
)) );
416 velec
= _mm_mul_pd(velec
,sw
);
417 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velec
= _mm_and_pd(velec
,cutoff_mask
);
421 velecsum
= _mm_add_pd(velecsum
,velec
);
425 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
427 /* Calculate temporary vectorial force */
428 tx
= _mm_mul_pd(fscal
,dx01
);
429 ty
= _mm_mul_pd(fscal
,dy01
);
430 tz
= _mm_mul_pd(fscal
,dz01
);
432 /* Update vectorial force */
433 fix0
= _mm_add_pd(fix0
,tx
);
434 fiy0
= _mm_add_pd(fiy0
,ty
);
435 fiz0
= _mm_add_pd(fiz0
,tz
);
437 fjx1
= _mm_add_pd(fjx1
,tx
);
438 fjy1
= _mm_add_pd(fjy1
,ty
);
439 fjz1
= _mm_add_pd(fjz1
,tz
);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
450 r02
= _mm_mul_pd(rsq02
,rinv02
);
452 /* EWALD ELECTROSTATICS */
454 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
455 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
456 ewitab
= _mm_cvttpd_epi32(ewrt
);
457 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
458 ewitab
= _mm_slli_epi32(ewitab
,2);
459 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
460 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
461 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
462 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
463 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
464 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
465 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
466 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
467 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(rinv02
,velec
));
468 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
470 d
= _mm_sub_pd(r02
,rswitch
);
471 d
= _mm_max_pd(d
,_mm_setzero_pd());
472 d2
= _mm_mul_pd(d
,d
);
473 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
475 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
477 /* Evaluate switch function */
478 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
479 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv02
,_mm_mul_pd(velec
,dsw
)) );
480 velec
= _mm_mul_pd(velec
,sw
);
481 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velec
= _mm_and_pd(velec
,cutoff_mask
);
485 velecsum
= _mm_add_pd(velecsum
,velec
);
489 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
491 /* Calculate temporary vectorial force */
492 tx
= _mm_mul_pd(fscal
,dx02
);
493 ty
= _mm_mul_pd(fscal
,dy02
);
494 tz
= _mm_mul_pd(fscal
,dz02
);
496 /* Update vectorial force */
497 fix0
= _mm_add_pd(fix0
,tx
);
498 fiy0
= _mm_add_pd(fiy0
,ty
);
499 fiz0
= _mm_add_pd(fiz0
,tz
);
501 fjx2
= _mm_add_pd(fjx2
,tx
);
502 fjy2
= _mm_add_pd(fjy2
,ty
);
503 fjz2
= _mm_add_pd(fjz2
,tz
);
507 /**************************
508 * CALCULATE INTERACTIONS *
509 **************************/
511 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
514 r10
= _mm_mul_pd(rsq10
,rinv10
);
516 /* EWALD ELECTROSTATICS */
518 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
519 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
520 ewitab
= _mm_cvttpd_epi32(ewrt
);
521 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
522 ewitab
= _mm_slli_epi32(ewitab
,2);
523 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
524 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
525 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
526 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
527 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
528 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
529 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
530 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
531 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
532 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
534 d
= _mm_sub_pd(r10
,rswitch
);
535 d
= _mm_max_pd(d
,_mm_setzero_pd());
536 d2
= _mm_mul_pd(d
,d
);
537 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
539 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
541 /* Evaluate switch function */
542 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
543 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
544 velec
= _mm_mul_pd(velec
,sw
);
545 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
547 /* Update potential sum for this i atom from the interaction with this j atom. */
548 velec
= _mm_and_pd(velec
,cutoff_mask
);
549 velecsum
= _mm_add_pd(velecsum
,velec
);
553 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
555 /* Calculate temporary vectorial force */
556 tx
= _mm_mul_pd(fscal
,dx10
);
557 ty
= _mm_mul_pd(fscal
,dy10
);
558 tz
= _mm_mul_pd(fscal
,dz10
);
560 /* Update vectorial force */
561 fix1
= _mm_add_pd(fix1
,tx
);
562 fiy1
= _mm_add_pd(fiy1
,ty
);
563 fiz1
= _mm_add_pd(fiz1
,tz
);
565 fjx0
= _mm_add_pd(fjx0
,tx
);
566 fjy0
= _mm_add_pd(fjy0
,ty
);
567 fjz0
= _mm_add_pd(fjz0
,tz
);
571 /**************************
572 * CALCULATE INTERACTIONS *
573 **************************/
575 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
578 r11
= _mm_mul_pd(rsq11
,rinv11
);
580 /* EWALD ELECTROSTATICS */
582 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
583 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
584 ewitab
= _mm_cvttpd_epi32(ewrt
);
585 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
586 ewitab
= _mm_slli_epi32(ewitab
,2);
587 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
588 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
589 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
590 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
591 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
592 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
593 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
594 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
595 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
596 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
598 d
= _mm_sub_pd(r11
,rswitch
);
599 d
= _mm_max_pd(d
,_mm_setzero_pd());
600 d2
= _mm_mul_pd(d
,d
);
601 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
603 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
605 /* Evaluate switch function */
606 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
607 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
608 velec
= _mm_mul_pd(velec
,sw
);
609 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velec
= _mm_and_pd(velec
,cutoff_mask
);
613 velecsum
= _mm_add_pd(velecsum
,velec
);
617 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
619 /* Calculate temporary vectorial force */
620 tx
= _mm_mul_pd(fscal
,dx11
);
621 ty
= _mm_mul_pd(fscal
,dy11
);
622 tz
= _mm_mul_pd(fscal
,dz11
);
624 /* Update vectorial force */
625 fix1
= _mm_add_pd(fix1
,tx
);
626 fiy1
= _mm_add_pd(fiy1
,ty
);
627 fiz1
= _mm_add_pd(fiz1
,tz
);
629 fjx1
= _mm_add_pd(fjx1
,tx
);
630 fjy1
= _mm_add_pd(fjy1
,ty
);
631 fjz1
= _mm_add_pd(fjz1
,tz
);
635 /**************************
636 * CALCULATE INTERACTIONS *
637 **************************/
639 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
642 r12
= _mm_mul_pd(rsq12
,rinv12
);
644 /* EWALD ELECTROSTATICS */
646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
648 ewitab
= _mm_cvttpd_epi32(ewrt
);
649 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
650 ewitab
= _mm_slli_epi32(ewitab
,2);
651 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
652 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
653 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
654 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
655 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
656 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
657 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
658 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
659 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
660 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
662 d
= _mm_sub_pd(r12
,rswitch
);
663 d
= _mm_max_pd(d
,_mm_setzero_pd());
664 d2
= _mm_mul_pd(d
,d
);
665 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
667 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
669 /* Evaluate switch function */
670 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
671 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
672 velec
= _mm_mul_pd(velec
,sw
);
673 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velec
= _mm_and_pd(velec
,cutoff_mask
);
677 velecsum
= _mm_add_pd(velecsum
,velec
);
681 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
683 /* Calculate temporary vectorial force */
684 tx
= _mm_mul_pd(fscal
,dx12
);
685 ty
= _mm_mul_pd(fscal
,dy12
);
686 tz
= _mm_mul_pd(fscal
,dz12
);
688 /* Update vectorial force */
689 fix1
= _mm_add_pd(fix1
,tx
);
690 fiy1
= _mm_add_pd(fiy1
,ty
);
691 fiz1
= _mm_add_pd(fiz1
,tz
);
693 fjx2
= _mm_add_pd(fjx2
,tx
);
694 fjy2
= _mm_add_pd(fjy2
,ty
);
695 fjz2
= _mm_add_pd(fjz2
,tz
);
699 /**************************
700 * CALCULATE INTERACTIONS *
701 **************************/
703 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
706 r20
= _mm_mul_pd(rsq20
,rinv20
);
708 /* EWALD ELECTROSTATICS */
710 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
711 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
712 ewitab
= _mm_cvttpd_epi32(ewrt
);
713 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
714 ewitab
= _mm_slli_epi32(ewitab
,2);
715 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
716 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
717 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
718 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
719 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
720 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
721 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
722 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
723 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
724 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
726 d
= _mm_sub_pd(r20
,rswitch
);
727 d
= _mm_max_pd(d
,_mm_setzero_pd());
728 d2
= _mm_mul_pd(d
,d
);
729 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
731 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
733 /* Evaluate switch function */
734 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
735 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
736 velec
= _mm_mul_pd(velec
,sw
);
737 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velec
= _mm_and_pd(velec
,cutoff_mask
);
741 velecsum
= _mm_add_pd(velecsum
,velec
);
745 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
747 /* Calculate temporary vectorial force */
748 tx
= _mm_mul_pd(fscal
,dx20
);
749 ty
= _mm_mul_pd(fscal
,dy20
);
750 tz
= _mm_mul_pd(fscal
,dz20
);
752 /* Update vectorial force */
753 fix2
= _mm_add_pd(fix2
,tx
);
754 fiy2
= _mm_add_pd(fiy2
,ty
);
755 fiz2
= _mm_add_pd(fiz2
,tz
);
757 fjx0
= _mm_add_pd(fjx0
,tx
);
758 fjy0
= _mm_add_pd(fjy0
,ty
);
759 fjz0
= _mm_add_pd(fjz0
,tz
);
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
770 r21
= _mm_mul_pd(rsq21
,rinv21
);
772 /* EWALD ELECTROSTATICS */
774 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
775 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
776 ewitab
= _mm_cvttpd_epi32(ewrt
);
777 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
778 ewitab
= _mm_slli_epi32(ewitab
,2);
779 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
780 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
781 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
782 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
783 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
784 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
785 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
786 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
787 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
788 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
790 d
= _mm_sub_pd(r21
,rswitch
);
791 d
= _mm_max_pd(d
,_mm_setzero_pd());
792 d2
= _mm_mul_pd(d
,d
);
793 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
795 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
797 /* Evaluate switch function */
798 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
799 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
800 velec
= _mm_mul_pd(velec
,sw
);
801 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
803 /* Update potential sum for this i atom from the interaction with this j atom. */
804 velec
= _mm_and_pd(velec
,cutoff_mask
);
805 velecsum
= _mm_add_pd(velecsum
,velec
);
809 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
811 /* Calculate temporary vectorial force */
812 tx
= _mm_mul_pd(fscal
,dx21
);
813 ty
= _mm_mul_pd(fscal
,dy21
);
814 tz
= _mm_mul_pd(fscal
,dz21
);
816 /* Update vectorial force */
817 fix2
= _mm_add_pd(fix2
,tx
);
818 fiy2
= _mm_add_pd(fiy2
,ty
);
819 fiz2
= _mm_add_pd(fiz2
,tz
);
821 fjx1
= _mm_add_pd(fjx1
,tx
);
822 fjy1
= _mm_add_pd(fjy1
,ty
);
823 fjz1
= _mm_add_pd(fjz1
,tz
);
827 /**************************
828 * CALCULATE INTERACTIONS *
829 **************************/
831 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
834 r22
= _mm_mul_pd(rsq22
,rinv22
);
836 /* EWALD ELECTROSTATICS */
838 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
839 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
840 ewitab
= _mm_cvttpd_epi32(ewrt
);
841 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
842 ewitab
= _mm_slli_epi32(ewitab
,2);
843 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
844 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
845 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
846 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
847 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
848 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
849 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
850 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
851 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
852 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
854 d
= _mm_sub_pd(r22
,rswitch
);
855 d
= _mm_max_pd(d
,_mm_setzero_pd());
856 d2
= _mm_mul_pd(d
,d
);
857 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
859 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
861 /* Evaluate switch function */
862 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
863 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
864 velec
= _mm_mul_pd(velec
,sw
);
865 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
867 /* Update potential sum for this i atom from the interaction with this j atom. */
868 velec
= _mm_and_pd(velec
,cutoff_mask
);
869 velecsum
= _mm_add_pd(velecsum
,velec
);
873 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
875 /* Calculate temporary vectorial force */
876 tx
= _mm_mul_pd(fscal
,dx22
);
877 ty
= _mm_mul_pd(fscal
,dy22
);
878 tz
= _mm_mul_pd(fscal
,dz22
);
880 /* Update vectorial force */
881 fix2
= _mm_add_pd(fix2
,tx
);
882 fiy2
= _mm_add_pd(fiy2
,ty
);
883 fiz2
= _mm_add_pd(fiz2
,tz
);
885 fjx2
= _mm_add_pd(fjx2
,tx
);
886 fjy2
= _mm_add_pd(fjy2
,ty
);
887 fjz2
= _mm_add_pd(fjz2
,tz
);
891 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
893 /* Inner loop uses 603 flops */
900 j_coord_offsetA
= DIM
*jnrA
;
902 /* load j atom coordinates */
903 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
904 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
906 /* Calculate displacement vector */
907 dx00
= _mm_sub_pd(ix0
,jx0
);
908 dy00
= _mm_sub_pd(iy0
,jy0
);
909 dz00
= _mm_sub_pd(iz0
,jz0
);
910 dx01
= _mm_sub_pd(ix0
,jx1
);
911 dy01
= _mm_sub_pd(iy0
,jy1
);
912 dz01
= _mm_sub_pd(iz0
,jz1
);
913 dx02
= _mm_sub_pd(ix0
,jx2
);
914 dy02
= _mm_sub_pd(iy0
,jy2
);
915 dz02
= _mm_sub_pd(iz0
,jz2
);
916 dx10
= _mm_sub_pd(ix1
,jx0
);
917 dy10
= _mm_sub_pd(iy1
,jy0
);
918 dz10
= _mm_sub_pd(iz1
,jz0
);
919 dx11
= _mm_sub_pd(ix1
,jx1
);
920 dy11
= _mm_sub_pd(iy1
,jy1
);
921 dz11
= _mm_sub_pd(iz1
,jz1
);
922 dx12
= _mm_sub_pd(ix1
,jx2
);
923 dy12
= _mm_sub_pd(iy1
,jy2
);
924 dz12
= _mm_sub_pd(iz1
,jz2
);
925 dx20
= _mm_sub_pd(ix2
,jx0
);
926 dy20
= _mm_sub_pd(iy2
,jy0
);
927 dz20
= _mm_sub_pd(iz2
,jz0
);
928 dx21
= _mm_sub_pd(ix2
,jx1
);
929 dy21
= _mm_sub_pd(iy2
,jy1
);
930 dz21
= _mm_sub_pd(iz2
,jz1
);
931 dx22
= _mm_sub_pd(ix2
,jx2
);
932 dy22
= _mm_sub_pd(iy2
,jy2
);
933 dz22
= _mm_sub_pd(iz2
,jz2
);
935 /* Calculate squared distance and things based on it */
936 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
937 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
938 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
939 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
940 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
941 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
942 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
943 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
944 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
946 rinv00
= sse41_invsqrt_d(rsq00
);
947 rinv01
= sse41_invsqrt_d(rsq01
);
948 rinv02
= sse41_invsqrt_d(rsq02
);
949 rinv10
= sse41_invsqrt_d(rsq10
);
950 rinv11
= sse41_invsqrt_d(rsq11
);
951 rinv12
= sse41_invsqrt_d(rsq12
);
952 rinv20
= sse41_invsqrt_d(rsq20
);
953 rinv21
= sse41_invsqrt_d(rsq21
);
954 rinv22
= sse41_invsqrt_d(rsq22
);
956 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
957 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
958 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
959 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
960 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
961 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
962 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
963 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
964 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
966 fjx0
= _mm_setzero_pd();
967 fjy0
= _mm_setzero_pd();
968 fjz0
= _mm_setzero_pd();
969 fjx1
= _mm_setzero_pd();
970 fjy1
= _mm_setzero_pd();
971 fjz1
= _mm_setzero_pd();
972 fjx2
= _mm_setzero_pd();
973 fjy2
= _mm_setzero_pd();
974 fjz2
= _mm_setzero_pd();
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
983 r00
= _mm_mul_pd(rsq00
,rinv00
);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
989 ewitab
= _mm_cvttpd_epi32(ewrt
);
990 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
991 ewitab
= _mm_slli_epi32(ewitab
,2);
992 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
993 ewtabD
= _mm_setzero_pd();
994 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
995 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
996 ewtabFn
= _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
998 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
999 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1000 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
1001 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1003 /* LENNARD-JONES DISPERSION/REPULSION */
1005 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1006 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1007 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1008 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1009 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1011 d
= _mm_sub_pd(r00
,rswitch
);
1012 d
= _mm_max_pd(d
,_mm_setzero_pd());
1013 d2
= _mm_mul_pd(d
,d
);
1014 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1016 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1018 /* Evaluate switch function */
1019 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1020 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
1021 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1022 velec
= _mm_mul_pd(velec
,sw
);
1023 vvdw
= _mm_mul_pd(vvdw
,sw
);
1024 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec
= _mm_and_pd(velec
,cutoff_mask
);
1028 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1029 velecsum
= _mm_add_pd(velecsum
,velec
);
1030 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
1031 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
1032 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
1034 fscal
= _mm_add_pd(felec
,fvdw
);
1036 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1038 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1040 /* Calculate temporary vectorial force */
1041 tx
= _mm_mul_pd(fscal
,dx00
);
1042 ty
= _mm_mul_pd(fscal
,dy00
);
1043 tz
= _mm_mul_pd(fscal
,dz00
);
1045 /* Update vectorial force */
1046 fix0
= _mm_add_pd(fix0
,tx
);
1047 fiy0
= _mm_add_pd(fiy0
,ty
);
1048 fiz0
= _mm_add_pd(fiz0
,tz
);
1050 fjx0
= _mm_add_pd(fjx0
,tx
);
1051 fjy0
= _mm_add_pd(fjy0
,ty
);
1052 fjz0
= _mm_add_pd(fjz0
,tz
);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1063 r01
= _mm_mul_pd(rsq01
,rinv01
);
1065 /* EWALD ELECTROSTATICS */
1067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1068 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
1069 ewitab
= _mm_cvttpd_epi32(ewrt
);
1070 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1071 ewitab
= _mm_slli_epi32(ewitab
,2);
1072 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1073 ewtabD
= _mm_setzero_pd();
1074 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1075 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1076 ewtabFn
= _mm_setzero_pd();
1077 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1078 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1079 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1080 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(rinv01
,velec
));
1081 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
1083 d
= _mm_sub_pd(r01
,rswitch
);
1084 d
= _mm_max_pd(d
,_mm_setzero_pd());
1085 d2
= _mm_mul_pd(d
,d
);
1086 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1088 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1090 /* Evaluate switch function */
1091 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1092 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv01
,_mm_mul_pd(velec
,dsw
)) );
1093 velec
= _mm_mul_pd(velec
,sw
);
1094 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
1096 /* Update potential sum for this i atom from the interaction with this j atom. */
1097 velec
= _mm_and_pd(velec
,cutoff_mask
);
1098 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1099 velecsum
= _mm_add_pd(velecsum
,velec
);
1103 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1105 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1107 /* Calculate temporary vectorial force */
1108 tx
= _mm_mul_pd(fscal
,dx01
);
1109 ty
= _mm_mul_pd(fscal
,dy01
);
1110 tz
= _mm_mul_pd(fscal
,dz01
);
1112 /* Update vectorial force */
1113 fix0
= _mm_add_pd(fix0
,tx
);
1114 fiy0
= _mm_add_pd(fiy0
,ty
);
1115 fiz0
= _mm_add_pd(fiz0
,tz
);
1117 fjx1
= _mm_add_pd(fjx1
,tx
);
1118 fjy1
= _mm_add_pd(fjy1
,ty
);
1119 fjz1
= _mm_add_pd(fjz1
,tz
);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1130 r02
= _mm_mul_pd(rsq02
,rinv02
);
1132 /* EWALD ELECTROSTATICS */
1134 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1135 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
1136 ewitab
= _mm_cvttpd_epi32(ewrt
);
1137 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1138 ewitab
= _mm_slli_epi32(ewitab
,2);
1139 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1140 ewtabD
= _mm_setzero_pd();
1141 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1142 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1143 ewtabFn
= _mm_setzero_pd();
1144 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1145 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1146 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1147 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(rinv02
,velec
));
1148 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
1150 d
= _mm_sub_pd(r02
,rswitch
);
1151 d
= _mm_max_pd(d
,_mm_setzero_pd());
1152 d2
= _mm_mul_pd(d
,d
);
1153 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1155 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1157 /* Evaluate switch function */
1158 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1159 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv02
,_mm_mul_pd(velec
,dsw
)) );
1160 velec
= _mm_mul_pd(velec
,sw
);
1161 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1163 /* Update potential sum for this i atom from the interaction with this j atom. */
1164 velec
= _mm_and_pd(velec
,cutoff_mask
);
1165 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1166 velecsum
= _mm_add_pd(velecsum
,velec
);
1170 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1172 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1174 /* Calculate temporary vectorial force */
1175 tx
= _mm_mul_pd(fscal
,dx02
);
1176 ty
= _mm_mul_pd(fscal
,dy02
);
1177 tz
= _mm_mul_pd(fscal
,dz02
);
1179 /* Update vectorial force */
1180 fix0
= _mm_add_pd(fix0
,tx
);
1181 fiy0
= _mm_add_pd(fiy0
,ty
);
1182 fiz0
= _mm_add_pd(fiz0
,tz
);
1184 fjx2
= _mm_add_pd(fjx2
,tx
);
1185 fjy2
= _mm_add_pd(fjy2
,ty
);
1186 fjz2
= _mm_add_pd(fjz2
,tz
);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1197 r10
= _mm_mul_pd(rsq10
,rinv10
);
1199 /* EWALD ELECTROSTATICS */
1201 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1202 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1203 ewitab
= _mm_cvttpd_epi32(ewrt
);
1204 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1205 ewitab
= _mm_slli_epi32(ewitab
,2);
1206 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1207 ewtabD
= _mm_setzero_pd();
1208 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1209 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1210 ewtabFn
= _mm_setzero_pd();
1211 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1212 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1213 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1214 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
1215 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1217 d
= _mm_sub_pd(r10
,rswitch
);
1218 d
= _mm_max_pd(d
,_mm_setzero_pd());
1219 d2
= _mm_mul_pd(d
,d
);
1220 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1222 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1224 /* Evaluate switch function */
1225 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1226 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
1227 velec
= _mm_mul_pd(velec
,sw
);
1228 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1230 /* Update potential sum for this i atom from the interaction with this j atom. */
1231 velec
= _mm_and_pd(velec
,cutoff_mask
);
1232 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1233 velecsum
= _mm_add_pd(velecsum
,velec
);
1237 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1239 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1241 /* Calculate temporary vectorial force */
1242 tx
= _mm_mul_pd(fscal
,dx10
);
1243 ty
= _mm_mul_pd(fscal
,dy10
);
1244 tz
= _mm_mul_pd(fscal
,dz10
);
1246 /* Update vectorial force */
1247 fix1
= _mm_add_pd(fix1
,tx
);
1248 fiy1
= _mm_add_pd(fiy1
,ty
);
1249 fiz1
= _mm_add_pd(fiz1
,tz
);
1251 fjx0
= _mm_add_pd(fjx0
,tx
);
1252 fjy0
= _mm_add_pd(fjy0
,ty
);
1253 fjz0
= _mm_add_pd(fjz0
,tz
);
1257 /**************************
1258 * CALCULATE INTERACTIONS *
1259 **************************/
1261 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1264 r11
= _mm_mul_pd(rsq11
,rinv11
);
1266 /* EWALD ELECTROSTATICS */
1268 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1269 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1270 ewitab
= _mm_cvttpd_epi32(ewrt
);
1271 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1272 ewitab
= _mm_slli_epi32(ewitab
,2);
1273 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1274 ewtabD
= _mm_setzero_pd();
1275 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1276 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1277 ewtabFn
= _mm_setzero_pd();
1278 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1279 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1280 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1281 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
1282 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1284 d
= _mm_sub_pd(r11
,rswitch
);
1285 d
= _mm_max_pd(d
,_mm_setzero_pd());
1286 d2
= _mm_mul_pd(d
,d
);
1287 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1289 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1291 /* Evaluate switch function */
1292 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1293 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
1294 velec
= _mm_mul_pd(velec
,sw
);
1295 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1297 /* Update potential sum for this i atom from the interaction with this j atom. */
1298 velec
= _mm_and_pd(velec
,cutoff_mask
);
1299 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1300 velecsum
= _mm_add_pd(velecsum
,velec
);
1304 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1306 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1308 /* Calculate temporary vectorial force */
1309 tx
= _mm_mul_pd(fscal
,dx11
);
1310 ty
= _mm_mul_pd(fscal
,dy11
);
1311 tz
= _mm_mul_pd(fscal
,dz11
);
1313 /* Update vectorial force */
1314 fix1
= _mm_add_pd(fix1
,tx
);
1315 fiy1
= _mm_add_pd(fiy1
,ty
);
1316 fiz1
= _mm_add_pd(fiz1
,tz
);
1318 fjx1
= _mm_add_pd(fjx1
,tx
);
1319 fjy1
= _mm_add_pd(fjy1
,ty
);
1320 fjz1
= _mm_add_pd(fjz1
,tz
);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1331 r12
= _mm_mul_pd(rsq12
,rinv12
);
1333 /* EWALD ELECTROSTATICS */
1335 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1336 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1337 ewitab
= _mm_cvttpd_epi32(ewrt
);
1338 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1339 ewitab
= _mm_slli_epi32(ewitab
,2);
1340 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1341 ewtabD
= _mm_setzero_pd();
1342 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1343 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1344 ewtabFn
= _mm_setzero_pd();
1345 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1346 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1347 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1348 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
1349 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1351 d
= _mm_sub_pd(r12
,rswitch
);
1352 d
= _mm_max_pd(d
,_mm_setzero_pd());
1353 d2
= _mm_mul_pd(d
,d
);
1354 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1356 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1358 /* Evaluate switch function */
1359 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1360 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
1361 velec
= _mm_mul_pd(velec
,sw
);
1362 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1364 /* Update potential sum for this i atom from the interaction with this j atom. */
1365 velec
= _mm_and_pd(velec
,cutoff_mask
);
1366 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1367 velecsum
= _mm_add_pd(velecsum
,velec
);
1371 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1373 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1375 /* Calculate temporary vectorial force */
1376 tx
= _mm_mul_pd(fscal
,dx12
);
1377 ty
= _mm_mul_pd(fscal
,dy12
);
1378 tz
= _mm_mul_pd(fscal
,dz12
);
1380 /* Update vectorial force */
1381 fix1
= _mm_add_pd(fix1
,tx
);
1382 fiy1
= _mm_add_pd(fiy1
,ty
);
1383 fiz1
= _mm_add_pd(fiz1
,tz
);
1385 fjx2
= _mm_add_pd(fjx2
,tx
);
1386 fjy2
= _mm_add_pd(fjy2
,ty
);
1387 fjz2
= _mm_add_pd(fjz2
,tz
);
1391 /**************************
1392 * CALCULATE INTERACTIONS *
1393 **************************/
1395 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1398 r20
= _mm_mul_pd(rsq20
,rinv20
);
1400 /* EWALD ELECTROSTATICS */
1402 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1403 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1404 ewitab
= _mm_cvttpd_epi32(ewrt
);
1405 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1406 ewitab
= _mm_slli_epi32(ewitab
,2);
1407 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1408 ewtabD
= _mm_setzero_pd();
1409 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1410 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1411 ewtabFn
= _mm_setzero_pd();
1412 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1413 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1414 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1415 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
1416 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1418 d
= _mm_sub_pd(r20
,rswitch
);
1419 d
= _mm_max_pd(d
,_mm_setzero_pd());
1420 d2
= _mm_mul_pd(d
,d
);
1421 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1423 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1425 /* Evaluate switch function */
1426 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1427 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
1428 velec
= _mm_mul_pd(velec
,sw
);
1429 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1431 /* Update potential sum for this i atom from the interaction with this j atom. */
1432 velec
= _mm_and_pd(velec
,cutoff_mask
);
1433 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1434 velecsum
= _mm_add_pd(velecsum
,velec
);
1438 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1440 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1442 /* Calculate temporary vectorial force */
1443 tx
= _mm_mul_pd(fscal
,dx20
);
1444 ty
= _mm_mul_pd(fscal
,dy20
);
1445 tz
= _mm_mul_pd(fscal
,dz20
);
1447 /* Update vectorial force */
1448 fix2
= _mm_add_pd(fix2
,tx
);
1449 fiy2
= _mm_add_pd(fiy2
,ty
);
1450 fiz2
= _mm_add_pd(fiz2
,tz
);
1452 fjx0
= _mm_add_pd(fjx0
,tx
);
1453 fjy0
= _mm_add_pd(fjy0
,ty
);
1454 fjz0
= _mm_add_pd(fjz0
,tz
);
1458 /**************************
1459 * CALCULATE INTERACTIONS *
1460 **************************/
1462 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1465 r21
= _mm_mul_pd(rsq21
,rinv21
);
1467 /* EWALD ELECTROSTATICS */
1469 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1470 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1471 ewitab
= _mm_cvttpd_epi32(ewrt
);
1472 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1473 ewitab
= _mm_slli_epi32(ewitab
,2);
1474 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1475 ewtabD
= _mm_setzero_pd();
1476 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1477 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1478 ewtabFn
= _mm_setzero_pd();
1479 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1480 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1481 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1482 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
1483 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1485 d
= _mm_sub_pd(r21
,rswitch
);
1486 d
= _mm_max_pd(d
,_mm_setzero_pd());
1487 d2
= _mm_mul_pd(d
,d
);
1488 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1490 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1492 /* Evaluate switch function */
1493 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1494 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
1495 velec
= _mm_mul_pd(velec
,sw
);
1496 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1498 /* Update potential sum for this i atom from the interaction with this j atom. */
1499 velec
= _mm_and_pd(velec
,cutoff_mask
);
1500 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1501 velecsum
= _mm_add_pd(velecsum
,velec
);
1505 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1507 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1509 /* Calculate temporary vectorial force */
1510 tx
= _mm_mul_pd(fscal
,dx21
);
1511 ty
= _mm_mul_pd(fscal
,dy21
);
1512 tz
= _mm_mul_pd(fscal
,dz21
);
1514 /* Update vectorial force */
1515 fix2
= _mm_add_pd(fix2
,tx
);
1516 fiy2
= _mm_add_pd(fiy2
,ty
);
1517 fiz2
= _mm_add_pd(fiz2
,tz
);
1519 fjx1
= _mm_add_pd(fjx1
,tx
);
1520 fjy1
= _mm_add_pd(fjy1
,ty
);
1521 fjz1
= _mm_add_pd(fjz1
,tz
);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1532 r22
= _mm_mul_pd(rsq22
,rinv22
);
1534 /* EWALD ELECTROSTATICS */
1536 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1537 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1538 ewitab
= _mm_cvttpd_epi32(ewrt
);
1539 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1540 ewitab
= _mm_slli_epi32(ewitab
,2);
1541 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1542 ewtabD
= _mm_setzero_pd();
1543 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1544 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1545 ewtabFn
= _mm_setzero_pd();
1546 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1547 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1548 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1549 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
1550 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1552 d
= _mm_sub_pd(r22
,rswitch
);
1553 d
= _mm_max_pd(d
,_mm_setzero_pd());
1554 d2
= _mm_mul_pd(d
,d
);
1555 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1557 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1559 /* Evaluate switch function */
1560 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1561 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
1562 velec
= _mm_mul_pd(velec
,sw
);
1563 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1565 /* Update potential sum for this i atom from the interaction with this j atom. */
1566 velec
= _mm_and_pd(velec
,cutoff_mask
);
1567 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1568 velecsum
= _mm_add_pd(velecsum
,velec
);
1572 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1574 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1576 /* Calculate temporary vectorial force */
1577 tx
= _mm_mul_pd(fscal
,dx22
);
1578 ty
= _mm_mul_pd(fscal
,dy22
);
1579 tz
= _mm_mul_pd(fscal
,dz22
);
1581 /* Update vectorial force */
1582 fix2
= _mm_add_pd(fix2
,tx
);
1583 fiy2
= _mm_add_pd(fiy2
,ty
);
1584 fiz2
= _mm_add_pd(fiz2
,tz
);
1586 fjx2
= _mm_add_pd(fjx2
,tx
);
1587 fjy2
= _mm_add_pd(fjy2
,ty
);
1588 fjz2
= _mm_add_pd(fjz2
,tz
);
1592 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1594 /* Inner loop uses 603 flops */
1597 /* End of innermost loop */
1599 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1600 f
+i_coord_offset
,fshift
+i_shift_offset
);
1603 /* Update potential energies */
1604 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1605 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1607 /* Increment number of inner iterations */
1608 inneriter
+= j_index_end
- j_index_start
;
1610 /* Outer loop uses 20 flops */
1613 /* Increment number of outer iterations */
1616 /* Update outer/inner flops */
1618 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*603);
1621 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double
1622 * Electrostatics interaction: Ewald
1623 * VdW interaction: LennardJones
1624 * Geometry: Water3-Water3
1625 * Calculate force/pot: Force
1628 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_double
1629 (t_nblist
* gmx_restrict nlist
,
1630 rvec
* gmx_restrict xx
,
1631 rvec
* gmx_restrict ff
,
1632 struct t_forcerec
* gmx_restrict fr
,
1633 t_mdatoms
* gmx_restrict mdatoms
,
1634 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1635 t_nrnb
* gmx_restrict nrnb
)
1637 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1638 * just 0 for non-waters.
1639 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1640 * jnr indices corresponding to data put in the four positions in the SIMD register.
1642 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1643 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1645 int j_coord_offsetA
,j_coord_offsetB
;
1646 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1647 real rcutoff_scalar
;
1648 real
*shiftvec
,*fshift
,*x
,*f
;
1649 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1651 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1653 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1655 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1656 int vdwjidx0A
,vdwjidx0B
;
1657 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1658 int vdwjidx1A
,vdwjidx1B
;
1659 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1660 int vdwjidx2A
,vdwjidx2B
;
1661 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1662 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1663 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1664 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1665 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1666 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1667 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1668 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1669 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1670 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1671 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1674 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1677 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1678 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1680 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1682 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
1683 real rswitch_scalar
,d_scalar
;
1684 __m128d dummy_mask
,cutoff_mask
;
1685 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1686 __m128d one
= _mm_set1_pd(1.0);
1687 __m128d two
= _mm_set1_pd(2.0);
1693 jindex
= nlist
->jindex
;
1695 shiftidx
= nlist
->shift
;
1697 shiftvec
= fr
->shift_vec
[0];
1698 fshift
= fr
->fshift
[0];
1699 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
1700 charge
= mdatoms
->chargeA
;
1701 nvdwtype
= fr
->ntype
;
1702 vdwparam
= fr
->nbfp
;
1703 vdwtype
= mdatoms
->typeA
;
1705 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1706 ewtab
= fr
->ic
->tabq_coul_FDV0
;
1707 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1708 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1710 /* Setup water-specific parameters */
1711 inr
= nlist
->iinr
[0];
1712 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1713 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1714 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1715 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1717 jq0
= _mm_set1_pd(charge
[inr
+0]);
1718 jq1
= _mm_set1_pd(charge
[inr
+1]);
1719 jq2
= _mm_set1_pd(charge
[inr
+2]);
1720 vdwjidx0A
= 2*vdwtype
[inr
+0];
1721 qq00
= _mm_mul_pd(iq0
,jq0
);
1722 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1723 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1724 qq01
= _mm_mul_pd(iq0
,jq1
);
1725 qq02
= _mm_mul_pd(iq0
,jq2
);
1726 qq10
= _mm_mul_pd(iq1
,jq0
);
1727 qq11
= _mm_mul_pd(iq1
,jq1
);
1728 qq12
= _mm_mul_pd(iq1
,jq2
);
1729 qq20
= _mm_mul_pd(iq2
,jq0
);
1730 qq21
= _mm_mul_pd(iq2
,jq1
);
1731 qq22
= _mm_mul_pd(iq2
,jq2
);
1733 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1734 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1735 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1736 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1738 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
1739 rswitch
= _mm_set1_pd(rswitch_scalar
);
1740 /* Setup switch parameters */
1741 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
1742 d
= _mm_set1_pd(d_scalar
);
1743 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
1744 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1745 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1746 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
1747 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1748 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1750 /* Avoid stupid compiler warnings */
1752 j_coord_offsetA
= 0;
1753 j_coord_offsetB
= 0;
1758 /* Start outer loop over neighborlists */
1759 for(iidx
=0; iidx
<nri
; iidx
++)
1761 /* Load shift vector for this list */
1762 i_shift_offset
= DIM
*shiftidx
[iidx
];
1764 /* Load limits for loop over neighbors */
1765 j_index_start
= jindex
[iidx
];
1766 j_index_end
= jindex
[iidx
+1];
1768 /* Get outer coordinate index */
1770 i_coord_offset
= DIM
*inr
;
1772 /* Load i particle coords and add shift vector */
1773 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1774 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1776 fix0
= _mm_setzero_pd();
1777 fiy0
= _mm_setzero_pd();
1778 fiz0
= _mm_setzero_pd();
1779 fix1
= _mm_setzero_pd();
1780 fiy1
= _mm_setzero_pd();
1781 fiz1
= _mm_setzero_pd();
1782 fix2
= _mm_setzero_pd();
1783 fiy2
= _mm_setzero_pd();
1784 fiz2
= _mm_setzero_pd();
1786 /* Start inner kernel loop */
1787 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1790 /* Get j neighbor index, and coordinate index */
1792 jnrB
= jjnr
[jidx
+1];
1793 j_coord_offsetA
= DIM
*jnrA
;
1794 j_coord_offsetB
= DIM
*jnrB
;
1796 /* load j atom coordinates */
1797 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1798 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1800 /* Calculate displacement vector */
1801 dx00
= _mm_sub_pd(ix0
,jx0
);
1802 dy00
= _mm_sub_pd(iy0
,jy0
);
1803 dz00
= _mm_sub_pd(iz0
,jz0
);
1804 dx01
= _mm_sub_pd(ix0
,jx1
);
1805 dy01
= _mm_sub_pd(iy0
,jy1
);
1806 dz01
= _mm_sub_pd(iz0
,jz1
);
1807 dx02
= _mm_sub_pd(ix0
,jx2
);
1808 dy02
= _mm_sub_pd(iy0
,jy2
);
1809 dz02
= _mm_sub_pd(iz0
,jz2
);
1810 dx10
= _mm_sub_pd(ix1
,jx0
);
1811 dy10
= _mm_sub_pd(iy1
,jy0
);
1812 dz10
= _mm_sub_pd(iz1
,jz0
);
1813 dx11
= _mm_sub_pd(ix1
,jx1
);
1814 dy11
= _mm_sub_pd(iy1
,jy1
);
1815 dz11
= _mm_sub_pd(iz1
,jz1
);
1816 dx12
= _mm_sub_pd(ix1
,jx2
);
1817 dy12
= _mm_sub_pd(iy1
,jy2
);
1818 dz12
= _mm_sub_pd(iz1
,jz2
);
1819 dx20
= _mm_sub_pd(ix2
,jx0
);
1820 dy20
= _mm_sub_pd(iy2
,jy0
);
1821 dz20
= _mm_sub_pd(iz2
,jz0
);
1822 dx21
= _mm_sub_pd(ix2
,jx1
);
1823 dy21
= _mm_sub_pd(iy2
,jy1
);
1824 dz21
= _mm_sub_pd(iz2
,jz1
);
1825 dx22
= _mm_sub_pd(ix2
,jx2
);
1826 dy22
= _mm_sub_pd(iy2
,jy2
);
1827 dz22
= _mm_sub_pd(iz2
,jz2
);
1829 /* Calculate squared distance and things based on it */
1830 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1831 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1832 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1833 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1834 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1835 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1836 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1837 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1838 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1840 rinv00
= sse41_invsqrt_d(rsq00
);
1841 rinv01
= sse41_invsqrt_d(rsq01
);
1842 rinv02
= sse41_invsqrt_d(rsq02
);
1843 rinv10
= sse41_invsqrt_d(rsq10
);
1844 rinv11
= sse41_invsqrt_d(rsq11
);
1845 rinv12
= sse41_invsqrt_d(rsq12
);
1846 rinv20
= sse41_invsqrt_d(rsq20
);
1847 rinv21
= sse41_invsqrt_d(rsq21
);
1848 rinv22
= sse41_invsqrt_d(rsq22
);
1850 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1851 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1852 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1853 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1854 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1855 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1856 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1857 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1858 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1860 fjx0
= _mm_setzero_pd();
1861 fjy0
= _mm_setzero_pd();
1862 fjz0
= _mm_setzero_pd();
1863 fjx1
= _mm_setzero_pd();
1864 fjy1
= _mm_setzero_pd();
1865 fjz1
= _mm_setzero_pd();
1866 fjx2
= _mm_setzero_pd();
1867 fjy2
= _mm_setzero_pd();
1868 fjz2
= _mm_setzero_pd();
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1874 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1877 r00
= _mm_mul_pd(rsq00
,rinv00
);
1879 /* EWALD ELECTROSTATICS */
1881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1882 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1883 ewitab
= _mm_cvttpd_epi32(ewrt
);
1884 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1885 ewitab
= _mm_slli_epi32(ewitab
,2);
1886 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1887 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1888 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1889 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1890 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1891 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1892 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1893 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1894 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
1895 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1897 /* LENNARD-JONES DISPERSION/REPULSION */
1899 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1900 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
1901 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
1902 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
1903 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
1905 d
= _mm_sub_pd(r00
,rswitch
);
1906 d
= _mm_max_pd(d
,_mm_setzero_pd());
1907 d2
= _mm_mul_pd(d
,d
);
1908 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1910 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1912 /* Evaluate switch function */
1913 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1914 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
1915 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
1916 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1918 fscal
= _mm_add_pd(felec
,fvdw
);
1920 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1922 /* Calculate temporary vectorial force */
1923 tx
= _mm_mul_pd(fscal
,dx00
);
1924 ty
= _mm_mul_pd(fscal
,dy00
);
1925 tz
= _mm_mul_pd(fscal
,dz00
);
1927 /* Update vectorial force */
1928 fix0
= _mm_add_pd(fix0
,tx
);
1929 fiy0
= _mm_add_pd(fiy0
,ty
);
1930 fiz0
= _mm_add_pd(fiz0
,tz
);
1932 fjx0
= _mm_add_pd(fjx0
,tx
);
1933 fjy0
= _mm_add_pd(fjy0
,ty
);
1934 fjz0
= _mm_add_pd(fjz0
,tz
);
1938 /**************************
1939 * CALCULATE INTERACTIONS *
1940 **************************/
1942 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1945 r01
= _mm_mul_pd(rsq01
,rinv01
);
1947 /* EWALD ELECTROSTATICS */
1949 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1950 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
1951 ewitab
= _mm_cvttpd_epi32(ewrt
);
1952 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1953 ewitab
= _mm_slli_epi32(ewitab
,2);
1954 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1955 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1956 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1957 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1958 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1959 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1960 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1961 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1962 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(rinv01
,velec
));
1963 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
1965 d
= _mm_sub_pd(r01
,rswitch
);
1966 d
= _mm_max_pd(d
,_mm_setzero_pd());
1967 d2
= _mm_mul_pd(d
,d
);
1968 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1970 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1972 /* Evaluate switch function */
1973 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1974 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv01
,_mm_mul_pd(velec
,dsw
)) );
1975 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
1979 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1981 /* Calculate temporary vectorial force */
1982 tx
= _mm_mul_pd(fscal
,dx01
);
1983 ty
= _mm_mul_pd(fscal
,dy01
);
1984 tz
= _mm_mul_pd(fscal
,dz01
);
1986 /* Update vectorial force */
1987 fix0
= _mm_add_pd(fix0
,tx
);
1988 fiy0
= _mm_add_pd(fiy0
,ty
);
1989 fiz0
= _mm_add_pd(fiz0
,tz
);
1991 fjx1
= _mm_add_pd(fjx1
,tx
);
1992 fjy1
= _mm_add_pd(fjy1
,ty
);
1993 fjz1
= _mm_add_pd(fjz1
,tz
);
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2004 r02
= _mm_mul_pd(rsq02
,rinv02
);
2006 /* EWALD ELECTROSTATICS */
2008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2009 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
2010 ewitab
= _mm_cvttpd_epi32(ewrt
);
2011 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2012 ewitab
= _mm_slli_epi32(ewitab
,2);
2013 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2014 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2015 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2016 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2017 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2018 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2019 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2020 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2021 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(rinv02
,velec
));
2022 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
2024 d
= _mm_sub_pd(r02
,rswitch
);
2025 d
= _mm_max_pd(d
,_mm_setzero_pd());
2026 d2
= _mm_mul_pd(d
,d
);
2027 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2029 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2031 /* Evaluate switch function */
2032 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2033 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv02
,_mm_mul_pd(velec
,dsw
)) );
2034 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
2038 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2040 /* Calculate temporary vectorial force */
2041 tx
= _mm_mul_pd(fscal
,dx02
);
2042 ty
= _mm_mul_pd(fscal
,dy02
);
2043 tz
= _mm_mul_pd(fscal
,dz02
);
2045 /* Update vectorial force */
2046 fix0
= _mm_add_pd(fix0
,tx
);
2047 fiy0
= _mm_add_pd(fiy0
,ty
);
2048 fiz0
= _mm_add_pd(fiz0
,tz
);
2050 fjx2
= _mm_add_pd(fjx2
,tx
);
2051 fjy2
= _mm_add_pd(fjy2
,ty
);
2052 fjz2
= _mm_add_pd(fjz2
,tz
);
2056 /**************************
2057 * CALCULATE INTERACTIONS *
2058 **************************/
2060 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2063 r10
= _mm_mul_pd(rsq10
,rinv10
);
2065 /* EWALD ELECTROSTATICS */
2067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2068 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
2069 ewitab
= _mm_cvttpd_epi32(ewrt
);
2070 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2071 ewitab
= _mm_slli_epi32(ewitab
,2);
2072 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2073 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2074 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2075 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2076 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2077 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2078 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2079 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2080 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
2081 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
2083 d
= _mm_sub_pd(r10
,rswitch
);
2084 d
= _mm_max_pd(d
,_mm_setzero_pd());
2085 d2
= _mm_mul_pd(d
,d
);
2086 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2088 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2090 /* Evaluate switch function */
2091 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2092 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
2093 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
2097 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2099 /* Calculate temporary vectorial force */
2100 tx
= _mm_mul_pd(fscal
,dx10
);
2101 ty
= _mm_mul_pd(fscal
,dy10
);
2102 tz
= _mm_mul_pd(fscal
,dz10
);
2104 /* Update vectorial force */
2105 fix1
= _mm_add_pd(fix1
,tx
);
2106 fiy1
= _mm_add_pd(fiy1
,ty
);
2107 fiz1
= _mm_add_pd(fiz1
,tz
);
2109 fjx0
= _mm_add_pd(fjx0
,tx
);
2110 fjy0
= _mm_add_pd(fjy0
,ty
);
2111 fjz0
= _mm_add_pd(fjz0
,tz
);
2115 /**************************
2116 * CALCULATE INTERACTIONS *
2117 **************************/
2119 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2122 r11
= _mm_mul_pd(rsq11
,rinv11
);
2124 /* EWALD ELECTROSTATICS */
2126 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2127 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2128 ewitab
= _mm_cvttpd_epi32(ewrt
);
2129 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2130 ewitab
= _mm_slli_epi32(ewitab
,2);
2131 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2132 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2133 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2134 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2135 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2136 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2137 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2138 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2139 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
2140 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2142 d
= _mm_sub_pd(r11
,rswitch
);
2143 d
= _mm_max_pd(d
,_mm_setzero_pd());
2144 d2
= _mm_mul_pd(d
,d
);
2145 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2147 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2149 /* Evaluate switch function */
2150 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2151 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
2152 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2156 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2158 /* Calculate temporary vectorial force */
2159 tx
= _mm_mul_pd(fscal
,dx11
);
2160 ty
= _mm_mul_pd(fscal
,dy11
);
2161 tz
= _mm_mul_pd(fscal
,dz11
);
2163 /* Update vectorial force */
2164 fix1
= _mm_add_pd(fix1
,tx
);
2165 fiy1
= _mm_add_pd(fiy1
,ty
);
2166 fiz1
= _mm_add_pd(fiz1
,tz
);
2168 fjx1
= _mm_add_pd(fjx1
,tx
);
2169 fjy1
= _mm_add_pd(fjy1
,ty
);
2170 fjz1
= _mm_add_pd(fjz1
,tz
);
2174 /**************************
2175 * CALCULATE INTERACTIONS *
2176 **************************/
2178 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2181 r12
= _mm_mul_pd(rsq12
,rinv12
);
2183 /* EWALD ELECTROSTATICS */
2185 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2186 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2187 ewitab
= _mm_cvttpd_epi32(ewrt
);
2188 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2189 ewitab
= _mm_slli_epi32(ewitab
,2);
2190 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2191 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2192 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2193 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2194 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2195 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2196 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2197 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2198 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
2199 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2201 d
= _mm_sub_pd(r12
,rswitch
);
2202 d
= _mm_max_pd(d
,_mm_setzero_pd());
2203 d2
= _mm_mul_pd(d
,d
);
2204 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2206 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2208 /* Evaluate switch function */
2209 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2210 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
2211 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2215 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2217 /* Calculate temporary vectorial force */
2218 tx
= _mm_mul_pd(fscal
,dx12
);
2219 ty
= _mm_mul_pd(fscal
,dy12
);
2220 tz
= _mm_mul_pd(fscal
,dz12
);
2222 /* Update vectorial force */
2223 fix1
= _mm_add_pd(fix1
,tx
);
2224 fiy1
= _mm_add_pd(fiy1
,ty
);
2225 fiz1
= _mm_add_pd(fiz1
,tz
);
2227 fjx2
= _mm_add_pd(fjx2
,tx
);
2228 fjy2
= _mm_add_pd(fjy2
,ty
);
2229 fjz2
= _mm_add_pd(fjz2
,tz
);
2233 /**************************
2234 * CALCULATE INTERACTIONS *
2235 **************************/
2237 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2240 r20
= _mm_mul_pd(rsq20
,rinv20
);
2242 /* EWALD ELECTROSTATICS */
2244 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2245 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
2246 ewitab
= _mm_cvttpd_epi32(ewrt
);
2247 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2248 ewitab
= _mm_slli_epi32(ewitab
,2);
2249 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2250 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2251 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2252 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2253 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2254 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2255 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2256 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2257 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
2258 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
2260 d
= _mm_sub_pd(r20
,rswitch
);
2261 d
= _mm_max_pd(d
,_mm_setzero_pd());
2262 d2
= _mm_mul_pd(d
,d
);
2263 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2265 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2267 /* Evaluate switch function */
2268 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2269 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
2270 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
2274 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2276 /* Calculate temporary vectorial force */
2277 tx
= _mm_mul_pd(fscal
,dx20
);
2278 ty
= _mm_mul_pd(fscal
,dy20
);
2279 tz
= _mm_mul_pd(fscal
,dz20
);
2281 /* Update vectorial force */
2282 fix2
= _mm_add_pd(fix2
,tx
);
2283 fiy2
= _mm_add_pd(fiy2
,ty
);
2284 fiz2
= _mm_add_pd(fiz2
,tz
);
2286 fjx0
= _mm_add_pd(fjx0
,tx
);
2287 fjy0
= _mm_add_pd(fjy0
,ty
);
2288 fjz0
= _mm_add_pd(fjz0
,tz
);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2299 r21
= _mm_mul_pd(rsq21
,rinv21
);
2301 /* EWALD ELECTROSTATICS */
2303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2304 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2305 ewitab
= _mm_cvttpd_epi32(ewrt
);
2306 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2307 ewitab
= _mm_slli_epi32(ewitab
,2);
2308 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2309 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2310 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2311 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2312 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2313 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2314 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2315 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2316 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
2317 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2319 d
= _mm_sub_pd(r21
,rswitch
);
2320 d
= _mm_max_pd(d
,_mm_setzero_pd());
2321 d2
= _mm_mul_pd(d
,d
);
2322 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2324 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2326 /* Evaluate switch function */
2327 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2328 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
2329 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2333 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2335 /* Calculate temporary vectorial force */
2336 tx
= _mm_mul_pd(fscal
,dx21
);
2337 ty
= _mm_mul_pd(fscal
,dy21
);
2338 tz
= _mm_mul_pd(fscal
,dz21
);
2340 /* Update vectorial force */
2341 fix2
= _mm_add_pd(fix2
,tx
);
2342 fiy2
= _mm_add_pd(fiy2
,ty
);
2343 fiz2
= _mm_add_pd(fiz2
,tz
);
2345 fjx1
= _mm_add_pd(fjx1
,tx
);
2346 fjy1
= _mm_add_pd(fjy1
,ty
);
2347 fjz1
= _mm_add_pd(fjz1
,tz
);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2358 r22
= _mm_mul_pd(rsq22
,rinv22
);
2360 /* EWALD ELECTROSTATICS */
2362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2363 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2364 ewitab
= _mm_cvttpd_epi32(ewrt
);
2365 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2366 ewitab
= _mm_slli_epi32(ewitab
,2);
2367 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2368 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2369 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2370 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2371 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2372 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2373 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2374 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2375 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
2376 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2378 d
= _mm_sub_pd(r22
,rswitch
);
2379 d
= _mm_max_pd(d
,_mm_setzero_pd());
2380 d2
= _mm_mul_pd(d
,d
);
2381 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2383 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2385 /* Evaluate switch function */
2386 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2387 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
2388 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2392 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2394 /* Calculate temporary vectorial force */
2395 tx
= _mm_mul_pd(fscal
,dx22
);
2396 ty
= _mm_mul_pd(fscal
,dy22
);
2397 tz
= _mm_mul_pd(fscal
,dz22
);
2399 /* Update vectorial force */
2400 fix2
= _mm_add_pd(fix2
,tx
);
2401 fiy2
= _mm_add_pd(fiy2
,ty
);
2402 fiz2
= _mm_add_pd(fiz2
,tz
);
2404 fjx2
= _mm_add_pd(fjx2
,tx
);
2405 fjy2
= _mm_add_pd(fjy2
,ty
);
2406 fjz2
= _mm_add_pd(fjz2
,tz
);
2410 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2412 /* Inner loop uses 573 flops */
2415 if(jidx
<j_index_end
)
2419 j_coord_offsetA
= DIM
*jnrA
;
2421 /* load j atom coordinates */
2422 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
2423 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
2425 /* Calculate displacement vector */
2426 dx00
= _mm_sub_pd(ix0
,jx0
);
2427 dy00
= _mm_sub_pd(iy0
,jy0
);
2428 dz00
= _mm_sub_pd(iz0
,jz0
);
2429 dx01
= _mm_sub_pd(ix0
,jx1
);
2430 dy01
= _mm_sub_pd(iy0
,jy1
);
2431 dz01
= _mm_sub_pd(iz0
,jz1
);
2432 dx02
= _mm_sub_pd(ix0
,jx2
);
2433 dy02
= _mm_sub_pd(iy0
,jy2
);
2434 dz02
= _mm_sub_pd(iz0
,jz2
);
2435 dx10
= _mm_sub_pd(ix1
,jx0
);
2436 dy10
= _mm_sub_pd(iy1
,jy0
);
2437 dz10
= _mm_sub_pd(iz1
,jz0
);
2438 dx11
= _mm_sub_pd(ix1
,jx1
);
2439 dy11
= _mm_sub_pd(iy1
,jy1
);
2440 dz11
= _mm_sub_pd(iz1
,jz1
);
2441 dx12
= _mm_sub_pd(ix1
,jx2
);
2442 dy12
= _mm_sub_pd(iy1
,jy2
);
2443 dz12
= _mm_sub_pd(iz1
,jz2
);
2444 dx20
= _mm_sub_pd(ix2
,jx0
);
2445 dy20
= _mm_sub_pd(iy2
,jy0
);
2446 dz20
= _mm_sub_pd(iz2
,jz0
);
2447 dx21
= _mm_sub_pd(ix2
,jx1
);
2448 dy21
= _mm_sub_pd(iy2
,jy1
);
2449 dz21
= _mm_sub_pd(iz2
,jz1
);
2450 dx22
= _mm_sub_pd(ix2
,jx2
);
2451 dy22
= _mm_sub_pd(iy2
,jy2
);
2452 dz22
= _mm_sub_pd(iz2
,jz2
);
2454 /* Calculate squared distance and things based on it */
2455 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2456 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
2457 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
2458 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
2459 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2460 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2461 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
2462 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2463 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2465 rinv00
= sse41_invsqrt_d(rsq00
);
2466 rinv01
= sse41_invsqrt_d(rsq01
);
2467 rinv02
= sse41_invsqrt_d(rsq02
);
2468 rinv10
= sse41_invsqrt_d(rsq10
);
2469 rinv11
= sse41_invsqrt_d(rsq11
);
2470 rinv12
= sse41_invsqrt_d(rsq12
);
2471 rinv20
= sse41_invsqrt_d(rsq20
);
2472 rinv21
= sse41_invsqrt_d(rsq21
);
2473 rinv22
= sse41_invsqrt_d(rsq22
);
2475 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
2476 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
2477 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
2478 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
2479 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2480 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2481 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
2482 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2483 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2485 fjx0
= _mm_setzero_pd();
2486 fjy0
= _mm_setzero_pd();
2487 fjz0
= _mm_setzero_pd();
2488 fjx1
= _mm_setzero_pd();
2489 fjy1
= _mm_setzero_pd();
2490 fjz1
= _mm_setzero_pd();
2491 fjx2
= _mm_setzero_pd();
2492 fjy2
= _mm_setzero_pd();
2493 fjz2
= _mm_setzero_pd();
2495 /**************************
2496 * CALCULATE INTERACTIONS *
2497 **************************/
2499 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2502 r00
= _mm_mul_pd(rsq00
,rinv00
);
2504 /* EWALD ELECTROSTATICS */
2506 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2507 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
2508 ewitab
= _mm_cvttpd_epi32(ewrt
);
2509 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2510 ewitab
= _mm_slli_epi32(ewitab
,2);
2511 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2512 ewtabD
= _mm_setzero_pd();
2513 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2514 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2515 ewtabFn
= _mm_setzero_pd();
2516 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2517 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2518 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2519 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(rinv00
,velec
));
2520 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
2522 /* LENNARD-JONES DISPERSION/REPULSION */
2524 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2525 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
2526 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
2527 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
2528 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
2530 d
= _mm_sub_pd(r00
,rswitch
);
2531 d
= _mm_max_pd(d
,_mm_setzero_pd());
2532 d2
= _mm_mul_pd(d
,d
);
2533 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2535 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2537 /* Evaluate switch function */
2538 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2539 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(velec
,dsw
)) );
2540 fvdw
= _mm_sub_pd( _mm_mul_pd(fvdw
,sw
) , _mm_mul_pd(rinv00
,_mm_mul_pd(vvdw
,dsw
)) );
2541 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2543 fscal
= _mm_add_pd(felec
,fvdw
);
2545 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2547 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2549 /* Calculate temporary vectorial force */
2550 tx
= _mm_mul_pd(fscal
,dx00
);
2551 ty
= _mm_mul_pd(fscal
,dy00
);
2552 tz
= _mm_mul_pd(fscal
,dz00
);
2554 /* Update vectorial force */
2555 fix0
= _mm_add_pd(fix0
,tx
);
2556 fiy0
= _mm_add_pd(fiy0
,ty
);
2557 fiz0
= _mm_add_pd(fiz0
,tz
);
2559 fjx0
= _mm_add_pd(fjx0
,tx
);
2560 fjy0
= _mm_add_pd(fjy0
,ty
);
2561 fjz0
= _mm_add_pd(fjz0
,tz
);
2565 /**************************
2566 * CALCULATE INTERACTIONS *
2567 **************************/
2569 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
2572 r01
= _mm_mul_pd(rsq01
,rinv01
);
2574 /* EWALD ELECTROSTATICS */
2576 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2577 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
2578 ewitab
= _mm_cvttpd_epi32(ewrt
);
2579 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2580 ewitab
= _mm_slli_epi32(ewitab
,2);
2581 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2582 ewtabD
= _mm_setzero_pd();
2583 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2584 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2585 ewtabFn
= _mm_setzero_pd();
2586 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2587 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2588 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2589 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(rinv01
,velec
));
2590 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
2592 d
= _mm_sub_pd(r01
,rswitch
);
2593 d
= _mm_max_pd(d
,_mm_setzero_pd());
2594 d2
= _mm_mul_pd(d
,d
);
2595 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2597 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2599 /* Evaluate switch function */
2600 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2601 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv01
,_mm_mul_pd(velec
,dsw
)) );
2602 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
2606 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2608 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2610 /* Calculate temporary vectorial force */
2611 tx
= _mm_mul_pd(fscal
,dx01
);
2612 ty
= _mm_mul_pd(fscal
,dy01
);
2613 tz
= _mm_mul_pd(fscal
,dz01
);
2615 /* Update vectorial force */
2616 fix0
= _mm_add_pd(fix0
,tx
);
2617 fiy0
= _mm_add_pd(fiy0
,ty
);
2618 fiz0
= _mm_add_pd(fiz0
,tz
);
2620 fjx1
= _mm_add_pd(fjx1
,tx
);
2621 fjy1
= _mm_add_pd(fjy1
,ty
);
2622 fjz1
= _mm_add_pd(fjz1
,tz
);
2626 /**************************
2627 * CALCULATE INTERACTIONS *
2628 **************************/
2630 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2633 r02
= _mm_mul_pd(rsq02
,rinv02
);
2635 /* EWALD ELECTROSTATICS */
2637 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2638 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
2639 ewitab
= _mm_cvttpd_epi32(ewrt
);
2640 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2641 ewitab
= _mm_slli_epi32(ewitab
,2);
2642 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2643 ewtabD
= _mm_setzero_pd();
2644 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2645 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2646 ewtabFn
= _mm_setzero_pd();
2647 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2648 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2649 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2650 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(rinv02
,velec
));
2651 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
2653 d
= _mm_sub_pd(r02
,rswitch
);
2654 d
= _mm_max_pd(d
,_mm_setzero_pd());
2655 d2
= _mm_mul_pd(d
,d
);
2656 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2658 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2660 /* Evaluate switch function */
2661 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2662 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv02
,_mm_mul_pd(velec
,dsw
)) );
2663 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
2667 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2669 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2671 /* Calculate temporary vectorial force */
2672 tx
= _mm_mul_pd(fscal
,dx02
);
2673 ty
= _mm_mul_pd(fscal
,dy02
);
2674 tz
= _mm_mul_pd(fscal
,dz02
);
2676 /* Update vectorial force */
2677 fix0
= _mm_add_pd(fix0
,tx
);
2678 fiy0
= _mm_add_pd(fiy0
,ty
);
2679 fiz0
= _mm_add_pd(fiz0
,tz
);
2681 fjx2
= _mm_add_pd(fjx2
,tx
);
2682 fjy2
= _mm_add_pd(fjy2
,ty
);
2683 fjz2
= _mm_add_pd(fjz2
,tz
);
2687 /**************************
2688 * CALCULATE INTERACTIONS *
2689 **************************/
2691 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2694 r10
= _mm_mul_pd(rsq10
,rinv10
);
2696 /* EWALD ELECTROSTATICS */
2698 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2699 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
2700 ewitab
= _mm_cvttpd_epi32(ewrt
);
2701 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2702 ewitab
= _mm_slli_epi32(ewitab
,2);
2703 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2704 ewtabD
= _mm_setzero_pd();
2705 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2706 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2707 ewtabFn
= _mm_setzero_pd();
2708 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2709 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2710 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2711 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(rinv10
,velec
));
2712 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
2714 d
= _mm_sub_pd(r10
,rswitch
);
2715 d
= _mm_max_pd(d
,_mm_setzero_pd());
2716 d2
= _mm_mul_pd(d
,d
);
2717 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2719 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2721 /* Evaluate switch function */
2722 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2723 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv10
,_mm_mul_pd(velec
,dsw
)) );
2724 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
2728 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2730 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2732 /* Calculate temporary vectorial force */
2733 tx
= _mm_mul_pd(fscal
,dx10
);
2734 ty
= _mm_mul_pd(fscal
,dy10
);
2735 tz
= _mm_mul_pd(fscal
,dz10
);
2737 /* Update vectorial force */
2738 fix1
= _mm_add_pd(fix1
,tx
);
2739 fiy1
= _mm_add_pd(fiy1
,ty
);
2740 fiz1
= _mm_add_pd(fiz1
,tz
);
2742 fjx0
= _mm_add_pd(fjx0
,tx
);
2743 fjy0
= _mm_add_pd(fjy0
,ty
);
2744 fjz0
= _mm_add_pd(fjz0
,tz
);
2748 /**************************
2749 * CALCULATE INTERACTIONS *
2750 **************************/
2752 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2755 r11
= _mm_mul_pd(rsq11
,rinv11
);
2757 /* EWALD ELECTROSTATICS */
2759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2760 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2761 ewitab
= _mm_cvttpd_epi32(ewrt
);
2762 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2763 ewitab
= _mm_slli_epi32(ewitab
,2);
2764 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2765 ewtabD
= _mm_setzero_pd();
2766 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2767 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2768 ewtabFn
= _mm_setzero_pd();
2769 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2770 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2771 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2772 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
2773 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2775 d
= _mm_sub_pd(r11
,rswitch
);
2776 d
= _mm_max_pd(d
,_mm_setzero_pd());
2777 d2
= _mm_mul_pd(d
,d
);
2778 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2780 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2782 /* Evaluate switch function */
2783 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2784 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
2785 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2789 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2791 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2793 /* Calculate temporary vectorial force */
2794 tx
= _mm_mul_pd(fscal
,dx11
);
2795 ty
= _mm_mul_pd(fscal
,dy11
);
2796 tz
= _mm_mul_pd(fscal
,dz11
);
2798 /* Update vectorial force */
2799 fix1
= _mm_add_pd(fix1
,tx
);
2800 fiy1
= _mm_add_pd(fiy1
,ty
);
2801 fiz1
= _mm_add_pd(fiz1
,tz
);
2803 fjx1
= _mm_add_pd(fjx1
,tx
);
2804 fjy1
= _mm_add_pd(fjy1
,ty
);
2805 fjz1
= _mm_add_pd(fjz1
,tz
);
2809 /**************************
2810 * CALCULATE INTERACTIONS *
2811 **************************/
2813 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2816 r12
= _mm_mul_pd(rsq12
,rinv12
);
2818 /* EWALD ELECTROSTATICS */
2820 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2821 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2822 ewitab
= _mm_cvttpd_epi32(ewrt
);
2823 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2824 ewitab
= _mm_slli_epi32(ewitab
,2);
2825 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2826 ewtabD
= _mm_setzero_pd();
2827 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2828 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2829 ewtabFn
= _mm_setzero_pd();
2830 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2831 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2832 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2833 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
2834 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2836 d
= _mm_sub_pd(r12
,rswitch
);
2837 d
= _mm_max_pd(d
,_mm_setzero_pd());
2838 d2
= _mm_mul_pd(d
,d
);
2839 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2841 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2843 /* Evaluate switch function */
2844 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2845 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
2846 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2850 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2852 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2854 /* Calculate temporary vectorial force */
2855 tx
= _mm_mul_pd(fscal
,dx12
);
2856 ty
= _mm_mul_pd(fscal
,dy12
);
2857 tz
= _mm_mul_pd(fscal
,dz12
);
2859 /* Update vectorial force */
2860 fix1
= _mm_add_pd(fix1
,tx
);
2861 fiy1
= _mm_add_pd(fiy1
,ty
);
2862 fiz1
= _mm_add_pd(fiz1
,tz
);
2864 fjx2
= _mm_add_pd(fjx2
,tx
);
2865 fjy2
= _mm_add_pd(fjy2
,ty
);
2866 fjz2
= _mm_add_pd(fjz2
,tz
);
2870 /**************************
2871 * CALCULATE INTERACTIONS *
2872 **************************/
2874 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2877 r20
= _mm_mul_pd(rsq20
,rinv20
);
2879 /* EWALD ELECTROSTATICS */
2881 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2882 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
2883 ewitab
= _mm_cvttpd_epi32(ewrt
);
2884 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2885 ewitab
= _mm_slli_epi32(ewitab
,2);
2886 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2887 ewtabD
= _mm_setzero_pd();
2888 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2889 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2890 ewtabFn
= _mm_setzero_pd();
2891 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2892 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2893 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2894 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(rinv20
,velec
));
2895 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
2897 d
= _mm_sub_pd(r20
,rswitch
);
2898 d
= _mm_max_pd(d
,_mm_setzero_pd());
2899 d2
= _mm_mul_pd(d
,d
);
2900 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2902 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2904 /* Evaluate switch function */
2905 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2906 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv20
,_mm_mul_pd(velec
,dsw
)) );
2907 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
2911 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2913 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2915 /* Calculate temporary vectorial force */
2916 tx
= _mm_mul_pd(fscal
,dx20
);
2917 ty
= _mm_mul_pd(fscal
,dy20
);
2918 tz
= _mm_mul_pd(fscal
,dz20
);
2920 /* Update vectorial force */
2921 fix2
= _mm_add_pd(fix2
,tx
);
2922 fiy2
= _mm_add_pd(fiy2
,ty
);
2923 fiz2
= _mm_add_pd(fiz2
,tz
);
2925 fjx0
= _mm_add_pd(fjx0
,tx
);
2926 fjy0
= _mm_add_pd(fjy0
,ty
);
2927 fjz0
= _mm_add_pd(fjz0
,tz
);
2931 /**************************
2932 * CALCULATE INTERACTIONS *
2933 **************************/
2935 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2938 r21
= _mm_mul_pd(rsq21
,rinv21
);
2940 /* EWALD ELECTROSTATICS */
2942 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2943 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2944 ewitab
= _mm_cvttpd_epi32(ewrt
);
2945 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2946 ewitab
= _mm_slli_epi32(ewitab
,2);
2947 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2948 ewtabD
= _mm_setzero_pd();
2949 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2950 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2951 ewtabFn
= _mm_setzero_pd();
2952 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2953 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2954 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2955 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
2956 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2958 d
= _mm_sub_pd(r21
,rswitch
);
2959 d
= _mm_max_pd(d
,_mm_setzero_pd());
2960 d2
= _mm_mul_pd(d
,d
);
2961 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2963 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2965 /* Evaluate switch function */
2966 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2967 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
2968 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2972 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2974 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2976 /* Calculate temporary vectorial force */
2977 tx
= _mm_mul_pd(fscal
,dx21
);
2978 ty
= _mm_mul_pd(fscal
,dy21
);
2979 tz
= _mm_mul_pd(fscal
,dz21
);
2981 /* Update vectorial force */
2982 fix2
= _mm_add_pd(fix2
,tx
);
2983 fiy2
= _mm_add_pd(fiy2
,ty
);
2984 fiz2
= _mm_add_pd(fiz2
,tz
);
2986 fjx1
= _mm_add_pd(fjx1
,tx
);
2987 fjy1
= _mm_add_pd(fjy1
,ty
);
2988 fjz1
= _mm_add_pd(fjz1
,tz
);
2992 /**************************
2993 * CALCULATE INTERACTIONS *
2994 **************************/
2996 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2999 r22
= _mm_mul_pd(rsq22
,rinv22
);
3001 /* EWALD ELECTROSTATICS */
3003 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3004 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
3005 ewitab
= _mm_cvttpd_epi32(ewrt
);
3006 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
3007 ewitab
= _mm_slli_epi32(ewitab
,2);
3008 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
3009 ewtabD
= _mm_setzero_pd();
3010 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
3011 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
3012 ewtabFn
= _mm_setzero_pd();
3013 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
3014 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
3015 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
3016 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
3017 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
3019 d
= _mm_sub_pd(r22
,rswitch
);
3020 d
= _mm_max_pd(d
,_mm_setzero_pd());
3021 d2
= _mm_mul_pd(d
,d
);
3022 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
3024 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
3026 /* Evaluate switch function */
3027 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3028 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
3029 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
3033 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
3035 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
3037 /* Calculate temporary vectorial force */
3038 tx
= _mm_mul_pd(fscal
,dx22
);
3039 ty
= _mm_mul_pd(fscal
,dy22
);
3040 tz
= _mm_mul_pd(fscal
,dz22
);
3042 /* Update vectorial force */
3043 fix2
= _mm_add_pd(fix2
,tx
);
3044 fiy2
= _mm_add_pd(fiy2
,ty
);
3045 fiz2
= _mm_add_pd(fiz2
,tz
);
3047 fjx2
= _mm_add_pd(fjx2
,tx
);
3048 fjy2
= _mm_add_pd(fjy2
,ty
);
3049 fjz2
= _mm_add_pd(fjz2
,tz
);
3053 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
3055 /* Inner loop uses 573 flops */
3058 /* End of innermost loop */
3060 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
3061 f
+i_coord_offset
,fshift
+i_shift_offset
);
3063 /* Increment number of inner iterations */
3064 inneriter
+= j_index_end
- j_index_start
;
3066 /* Outer loop uses 18 flops */
3069 /* Increment number of outer iterations */
3072 /* Update outer/inner flops */
3074 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*573);