2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: None
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_sse2_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
82 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
84 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
85 int vdwjidx1A
,vdwjidx1B
;
86 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
87 int vdwjidx2A
,vdwjidx2B
;
88 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
89 int vdwjidx3A
,vdwjidx3B
;
90 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
91 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
92 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
93 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
94 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
95 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
96 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
97 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
98 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
99 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
100 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
103 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
105 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
106 real rswitch_scalar
,d_scalar
;
107 __m128d dummy_mask
,cutoff_mask
;
108 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
109 __m128d one
= _mm_set1_pd(1.0);
110 __m128d two
= _mm_set1_pd(2.0);
116 jindex
= nlist
->jindex
;
118 shiftidx
= nlist
->shift
;
120 shiftvec
= fr
->shift_vec
[0];
121 fshift
= fr
->fshift
[0];
122 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
123 charge
= mdatoms
->chargeA
;
125 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
126 ewtab
= fr
->ic
->tabq_coul_FDV0
;
127 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
128 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
130 /* Setup water-specific parameters */
131 inr
= nlist
->iinr
[0];
132 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
133 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
134 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
136 jq1
= _mm_set1_pd(charge
[inr
+1]);
137 jq2
= _mm_set1_pd(charge
[inr
+2]);
138 jq3
= _mm_set1_pd(charge
[inr
+3]);
139 qq11
= _mm_mul_pd(iq1
,jq1
);
140 qq12
= _mm_mul_pd(iq1
,jq2
);
141 qq13
= _mm_mul_pd(iq1
,jq3
);
142 qq21
= _mm_mul_pd(iq2
,jq1
);
143 qq22
= _mm_mul_pd(iq2
,jq2
);
144 qq23
= _mm_mul_pd(iq2
,jq3
);
145 qq31
= _mm_mul_pd(iq3
,jq1
);
146 qq32
= _mm_mul_pd(iq3
,jq2
);
147 qq33
= _mm_mul_pd(iq3
,jq3
);
149 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
150 rcutoff_scalar
= fr
->ic
->rcoulomb
;
151 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
152 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
154 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
155 rswitch
= _mm_set1_pd(rswitch_scalar
);
156 /* Setup switch parameters */
157 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
158 d
= _mm_set1_pd(d_scalar
);
159 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
160 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
161 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
162 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
163 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
164 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
166 /* Avoid stupid compiler warnings */
174 /* Start outer loop over neighborlists */
175 for(iidx
=0; iidx
<nri
; iidx
++)
177 /* Load shift vector for this list */
178 i_shift_offset
= DIM
*shiftidx
[iidx
];
180 /* Load limits for loop over neighbors */
181 j_index_start
= jindex
[iidx
];
182 j_index_end
= jindex
[iidx
+1];
184 /* Get outer coordinate index */
186 i_coord_offset
= DIM
*inr
;
188 /* Load i particle coords and add shift vector */
189 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
190 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
192 fix1
= _mm_setzero_pd();
193 fiy1
= _mm_setzero_pd();
194 fiz1
= _mm_setzero_pd();
195 fix2
= _mm_setzero_pd();
196 fiy2
= _mm_setzero_pd();
197 fiz2
= _mm_setzero_pd();
198 fix3
= _mm_setzero_pd();
199 fiy3
= _mm_setzero_pd();
200 fiz3
= _mm_setzero_pd();
202 /* Reset potential sums */
203 velecsum
= _mm_setzero_pd();
205 /* Start inner kernel loop */
206 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
209 /* Get j neighbor index, and coordinate index */
212 j_coord_offsetA
= DIM
*jnrA
;
213 j_coord_offsetB
= DIM
*jnrB
;
215 /* load j atom coordinates */
216 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
+DIM
,x
+j_coord_offsetB
+DIM
,
217 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
219 /* Calculate displacement vector */
220 dx11
= _mm_sub_pd(ix1
,jx1
);
221 dy11
= _mm_sub_pd(iy1
,jy1
);
222 dz11
= _mm_sub_pd(iz1
,jz1
);
223 dx12
= _mm_sub_pd(ix1
,jx2
);
224 dy12
= _mm_sub_pd(iy1
,jy2
);
225 dz12
= _mm_sub_pd(iz1
,jz2
);
226 dx13
= _mm_sub_pd(ix1
,jx3
);
227 dy13
= _mm_sub_pd(iy1
,jy3
);
228 dz13
= _mm_sub_pd(iz1
,jz3
);
229 dx21
= _mm_sub_pd(ix2
,jx1
);
230 dy21
= _mm_sub_pd(iy2
,jy1
);
231 dz21
= _mm_sub_pd(iz2
,jz1
);
232 dx22
= _mm_sub_pd(ix2
,jx2
);
233 dy22
= _mm_sub_pd(iy2
,jy2
);
234 dz22
= _mm_sub_pd(iz2
,jz2
);
235 dx23
= _mm_sub_pd(ix2
,jx3
);
236 dy23
= _mm_sub_pd(iy2
,jy3
);
237 dz23
= _mm_sub_pd(iz2
,jz3
);
238 dx31
= _mm_sub_pd(ix3
,jx1
);
239 dy31
= _mm_sub_pd(iy3
,jy1
);
240 dz31
= _mm_sub_pd(iz3
,jz1
);
241 dx32
= _mm_sub_pd(ix3
,jx2
);
242 dy32
= _mm_sub_pd(iy3
,jy2
);
243 dz32
= _mm_sub_pd(iz3
,jz2
);
244 dx33
= _mm_sub_pd(ix3
,jx3
);
245 dy33
= _mm_sub_pd(iy3
,jy3
);
246 dz33
= _mm_sub_pd(iz3
,jz3
);
248 /* Calculate squared distance and things based on it */
249 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
250 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
251 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
252 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
253 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
254 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
255 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
256 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
257 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
259 rinv11
= sse2_invsqrt_d(rsq11
);
260 rinv12
= sse2_invsqrt_d(rsq12
);
261 rinv13
= sse2_invsqrt_d(rsq13
);
262 rinv21
= sse2_invsqrt_d(rsq21
);
263 rinv22
= sse2_invsqrt_d(rsq22
);
264 rinv23
= sse2_invsqrt_d(rsq23
);
265 rinv31
= sse2_invsqrt_d(rsq31
);
266 rinv32
= sse2_invsqrt_d(rsq32
);
267 rinv33
= sse2_invsqrt_d(rsq33
);
269 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
270 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
271 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
272 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
273 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
274 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
275 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
276 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
277 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
279 fjx1
= _mm_setzero_pd();
280 fjy1
= _mm_setzero_pd();
281 fjz1
= _mm_setzero_pd();
282 fjx2
= _mm_setzero_pd();
283 fjy2
= _mm_setzero_pd();
284 fjz2
= _mm_setzero_pd();
285 fjx3
= _mm_setzero_pd();
286 fjy3
= _mm_setzero_pd();
287 fjz3
= _mm_setzero_pd();
289 /**************************
290 * CALCULATE INTERACTIONS *
291 **************************/
293 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
296 r11
= _mm_mul_pd(rsq11
,rinv11
);
298 /* EWALD ELECTROSTATICS */
300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
301 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
302 ewitab
= _mm_cvttpd_epi32(ewrt
);
303 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
304 ewitab
= _mm_slli_epi32(ewitab
,2);
305 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
306 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
307 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
308 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
309 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
310 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
311 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
312 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
313 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
314 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
316 d
= _mm_sub_pd(r11
,rswitch
);
317 d
= _mm_max_pd(d
,_mm_setzero_pd());
318 d2
= _mm_mul_pd(d
,d
);
319 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
321 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
323 /* Evaluate switch function */
324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
325 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
326 velec
= _mm_mul_pd(velec
,sw
);
327 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velec
= _mm_and_pd(velec
,cutoff_mask
);
331 velecsum
= _mm_add_pd(velecsum
,velec
);
335 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
337 /* Calculate temporary vectorial force */
338 tx
= _mm_mul_pd(fscal
,dx11
);
339 ty
= _mm_mul_pd(fscal
,dy11
);
340 tz
= _mm_mul_pd(fscal
,dz11
);
342 /* Update vectorial force */
343 fix1
= _mm_add_pd(fix1
,tx
);
344 fiy1
= _mm_add_pd(fiy1
,ty
);
345 fiz1
= _mm_add_pd(fiz1
,tz
);
347 fjx1
= _mm_add_pd(fjx1
,tx
);
348 fjy1
= _mm_add_pd(fjy1
,ty
);
349 fjz1
= _mm_add_pd(fjz1
,tz
);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
360 r12
= _mm_mul_pd(rsq12
,rinv12
);
362 /* EWALD ELECTROSTATICS */
364 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
365 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
366 ewitab
= _mm_cvttpd_epi32(ewrt
);
367 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
368 ewitab
= _mm_slli_epi32(ewitab
,2);
369 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
370 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
371 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
372 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
373 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
374 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
375 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
376 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
377 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
378 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
380 d
= _mm_sub_pd(r12
,rswitch
);
381 d
= _mm_max_pd(d
,_mm_setzero_pd());
382 d2
= _mm_mul_pd(d
,d
);
383 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
385 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
387 /* Evaluate switch function */
388 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
389 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
390 velec
= _mm_mul_pd(velec
,sw
);
391 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velec
= _mm_and_pd(velec
,cutoff_mask
);
395 velecsum
= _mm_add_pd(velecsum
,velec
);
399 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
401 /* Calculate temporary vectorial force */
402 tx
= _mm_mul_pd(fscal
,dx12
);
403 ty
= _mm_mul_pd(fscal
,dy12
);
404 tz
= _mm_mul_pd(fscal
,dz12
);
406 /* Update vectorial force */
407 fix1
= _mm_add_pd(fix1
,tx
);
408 fiy1
= _mm_add_pd(fiy1
,ty
);
409 fiz1
= _mm_add_pd(fiz1
,tz
);
411 fjx2
= _mm_add_pd(fjx2
,tx
);
412 fjy2
= _mm_add_pd(fjy2
,ty
);
413 fjz2
= _mm_add_pd(fjz2
,tz
);
417 /**************************
418 * CALCULATE INTERACTIONS *
419 **************************/
421 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
424 r13
= _mm_mul_pd(rsq13
,rinv13
);
426 /* EWALD ELECTROSTATICS */
428 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
429 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
430 ewitab
= _mm_cvttpd_epi32(ewrt
);
431 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
432 ewitab
= _mm_slli_epi32(ewitab
,2);
433 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
434 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
435 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
436 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
437 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
438 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
439 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
440 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
441 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
442 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
444 d
= _mm_sub_pd(r13
,rswitch
);
445 d
= _mm_max_pd(d
,_mm_setzero_pd());
446 d2
= _mm_mul_pd(d
,d
);
447 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
449 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
451 /* Evaluate switch function */
452 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
453 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
454 velec
= _mm_mul_pd(velec
,sw
);
455 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velec
= _mm_and_pd(velec
,cutoff_mask
);
459 velecsum
= _mm_add_pd(velecsum
,velec
);
463 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
465 /* Calculate temporary vectorial force */
466 tx
= _mm_mul_pd(fscal
,dx13
);
467 ty
= _mm_mul_pd(fscal
,dy13
);
468 tz
= _mm_mul_pd(fscal
,dz13
);
470 /* Update vectorial force */
471 fix1
= _mm_add_pd(fix1
,tx
);
472 fiy1
= _mm_add_pd(fiy1
,ty
);
473 fiz1
= _mm_add_pd(fiz1
,tz
);
475 fjx3
= _mm_add_pd(fjx3
,tx
);
476 fjy3
= _mm_add_pd(fjy3
,ty
);
477 fjz3
= _mm_add_pd(fjz3
,tz
);
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
488 r21
= _mm_mul_pd(rsq21
,rinv21
);
490 /* EWALD ELECTROSTATICS */
492 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
493 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
494 ewitab
= _mm_cvttpd_epi32(ewrt
);
495 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
496 ewitab
= _mm_slli_epi32(ewitab
,2);
497 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
498 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
499 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
500 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
501 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
502 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
503 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
504 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
505 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
506 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
508 d
= _mm_sub_pd(r21
,rswitch
);
509 d
= _mm_max_pd(d
,_mm_setzero_pd());
510 d2
= _mm_mul_pd(d
,d
);
511 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
513 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
515 /* Evaluate switch function */
516 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
517 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
518 velec
= _mm_mul_pd(velec
,sw
);
519 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
521 /* Update potential sum for this i atom from the interaction with this j atom. */
522 velec
= _mm_and_pd(velec
,cutoff_mask
);
523 velecsum
= _mm_add_pd(velecsum
,velec
);
527 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
529 /* Calculate temporary vectorial force */
530 tx
= _mm_mul_pd(fscal
,dx21
);
531 ty
= _mm_mul_pd(fscal
,dy21
);
532 tz
= _mm_mul_pd(fscal
,dz21
);
534 /* Update vectorial force */
535 fix2
= _mm_add_pd(fix2
,tx
);
536 fiy2
= _mm_add_pd(fiy2
,ty
);
537 fiz2
= _mm_add_pd(fiz2
,tz
);
539 fjx1
= _mm_add_pd(fjx1
,tx
);
540 fjy1
= _mm_add_pd(fjy1
,ty
);
541 fjz1
= _mm_add_pd(fjz1
,tz
);
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
552 r22
= _mm_mul_pd(rsq22
,rinv22
);
554 /* EWALD ELECTROSTATICS */
556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
557 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
558 ewitab
= _mm_cvttpd_epi32(ewrt
);
559 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
560 ewitab
= _mm_slli_epi32(ewitab
,2);
561 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
562 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
563 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
564 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
565 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
566 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
567 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
568 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
569 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
570 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
572 d
= _mm_sub_pd(r22
,rswitch
);
573 d
= _mm_max_pd(d
,_mm_setzero_pd());
574 d2
= _mm_mul_pd(d
,d
);
575 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
577 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
579 /* Evaluate switch function */
580 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
581 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
582 velec
= _mm_mul_pd(velec
,sw
);
583 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
585 /* Update potential sum for this i atom from the interaction with this j atom. */
586 velec
= _mm_and_pd(velec
,cutoff_mask
);
587 velecsum
= _mm_add_pd(velecsum
,velec
);
591 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
593 /* Calculate temporary vectorial force */
594 tx
= _mm_mul_pd(fscal
,dx22
);
595 ty
= _mm_mul_pd(fscal
,dy22
);
596 tz
= _mm_mul_pd(fscal
,dz22
);
598 /* Update vectorial force */
599 fix2
= _mm_add_pd(fix2
,tx
);
600 fiy2
= _mm_add_pd(fiy2
,ty
);
601 fiz2
= _mm_add_pd(fiz2
,tz
);
603 fjx2
= _mm_add_pd(fjx2
,tx
);
604 fjy2
= _mm_add_pd(fjy2
,ty
);
605 fjz2
= _mm_add_pd(fjz2
,tz
);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
616 r23
= _mm_mul_pd(rsq23
,rinv23
);
618 /* EWALD ELECTROSTATICS */
620 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
621 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
622 ewitab
= _mm_cvttpd_epi32(ewrt
);
623 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
624 ewitab
= _mm_slli_epi32(ewitab
,2);
625 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
626 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
627 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
628 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
629 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
630 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
631 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
632 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
633 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
634 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
636 d
= _mm_sub_pd(r23
,rswitch
);
637 d
= _mm_max_pd(d
,_mm_setzero_pd());
638 d2
= _mm_mul_pd(d
,d
);
639 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
641 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
643 /* Evaluate switch function */
644 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
645 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
646 velec
= _mm_mul_pd(velec
,sw
);
647 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velec
= _mm_and_pd(velec
,cutoff_mask
);
651 velecsum
= _mm_add_pd(velecsum
,velec
);
655 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
657 /* Calculate temporary vectorial force */
658 tx
= _mm_mul_pd(fscal
,dx23
);
659 ty
= _mm_mul_pd(fscal
,dy23
);
660 tz
= _mm_mul_pd(fscal
,dz23
);
662 /* Update vectorial force */
663 fix2
= _mm_add_pd(fix2
,tx
);
664 fiy2
= _mm_add_pd(fiy2
,ty
);
665 fiz2
= _mm_add_pd(fiz2
,tz
);
667 fjx3
= _mm_add_pd(fjx3
,tx
);
668 fjy3
= _mm_add_pd(fjy3
,ty
);
669 fjz3
= _mm_add_pd(fjz3
,tz
);
673 /**************************
674 * CALCULATE INTERACTIONS *
675 **************************/
677 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
680 r31
= _mm_mul_pd(rsq31
,rinv31
);
682 /* EWALD ELECTROSTATICS */
684 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
685 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
686 ewitab
= _mm_cvttpd_epi32(ewrt
);
687 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
688 ewitab
= _mm_slli_epi32(ewitab
,2);
689 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
690 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
691 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
692 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
693 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
694 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
695 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
696 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
697 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
698 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
700 d
= _mm_sub_pd(r31
,rswitch
);
701 d
= _mm_max_pd(d
,_mm_setzero_pd());
702 d2
= _mm_mul_pd(d
,d
);
703 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
705 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
707 /* Evaluate switch function */
708 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
709 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
710 velec
= _mm_mul_pd(velec
,sw
);
711 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velec
= _mm_and_pd(velec
,cutoff_mask
);
715 velecsum
= _mm_add_pd(velecsum
,velec
);
719 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
721 /* Calculate temporary vectorial force */
722 tx
= _mm_mul_pd(fscal
,dx31
);
723 ty
= _mm_mul_pd(fscal
,dy31
);
724 tz
= _mm_mul_pd(fscal
,dz31
);
726 /* Update vectorial force */
727 fix3
= _mm_add_pd(fix3
,tx
);
728 fiy3
= _mm_add_pd(fiy3
,ty
);
729 fiz3
= _mm_add_pd(fiz3
,tz
);
731 fjx1
= _mm_add_pd(fjx1
,tx
);
732 fjy1
= _mm_add_pd(fjy1
,ty
);
733 fjz1
= _mm_add_pd(fjz1
,tz
);
737 /**************************
738 * CALCULATE INTERACTIONS *
739 **************************/
741 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
744 r32
= _mm_mul_pd(rsq32
,rinv32
);
746 /* EWALD ELECTROSTATICS */
748 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
749 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
750 ewitab
= _mm_cvttpd_epi32(ewrt
);
751 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
752 ewitab
= _mm_slli_epi32(ewitab
,2);
753 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
754 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
755 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
756 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
757 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
758 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
759 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
760 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
761 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
762 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
764 d
= _mm_sub_pd(r32
,rswitch
);
765 d
= _mm_max_pd(d
,_mm_setzero_pd());
766 d2
= _mm_mul_pd(d
,d
);
767 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
769 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
771 /* Evaluate switch function */
772 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
773 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
774 velec
= _mm_mul_pd(velec
,sw
);
775 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
777 /* Update potential sum for this i atom from the interaction with this j atom. */
778 velec
= _mm_and_pd(velec
,cutoff_mask
);
779 velecsum
= _mm_add_pd(velecsum
,velec
);
783 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
785 /* Calculate temporary vectorial force */
786 tx
= _mm_mul_pd(fscal
,dx32
);
787 ty
= _mm_mul_pd(fscal
,dy32
);
788 tz
= _mm_mul_pd(fscal
,dz32
);
790 /* Update vectorial force */
791 fix3
= _mm_add_pd(fix3
,tx
);
792 fiy3
= _mm_add_pd(fiy3
,ty
);
793 fiz3
= _mm_add_pd(fiz3
,tz
);
795 fjx2
= _mm_add_pd(fjx2
,tx
);
796 fjy2
= _mm_add_pd(fjy2
,ty
);
797 fjz2
= _mm_add_pd(fjz2
,tz
);
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
808 r33
= _mm_mul_pd(rsq33
,rinv33
);
810 /* EWALD ELECTROSTATICS */
812 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
813 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
814 ewitab
= _mm_cvttpd_epi32(ewrt
);
815 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
816 ewitab
= _mm_slli_epi32(ewitab
,2);
817 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
818 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
819 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
820 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
821 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
822 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
823 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
824 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
825 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
826 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
828 d
= _mm_sub_pd(r33
,rswitch
);
829 d
= _mm_max_pd(d
,_mm_setzero_pd());
830 d2
= _mm_mul_pd(d
,d
);
831 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
833 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
835 /* Evaluate switch function */
836 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
837 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
838 velec
= _mm_mul_pd(velec
,sw
);
839 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 velec
= _mm_and_pd(velec
,cutoff_mask
);
843 velecsum
= _mm_add_pd(velecsum
,velec
);
847 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
849 /* Calculate temporary vectorial force */
850 tx
= _mm_mul_pd(fscal
,dx33
);
851 ty
= _mm_mul_pd(fscal
,dy33
);
852 tz
= _mm_mul_pd(fscal
,dz33
);
854 /* Update vectorial force */
855 fix3
= _mm_add_pd(fix3
,tx
);
856 fiy3
= _mm_add_pd(fiy3
,ty
);
857 fiz3
= _mm_add_pd(fiz3
,tz
);
859 fjx3
= _mm_add_pd(fjx3
,tx
);
860 fjy3
= _mm_add_pd(fjy3
,ty
);
861 fjz3
= _mm_add_pd(fjz3
,tz
);
865 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
+DIM
,f
+j_coord_offsetB
+DIM
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
867 /* Inner loop uses 585 flops */
874 j_coord_offsetA
= DIM
*jnrA
;
876 /* load j atom coordinates */
877 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
+DIM
,
878 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
880 /* Calculate displacement vector */
881 dx11
= _mm_sub_pd(ix1
,jx1
);
882 dy11
= _mm_sub_pd(iy1
,jy1
);
883 dz11
= _mm_sub_pd(iz1
,jz1
);
884 dx12
= _mm_sub_pd(ix1
,jx2
);
885 dy12
= _mm_sub_pd(iy1
,jy2
);
886 dz12
= _mm_sub_pd(iz1
,jz2
);
887 dx13
= _mm_sub_pd(ix1
,jx3
);
888 dy13
= _mm_sub_pd(iy1
,jy3
);
889 dz13
= _mm_sub_pd(iz1
,jz3
);
890 dx21
= _mm_sub_pd(ix2
,jx1
);
891 dy21
= _mm_sub_pd(iy2
,jy1
);
892 dz21
= _mm_sub_pd(iz2
,jz1
);
893 dx22
= _mm_sub_pd(ix2
,jx2
);
894 dy22
= _mm_sub_pd(iy2
,jy2
);
895 dz22
= _mm_sub_pd(iz2
,jz2
);
896 dx23
= _mm_sub_pd(ix2
,jx3
);
897 dy23
= _mm_sub_pd(iy2
,jy3
);
898 dz23
= _mm_sub_pd(iz2
,jz3
);
899 dx31
= _mm_sub_pd(ix3
,jx1
);
900 dy31
= _mm_sub_pd(iy3
,jy1
);
901 dz31
= _mm_sub_pd(iz3
,jz1
);
902 dx32
= _mm_sub_pd(ix3
,jx2
);
903 dy32
= _mm_sub_pd(iy3
,jy2
);
904 dz32
= _mm_sub_pd(iz3
,jz2
);
905 dx33
= _mm_sub_pd(ix3
,jx3
);
906 dy33
= _mm_sub_pd(iy3
,jy3
);
907 dz33
= _mm_sub_pd(iz3
,jz3
);
909 /* Calculate squared distance and things based on it */
910 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
911 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
912 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
913 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
914 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
915 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
916 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
917 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
918 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
920 rinv11
= sse2_invsqrt_d(rsq11
);
921 rinv12
= sse2_invsqrt_d(rsq12
);
922 rinv13
= sse2_invsqrt_d(rsq13
);
923 rinv21
= sse2_invsqrt_d(rsq21
);
924 rinv22
= sse2_invsqrt_d(rsq22
);
925 rinv23
= sse2_invsqrt_d(rsq23
);
926 rinv31
= sse2_invsqrt_d(rsq31
);
927 rinv32
= sse2_invsqrt_d(rsq32
);
928 rinv33
= sse2_invsqrt_d(rsq33
);
930 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
931 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
932 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
933 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
934 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
935 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
936 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
937 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
938 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
940 fjx1
= _mm_setzero_pd();
941 fjy1
= _mm_setzero_pd();
942 fjz1
= _mm_setzero_pd();
943 fjx2
= _mm_setzero_pd();
944 fjy2
= _mm_setzero_pd();
945 fjz2
= _mm_setzero_pd();
946 fjx3
= _mm_setzero_pd();
947 fjy3
= _mm_setzero_pd();
948 fjz3
= _mm_setzero_pd();
950 /**************************
951 * CALCULATE INTERACTIONS *
952 **************************/
954 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
957 r11
= _mm_mul_pd(rsq11
,rinv11
);
959 /* EWALD ELECTROSTATICS */
961 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
963 ewitab
= _mm_cvttpd_epi32(ewrt
);
964 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
965 ewitab
= _mm_slli_epi32(ewitab
,2);
966 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
967 ewtabD
= _mm_setzero_pd();
968 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
969 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
970 ewtabFn
= _mm_setzero_pd();
971 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
972 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
973 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
974 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
975 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
977 d
= _mm_sub_pd(r11
,rswitch
);
978 d
= _mm_max_pd(d
,_mm_setzero_pd());
979 d2
= _mm_mul_pd(d
,d
);
980 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
982 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
984 /* Evaluate switch function */
985 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
986 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
987 velec
= _mm_mul_pd(velec
,sw
);
988 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec
= _mm_and_pd(velec
,cutoff_mask
);
992 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
993 velecsum
= _mm_add_pd(velecsum
,velec
);
997 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
999 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1001 /* Calculate temporary vectorial force */
1002 tx
= _mm_mul_pd(fscal
,dx11
);
1003 ty
= _mm_mul_pd(fscal
,dy11
);
1004 tz
= _mm_mul_pd(fscal
,dz11
);
1006 /* Update vectorial force */
1007 fix1
= _mm_add_pd(fix1
,tx
);
1008 fiy1
= _mm_add_pd(fiy1
,ty
);
1009 fiz1
= _mm_add_pd(fiz1
,tz
);
1011 fjx1
= _mm_add_pd(fjx1
,tx
);
1012 fjy1
= _mm_add_pd(fjy1
,ty
);
1013 fjz1
= _mm_add_pd(fjz1
,tz
);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1024 r12
= _mm_mul_pd(rsq12
,rinv12
);
1026 /* EWALD ELECTROSTATICS */
1028 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1029 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1030 ewitab
= _mm_cvttpd_epi32(ewrt
);
1031 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1032 ewitab
= _mm_slli_epi32(ewitab
,2);
1033 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1034 ewtabD
= _mm_setzero_pd();
1035 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1036 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1037 ewtabFn
= _mm_setzero_pd();
1038 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1039 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1040 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1041 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
1042 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1044 d
= _mm_sub_pd(r12
,rswitch
);
1045 d
= _mm_max_pd(d
,_mm_setzero_pd());
1046 d2
= _mm_mul_pd(d
,d
);
1047 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1049 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1051 /* Evaluate switch function */
1052 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1053 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
1054 velec
= _mm_mul_pd(velec
,sw
);
1055 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1057 /* Update potential sum for this i atom from the interaction with this j atom. */
1058 velec
= _mm_and_pd(velec
,cutoff_mask
);
1059 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1060 velecsum
= _mm_add_pd(velecsum
,velec
);
1064 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1066 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1068 /* Calculate temporary vectorial force */
1069 tx
= _mm_mul_pd(fscal
,dx12
);
1070 ty
= _mm_mul_pd(fscal
,dy12
);
1071 tz
= _mm_mul_pd(fscal
,dz12
);
1073 /* Update vectorial force */
1074 fix1
= _mm_add_pd(fix1
,tx
);
1075 fiy1
= _mm_add_pd(fiy1
,ty
);
1076 fiz1
= _mm_add_pd(fiz1
,tz
);
1078 fjx2
= _mm_add_pd(fjx2
,tx
);
1079 fjy2
= _mm_add_pd(fjy2
,ty
);
1080 fjz2
= _mm_add_pd(fjz2
,tz
);
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1091 r13
= _mm_mul_pd(rsq13
,rinv13
);
1093 /* EWALD ELECTROSTATICS */
1095 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1096 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1097 ewitab
= _mm_cvttpd_epi32(ewrt
);
1098 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1099 ewitab
= _mm_slli_epi32(ewitab
,2);
1100 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1101 ewtabD
= _mm_setzero_pd();
1102 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1103 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1104 ewtabFn
= _mm_setzero_pd();
1105 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1106 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1107 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1108 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
1109 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1111 d
= _mm_sub_pd(r13
,rswitch
);
1112 d
= _mm_max_pd(d
,_mm_setzero_pd());
1113 d2
= _mm_mul_pd(d
,d
);
1114 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1116 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1118 /* Evaluate switch function */
1119 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1120 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
1121 velec
= _mm_mul_pd(velec
,sw
);
1122 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1124 /* Update potential sum for this i atom from the interaction with this j atom. */
1125 velec
= _mm_and_pd(velec
,cutoff_mask
);
1126 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1127 velecsum
= _mm_add_pd(velecsum
,velec
);
1131 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1133 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1135 /* Calculate temporary vectorial force */
1136 tx
= _mm_mul_pd(fscal
,dx13
);
1137 ty
= _mm_mul_pd(fscal
,dy13
);
1138 tz
= _mm_mul_pd(fscal
,dz13
);
1140 /* Update vectorial force */
1141 fix1
= _mm_add_pd(fix1
,tx
);
1142 fiy1
= _mm_add_pd(fiy1
,ty
);
1143 fiz1
= _mm_add_pd(fiz1
,tz
);
1145 fjx3
= _mm_add_pd(fjx3
,tx
);
1146 fjy3
= _mm_add_pd(fjy3
,ty
);
1147 fjz3
= _mm_add_pd(fjz3
,tz
);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1158 r21
= _mm_mul_pd(rsq21
,rinv21
);
1160 /* EWALD ELECTROSTATICS */
1162 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1163 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1164 ewitab
= _mm_cvttpd_epi32(ewrt
);
1165 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1166 ewitab
= _mm_slli_epi32(ewitab
,2);
1167 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1168 ewtabD
= _mm_setzero_pd();
1169 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1170 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1171 ewtabFn
= _mm_setzero_pd();
1172 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1173 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1174 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1175 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
1176 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1178 d
= _mm_sub_pd(r21
,rswitch
);
1179 d
= _mm_max_pd(d
,_mm_setzero_pd());
1180 d2
= _mm_mul_pd(d
,d
);
1181 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1183 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1185 /* Evaluate switch function */
1186 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1187 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
1188 velec
= _mm_mul_pd(velec
,sw
);
1189 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1191 /* Update potential sum for this i atom from the interaction with this j atom. */
1192 velec
= _mm_and_pd(velec
,cutoff_mask
);
1193 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1194 velecsum
= _mm_add_pd(velecsum
,velec
);
1198 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1200 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1202 /* Calculate temporary vectorial force */
1203 tx
= _mm_mul_pd(fscal
,dx21
);
1204 ty
= _mm_mul_pd(fscal
,dy21
);
1205 tz
= _mm_mul_pd(fscal
,dz21
);
1207 /* Update vectorial force */
1208 fix2
= _mm_add_pd(fix2
,tx
);
1209 fiy2
= _mm_add_pd(fiy2
,ty
);
1210 fiz2
= _mm_add_pd(fiz2
,tz
);
1212 fjx1
= _mm_add_pd(fjx1
,tx
);
1213 fjy1
= _mm_add_pd(fjy1
,ty
);
1214 fjz1
= _mm_add_pd(fjz1
,tz
);
1218 /**************************
1219 * CALCULATE INTERACTIONS *
1220 **************************/
1222 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1225 r22
= _mm_mul_pd(rsq22
,rinv22
);
1227 /* EWALD ELECTROSTATICS */
1229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1230 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1231 ewitab
= _mm_cvttpd_epi32(ewrt
);
1232 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1233 ewitab
= _mm_slli_epi32(ewitab
,2);
1234 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1235 ewtabD
= _mm_setzero_pd();
1236 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1237 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1238 ewtabFn
= _mm_setzero_pd();
1239 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1240 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1241 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1242 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
1243 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1245 d
= _mm_sub_pd(r22
,rswitch
);
1246 d
= _mm_max_pd(d
,_mm_setzero_pd());
1247 d2
= _mm_mul_pd(d
,d
);
1248 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1250 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1252 /* Evaluate switch function */
1253 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1254 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
1255 velec
= _mm_mul_pd(velec
,sw
);
1256 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec
= _mm_and_pd(velec
,cutoff_mask
);
1260 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1261 velecsum
= _mm_add_pd(velecsum
,velec
);
1265 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1267 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1269 /* Calculate temporary vectorial force */
1270 tx
= _mm_mul_pd(fscal
,dx22
);
1271 ty
= _mm_mul_pd(fscal
,dy22
);
1272 tz
= _mm_mul_pd(fscal
,dz22
);
1274 /* Update vectorial force */
1275 fix2
= _mm_add_pd(fix2
,tx
);
1276 fiy2
= _mm_add_pd(fiy2
,ty
);
1277 fiz2
= _mm_add_pd(fiz2
,tz
);
1279 fjx2
= _mm_add_pd(fjx2
,tx
);
1280 fjy2
= _mm_add_pd(fjy2
,ty
);
1281 fjz2
= _mm_add_pd(fjz2
,tz
);
1285 /**************************
1286 * CALCULATE INTERACTIONS *
1287 **************************/
1289 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
1292 r23
= _mm_mul_pd(rsq23
,rinv23
);
1294 /* EWALD ELECTROSTATICS */
1296 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1297 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1298 ewitab
= _mm_cvttpd_epi32(ewrt
);
1299 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1300 ewitab
= _mm_slli_epi32(ewitab
,2);
1301 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1302 ewtabD
= _mm_setzero_pd();
1303 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1304 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1305 ewtabFn
= _mm_setzero_pd();
1306 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1307 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1308 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1309 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
1310 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1312 d
= _mm_sub_pd(r23
,rswitch
);
1313 d
= _mm_max_pd(d
,_mm_setzero_pd());
1314 d2
= _mm_mul_pd(d
,d
);
1315 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1317 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1319 /* Evaluate switch function */
1320 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1321 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
1322 velec
= _mm_mul_pd(velec
,sw
);
1323 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
1325 /* Update potential sum for this i atom from the interaction with this j atom. */
1326 velec
= _mm_and_pd(velec
,cutoff_mask
);
1327 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1328 velecsum
= _mm_add_pd(velecsum
,velec
);
1332 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1334 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1336 /* Calculate temporary vectorial force */
1337 tx
= _mm_mul_pd(fscal
,dx23
);
1338 ty
= _mm_mul_pd(fscal
,dy23
);
1339 tz
= _mm_mul_pd(fscal
,dz23
);
1341 /* Update vectorial force */
1342 fix2
= _mm_add_pd(fix2
,tx
);
1343 fiy2
= _mm_add_pd(fiy2
,ty
);
1344 fiz2
= _mm_add_pd(fiz2
,tz
);
1346 fjx3
= _mm_add_pd(fjx3
,tx
);
1347 fjy3
= _mm_add_pd(fjy3
,ty
);
1348 fjz3
= _mm_add_pd(fjz3
,tz
);
1352 /**************************
1353 * CALCULATE INTERACTIONS *
1354 **************************/
1356 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
1359 r31
= _mm_mul_pd(rsq31
,rinv31
);
1361 /* EWALD ELECTROSTATICS */
1363 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1364 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1365 ewitab
= _mm_cvttpd_epi32(ewrt
);
1366 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1367 ewitab
= _mm_slli_epi32(ewitab
,2);
1368 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1369 ewtabD
= _mm_setzero_pd();
1370 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1371 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1372 ewtabFn
= _mm_setzero_pd();
1373 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1374 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1375 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1376 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
1377 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1379 d
= _mm_sub_pd(r31
,rswitch
);
1380 d
= _mm_max_pd(d
,_mm_setzero_pd());
1381 d2
= _mm_mul_pd(d
,d
);
1382 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1384 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1386 /* Evaluate switch function */
1387 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1388 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
1389 velec
= _mm_mul_pd(velec
,sw
);
1390 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
1392 /* Update potential sum for this i atom from the interaction with this j atom. */
1393 velec
= _mm_and_pd(velec
,cutoff_mask
);
1394 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1395 velecsum
= _mm_add_pd(velecsum
,velec
);
1399 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1401 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1403 /* Calculate temporary vectorial force */
1404 tx
= _mm_mul_pd(fscal
,dx31
);
1405 ty
= _mm_mul_pd(fscal
,dy31
);
1406 tz
= _mm_mul_pd(fscal
,dz31
);
1408 /* Update vectorial force */
1409 fix3
= _mm_add_pd(fix3
,tx
);
1410 fiy3
= _mm_add_pd(fiy3
,ty
);
1411 fiz3
= _mm_add_pd(fiz3
,tz
);
1413 fjx1
= _mm_add_pd(fjx1
,tx
);
1414 fjy1
= _mm_add_pd(fjy1
,ty
);
1415 fjz1
= _mm_add_pd(fjz1
,tz
);
1419 /**************************
1420 * CALCULATE INTERACTIONS *
1421 **************************/
1423 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
1426 r32
= _mm_mul_pd(rsq32
,rinv32
);
1428 /* EWALD ELECTROSTATICS */
1430 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1431 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1432 ewitab
= _mm_cvttpd_epi32(ewrt
);
1433 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1434 ewitab
= _mm_slli_epi32(ewitab
,2);
1435 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1436 ewtabD
= _mm_setzero_pd();
1437 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1438 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1439 ewtabFn
= _mm_setzero_pd();
1440 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1441 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1442 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1443 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
1444 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1446 d
= _mm_sub_pd(r32
,rswitch
);
1447 d
= _mm_max_pd(d
,_mm_setzero_pd());
1448 d2
= _mm_mul_pd(d
,d
);
1449 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1451 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1453 /* Evaluate switch function */
1454 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1455 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
1456 velec
= _mm_mul_pd(velec
,sw
);
1457 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
1459 /* Update potential sum for this i atom from the interaction with this j atom. */
1460 velec
= _mm_and_pd(velec
,cutoff_mask
);
1461 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1462 velecsum
= _mm_add_pd(velecsum
,velec
);
1466 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1468 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1470 /* Calculate temporary vectorial force */
1471 tx
= _mm_mul_pd(fscal
,dx32
);
1472 ty
= _mm_mul_pd(fscal
,dy32
);
1473 tz
= _mm_mul_pd(fscal
,dz32
);
1475 /* Update vectorial force */
1476 fix3
= _mm_add_pd(fix3
,tx
);
1477 fiy3
= _mm_add_pd(fiy3
,ty
);
1478 fiz3
= _mm_add_pd(fiz3
,tz
);
1480 fjx2
= _mm_add_pd(fjx2
,tx
);
1481 fjy2
= _mm_add_pd(fjy2
,ty
);
1482 fjz2
= _mm_add_pd(fjz2
,tz
);
1486 /**************************
1487 * CALCULATE INTERACTIONS *
1488 **************************/
1490 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
1493 r33
= _mm_mul_pd(rsq33
,rinv33
);
1495 /* EWALD ELECTROSTATICS */
1497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1498 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1499 ewitab
= _mm_cvttpd_epi32(ewrt
);
1500 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1501 ewitab
= _mm_slli_epi32(ewitab
,2);
1502 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1503 ewtabD
= _mm_setzero_pd();
1504 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1505 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1506 ewtabFn
= _mm_setzero_pd();
1507 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1508 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1509 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1510 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
1511 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1513 d
= _mm_sub_pd(r33
,rswitch
);
1514 d
= _mm_max_pd(d
,_mm_setzero_pd());
1515 d2
= _mm_mul_pd(d
,d
);
1516 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1518 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1520 /* Evaluate switch function */
1521 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1522 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
1523 velec
= _mm_mul_pd(velec
,sw
);
1524 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
1526 /* Update potential sum for this i atom from the interaction with this j atom. */
1527 velec
= _mm_and_pd(velec
,cutoff_mask
);
1528 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1529 velecsum
= _mm_add_pd(velecsum
,velec
);
1533 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1535 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1537 /* Calculate temporary vectorial force */
1538 tx
= _mm_mul_pd(fscal
,dx33
);
1539 ty
= _mm_mul_pd(fscal
,dy33
);
1540 tz
= _mm_mul_pd(fscal
,dz33
);
1542 /* Update vectorial force */
1543 fix3
= _mm_add_pd(fix3
,tx
);
1544 fiy3
= _mm_add_pd(fiy3
,ty
);
1545 fiz3
= _mm_add_pd(fiz3
,tz
);
1547 fjx3
= _mm_add_pd(fjx3
,tx
);
1548 fjy3
= _mm_add_pd(fjy3
,ty
);
1549 fjz3
= _mm_add_pd(fjz3
,tz
);
1553 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
+DIM
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1555 /* Inner loop uses 585 flops */
1558 /* End of innermost loop */
1560 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1561 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
1564 /* Update potential energies */
1565 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1567 /* Increment number of inner iterations */
1568 inneriter
+= j_index_end
- j_index_start
;
1570 /* Outer loop uses 19 flops */
1573 /* Increment number of outer iterations */
1576 /* Update outer/inner flops */
1578 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4W4_VF
,outeriter
*19 + inneriter
*585);
1581 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double
1582 * Electrostatics interaction: Ewald
1583 * VdW interaction: None
1584 * Geometry: Water4-Water4
1585 * Calculate force/pot: Force
1588 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_sse2_double
1589 (t_nblist
* gmx_restrict nlist
,
1590 rvec
* gmx_restrict xx
,
1591 rvec
* gmx_restrict ff
,
1592 struct t_forcerec
* gmx_restrict fr
,
1593 t_mdatoms
* gmx_restrict mdatoms
,
1594 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1595 t_nrnb
* gmx_restrict nrnb
)
1597 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1598 * just 0 for non-waters.
1599 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1600 * jnr indices corresponding to data put in the four positions in the SIMD register.
1602 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1603 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1605 int j_coord_offsetA
,j_coord_offsetB
;
1606 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1607 real rcutoff_scalar
;
1608 real
*shiftvec
,*fshift
,*x
,*f
;
1609 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1611 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1613 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1615 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1616 int vdwjidx1A
,vdwjidx1B
;
1617 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1618 int vdwjidx2A
,vdwjidx2B
;
1619 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1620 int vdwjidx3A
,vdwjidx3B
;
1621 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1622 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1623 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1624 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1625 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1626 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1627 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1628 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1629 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1630 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1631 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1634 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1636 __m128d rswitch
,swV3
,swV4
,swV5
,swF2
,swF3
,swF4
,d
,d2
,sw
,dsw
;
1637 real rswitch_scalar
,d_scalar
;
1638 __m128d dummy_mask
,cutoff_mask
;
1639 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1640 __m128d one
= _mm_set1_pd(1.0);
1641 __m128d two
= _mm_set1_pd(2.0);
1647 jindex
= nlist
->jindex
;
1649 shiftidx
= nlist
->shift
;
1651 shiftvec
= fr
->shift_vec
[0];
1652 fshift
= fr
->fshift
[0];
1653 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
1654 charge
= mdatoms
->chargeA
;
1656 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1657 ewtab
= fr
->ic
->tabq_coul_FDV0
;
1658 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1659 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1661 /* Setup water-specific parameters */
1662 inr
= nlist
->iinr
[0];
1663 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1664 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1665 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1667 jq1
= _mm_set1_pd(charge
[inr
+1]);
1668 jq2
= _mm_set1_pd(charge
[inr
+2]);
1669 jq3
= _mm_set1_pd(charge
[inr
+3]);
1670 qq11
= _mm_mul_pd(iq1
,jq1
);
1671 qq12
= _mm_mul_pd(iq1
,jq2
);
1672 qq13
= _mm_mul_pd(iq1
,jq3
);
1673 qq21
= _mm_mul_pd(iq2
,jq1
);
1674 qq22
= _mm_mul_pd(iq2
,jq2
);
1675 qq23
= _mm_mul_pd(iq2
,jq3
);
1676 qq31
= _mm_mul_pd(iq3
,jq1
);
1677 qq32
= _mm_mul_pd(iq3
,jq2
);
1678 qq33
= _mm_mul_pd(iq3
,jq3
);
1680 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1681 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1682 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1683 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1685 rswitch_scalar
= fr
->ic
->rcoulomb_switch
;
1686 rswitch
= _mm_set1_pd(rswitch_scalar
);
1687 /* Setup switch parameters */
1688 d_scalar
= rcutoff_scalar
-rswitch_scalar
;
1689 d
= _mm_set1_pd(d_scalar
);
1690 swV3
= _mm_set1_pd(-10.0/(d_scalar
*d_scalar
*d_scalar
));
1691 swV4
= _mm_set1_pd( 15.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1692 swV5
= _mm_set1_pd( -6.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1693 swF2
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
));
1694 swF3
= _mm_set1_pd( 60.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1695 swF4
= _mm_set1_pd(-30.0/(d_scalar
*d_scalar
*d_scalar
*d_scalar
*d_scalar
));
1697 /* Avoid stupid compiler warnings */
1699 j_coord_offsetA
= 0;
1700 j_coord_offsetB
= 0;
1705 /* Start outer loop over neighborlists */
1706 for(iidx
=0; iidx
<nri
; iidx
++)
1708 /* Load shift vector for this list */
1709 i_shift_offset
= DIM
*shiftidx
[iidx
];
1711 /* Load limits for loop over neighbors */
1712 j_index_start
= jindex
[iidx
];
1713 j_index_end
= jindex
[iidx
+1];
1715 /* Get outer coordinate index */
1717 i_coord_offset
= DIM
*inr
;
1719 /* Load i particle coords and add shift vector */
1720 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
1721 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1723 fix1
= _mm_setzero_pd();
1724 fiy1
= _mm_setzero_pd();
1725 fiz1
= _mm_setzero_pd();
1726 fix2
= _mm_setzero_pd();
1727 fiy2
= _mm_setzero_pd();
1728 fiz2
= _mm_setzero_pd();
1729 fix3
= _mm_setzero_pd();
1730 fiy3
= _mm_setzero_pd();
1731 fiz3
= _mm_setzero_pd();
1733 /* Start inner kernel loop */
1734 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1737 /* Get j neighbor index, and coordinate index */
1739 jnrB
= jjnr
[jidx
+1];
1740 j_coord_offsetA
= DIM
*jnrA
;
1741 j_coord_offsetB
= DIM
*jnrB
;
1743 /* load j atom coordinates */
1744 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
+DIM
,x
+j_coord_offsetB
+DIM
,
1745 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1747 /* Calculate displacement vector */
1748 dx11
= _mm_sub_pd(ix1
,jx1
);
1749 dy11
= _mm_sub_pd(iy1
,jy1
);
1750 dz11
= _mm_sub_pd(iz1
,jz1
);
1751 dx12
= _mm_sub_pd(ix1
,jx2
);
1752 dy12
= _mm_sub_pd(iy1
,jy2
);
1753 dz12
= _mm_sub_pd(iz1
,jz2
);
1754 dx13
= _mm_sub_pd(ix1
,jx3
);
1755 dy13
= _mm_sub_pd(iy1
,jy3
);
1756 dz13
= _mm_sub_pd(iz1
,jz3
);
1757 dx21
= _mm_sub_pd(ix2
,jx1
);
1758 dy21
= _mm_sub_pd(iy2
,jy1
);
1759 dz21
= _mm_sub_pd(iz2
,jz1
);
1760 dx22
= _mm_sub_pd(ix2
,jx2
);
1761 dy22
= _mm_sub_pd(iy2
,jy2
);
1762 dz22
= _mm_sub_pd(iz2
,jz2
);
1763 dx23
= _mm_sub_pd(ix2
,jx3
);
1764 dy23
= _mm_sub_pd(iy2
,jy3
);
1765 dz23
= _mm_sub_pd(iz2
,jz3
);
1766 dx31
= _mm_sub_pd(ix3
,jx1
);
1767 dy31
= _mm_sub_pd(iy3
,jy1
);
1768 dz31
= _mm_sub_pd(iz3
,jz1
);
1769 dx32
= _mm_sub_pd(ix3
,jx2
);
1770 dy32
= _mm_sub_pd(iy3
,jy2
);
1771 dz32
= _mm_sub_pd(iz3
,jz2
);
1772 dx33
= _mm_sub_pd(ix3
,jx3
);
1773 dy33
= _mm_sub_pd(iy3
,jy3
);
1774 dz33
= _mm_sub_pd(iz3
,jz3
);
1776 /* Calculate squared distance and things based on it */
1777 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1778 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1779 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1780 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1781 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1782 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1783 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1784 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1785 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1787 rinv11
= sse2_invsqrt_d(rsq11
);
1788 rinv12
= sse2_invsqrt_d(rsq12
);
1789 rinv13
= sse2_invsqrt_d(rsq13
);
1790 rinv21
= sse2_invsqrt_d(rsq21
);
1791 rinv22
= sse2_invsqrt_d(rsq22
);
1792 rinv23
= sse2_invsqrt_d(rsq23
);
1793 rinv31
= sse2_invsqrt_d(rsq31
);
1794 rinv32
= sse2_invsqrt_d(rsq32
);
1795 rinv33
= sse2_invsqrt_d(rsq33
);
1797 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1798 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1799 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1800 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1801 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1802 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1803 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1804 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1805 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1807 fjx1
= _mm_setzero_pd();
1808 fjy1
= _mm_setzero_pd();
1809 fjz1
= _mm_setzero_pd();
1810 fjx2
= _mm_setzero_pd();
1811 fjy2
= _mm_setzero_pd();
1812 fjz2
= _mm_setzero_pd();
1813 fjx3
= _mm_setzero_pd();
1814 fjy3
= _mm_setzero_pd();
1815 fjz3
= _mm_setzero_pd();
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1824 r11
= _mm_mul_pd(rsq11
,rinv11
);
1826 /* EWALD ELECTROSTATICS */
1828 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1829 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1830 ewitab
= _mm_cvttpd_epi32(ewrt
);
1831 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1832 ewitab
= _mm_slli_epi32(ewitab
,2);
1833 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1834 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1835 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1836 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1837 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1838 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1839 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1840 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1841 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
1842 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1844 d
= _mm_sub_pd(r11
,rswitch
);
1845 d
= _mm_max_pd(d
,_mm_setzero_pd());
1846 d2
= _mm_mul_pd(d
,d
);
1847 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1849 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1851 /* Evaluate switch function */
1852 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1853 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
1854 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1858 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1860 /* Calculate temporary vectorial force */
1861 tx
= _mm_mul_pd(fscal
,dx11
);
1862 ty
= _mm_mul_pd(fscal
,dy11
);
1863 tz
= _mm_mul_pd(fscal
,dz11
);
1865 /* Update vectorial force */
1866 fix1
= _mm_add_pd(fix1
,tx
);
1867 fiy1
= _mm_add_pd(fiy1
,ty
);
1868 fiz1
= _mm_add_pd(fiz1
,tz
);
1870 fjx1
= _mm_add_pd(fjx1
,tx
);
1871 fjy1
= _mm_add_pd(fjy1
,ty
);
1872 fjz1
= _mm_add_pd(fjz1
,tz
);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1883 r12
= _mm_mul_pd(rsq12
,rinv12
);
1885 /* EWALD ELECTROSTATICS */
1887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1889 ewitab
= _mm_cvttpd_epi32(ewrt
);
1890 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1891 ewitab
= _mm_slli_epi32(ewitab
,2);
1892 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1893 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1894 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1895 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1896 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1897 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1898 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1899 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1900 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
1901 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1903 d
= _mm_sub_pd(r12
,rswitch
);
1904 d
= _mm_max_pd(d
,_mm_setzero_pd());
1905 d2
= _mm_mul_pd(d
,d
);
1906 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1908 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1910 /* Evaluate switch function */
1911 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1912 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
1913 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1917 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1919 /* Calculate temporary vectorial force */
1920 tx
= _mm_mul_pd(fscal
,dx12
);
1921 ty
= _mm_mul_pd(fscal
,dy12
);
1922 tz
= _mm_mul_pd(fscal
,dz12
);
1924 /* Update vectorial force */
1925 fix1
= _mm_add_pd(fix1
,tx
);
1926 fiy1
= _mm_add_pd(fiy1
,ty
);
1927 fiz1
= _mm_add_pd(fiz1
,tz
);
1929 fjx2
= _mm_add_pd(fjx2
,tx
);
1930 fjy2
= _mm_add_pd(fjy2
,ty
);
1931 fjz2
= _mm_add_pd(fjz2
,tz
);
1935 /**************************
1936 * CALCULATE INTERACTIONS *
1937 **************************/
1939 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
1942 r13
= _mm_mul_pd(rsq13
,rinv13
);
1944 /* EWALD ELECTROSTATICS */
1946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1947 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1948 ewitab
= _mm_cvttpd_epi32(ewrt
);
1949 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1950 ewitab
= _mm_slli_epi32(ewitab
,2);
1951 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1952 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
1953 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1954 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1955 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
1956 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1957 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1958 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1959 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
1960 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1962 d
= _mm_sub_pd(r13
,rswitch
);
1963 d
= _mm_max_pd(d
,_mm_setzero_pd());
1964 d2
= _mm_mul_pd(d
,d
);
1965 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
1967 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
1969 /* Evaluate switch function */
1970 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1971 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
1972 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
1976 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1978 /* Calculate temporary vectorial force */
1979 tx
= _mm_mul_pd(fscal
,dx13
);
1980 ty
= _mm_mul_pd(fscal
,dy13
);
1981 tz
= _mm_mul_pd(fscal
,dz13
);
1983 /* Update vectorial force */
1984 fix1
= _mm_add_pd(fix1
,tx
);
1985 fiy1
= _mm_add_pd(fiy1
,ty
);
1986 fiz1
= _mm_add_pd(fiz1
,tz
);
1988 fjx3
= _mm_add_pd(fjx3
,tx
);
1989 fjy3
= _mm_add_pd(fjy3
,ty
);
1990 fjz3
= _mm_add_pd(fjz3
,tz
);
1994 /**************************
1995 * CALCULATE INTERACTIONS *
1996 **************************/
1998 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2001 r21
= _mm_mul_pd(rsq21
,rinv21
);
2003 /* EWALD ELECTROSTATICS */
2005 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2006 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2007 ewitab
= _mm_cvttpd_epi32(ewrt
);
2008 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2009 ewitab
= _mm_slli_epi32(ewitab
,2);
2010 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2011 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2012 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2013 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2014 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2015 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2016 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2017 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2018 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
2019 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2021 d
= _mm_sub_pd(r21
,rswitch
);
2022 d
= _mm_max_pd(d
,_mm_setzero_pd());
2023 d2
= _mm_mul_pd(d
,d
);
2024 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2026 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2028 /* Evaluate switch function */
2029 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2030 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
2031 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2035 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2037 /* Calculate temporary vectorial force */
2038 tx
= _mm_mul_pd(fscal
,dx21
);
2039 ty
= _mm_mul_pd(fscal
,dy21
);
2040 tz
= _mm_mul_pd(fscal
,dz21
);
2042 /* Update vectorial force */
2043 fix2
= _mm_add_pd(fix2
,tx
);
2044 fiy2
= _mm_add_pd(fiy2
,ty
);
2045 fiz2
= _mm_add_pd(fiz2
,tz
);
2047 fjx1
= _mm_add_pd(fjx1
,tx
);
2048 fjy1
= _mm_add_pd(fjy1
,ty
);
2049 fjz1
= _mm_add_pd(fjz1
,tz
);
2053 /**************************
2054 * CALCULATE INTERACTIONS *
2055 **************************/
2057 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2060 r22
= _mm_mul_pd(rsq22
,rinv22
);
2062 /* EWALD ELECTROSTATICS */
2064 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2065 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2066 ewitab
= _mm_cvttpd_epi32(ewrt
);
2067 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2068 ewitab
= _mm_slli_epi32(ewitab
,2);
2069 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2070 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2071 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2072 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2073 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2074 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2075 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2076 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2077 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
2078 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2080 d
= _mm_sub_pd(r22
,rswitch
);
2081 d
= _mm_max_pd(d
,_mm_setzero_pd());
2082 d2
= _mm_mul_pd(d
,d
);
2083 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2085 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2087 /* Evaluate switch function */
2088 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2089 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
2090 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2094 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2096 /* Calculate temporary vectorial force */
2097 tx
= _mm_mul_pd(fscal
,dx22
);
2098 ty
= _mm_mul_pd(fscal
,dy22
);
2099 tz
= _mm_mul_pd(fscal
,dz22
);
2101 /* Update vectorial force */
2102 fix2
= _mm_add_pd(fix2
,tx
);
2103 fiy2
= _mm_add_pd(fiy2
,ty
);
2104 fiz2
= _mm_add_pd(fiz2
,tz
);
2106 fjx2
= _mm_add_pd(fjx2
,tx
);
2107 fjy2
= _mm_add_pd(fjy2
,ty
);
2108 fjz2
= _mm_add_pd(fjz2
,tz
);
2112 /**************************
2113 * CALCULATE INTERACTIONS *
2114 **************************/
2116 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2119 r23
= _mm_mul_pd(rsq23
,rinv23
);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2125 ewitab
= _mm_cvttpd_epi32(ewrt
);
2126 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2127 ewitab
= _mm_slli_epi32(ewitab
,2);
2128 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2129 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2130 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2131 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2132 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2133 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2134 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2135 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2136 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
2137 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2139 d
= _mm_sub_pd(r23
,rswitch
);
2140 d
= _mm_max_pd(d
,_mm_setzero_pd());
2141 d2
= _mm_mul_pd(d
,d
);
2142 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2144 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2146 /* Evaluate switch function */
2147 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2148 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
2149 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2153 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2155 /* Calculate temporary vectorial force */
2156 tx
= _mm_mul_pd(fscal
,dx23
);
2157 ty
= _mm_mul_pd(fscal
,dy23
);
2158 tz
= _mm_mul_pd(fscal
,dz23
);
2160 /* Update vectorial force */
2161 fix2
= _mm_add_pd(fix2
,tx
);
2162 fiy2
= _mm_add_pd(fiy2
,ty
);
2163 fiz2
= _mm_add_pd(fiz2
,tz
);
2165 fjx3
= _mm_add_pd(fjx3
,tx
);
2166 fjy3
= _mm_add_pd(fjy3
,ty
);
2167 fjz3
= _mm_add_pd(fjz3
,tz
);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2178 r31
= _mm_mul_pd(rsq31
,rinv31
);
2180 /* EWALD ELECTROSTATICS */
2182 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2183 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2184 ewitab
= _mm_cvttpd_epi32(ewrt
);
2185 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2186 ewitab
= _mm_slli_epi32(ewitab
,2);
2187 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2188 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2189 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2190 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2191 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2192 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2193 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2194 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2195 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
2196 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2198 d
= _mm_sub_pd(r31
,rswitch
);
2199 d
= _mm_max_pd(d
,_mm_setzero_pd());
2200 d2
= _mm_mul_pd(d
,d
);
2201 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2203 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2205 /* Evaluate switch function */
2206 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2207 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
2208 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2212 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2214 /* Calculate temporary vectorial force */
2215 tx
= _mm_mul_pd(fscal
,dx31
);
2216 ty
= _mm_mul_pd(fscal
,dy31
);
2217 tz
= _mm_mul_pd(fscal
,dz31
);
2219 /* Update vectorial force */
2220 fix3
= _mm_add_pd(fix3
,tx
);
2221 fiy3
= _mm_add_pd(fiy3
,ty
);
2222 fiz3
= _mm_add_pd(fiz3
,tz
);
2224 fjx1
= _mm_add_pd(fjx1
,tx
);
2225 fjy1
= _mm_add_pd(fjy1
,ty
);
2226 fjz1
= _mm_add_pd(fjz1
,tz
);
2230 /**************************
2231 * CALCULATE INTERACTIONS *
2232 **************************/
2234 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2237 r32
= _mm_mul_pd(rsq32
,rinv32
);
2239 /* EWALD ELECTROSTATICS */
2241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2242 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2243 ewitab
= _mm_cvttpd_epi32(ewrt
);
2244 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2245 ewitab
= _mm_slli_epi32(ewitab
,2);
2246 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2247 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2248 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2249 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2250 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2251 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2252 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2253 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2254 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
2255 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2257 d
= _mm_sub_pd(r32
,rswitch
);
2258 d
= _mm_max_pd(d
,_mm_setzero_pd());
2259 d2
= _mm_mul_pd(d
,d
);
2260 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2262 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2264 /* Evaluate switch function */
2265 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2266 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
2267 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2271 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2273 /* Calculate temporary vectorial force */
2274 tx
= _mm_mul_pd(fscal
,dx32
);
2275 ty
= _mm_mul_pd(fscal
,dy32
);
2276 tz
= _mm_mul_pd(fscal
,dz32
);
2278 /* Update vectorial force */
2279 fix3
= _mm_add_pd(fix3
,tx
);
2280 fiy3
= _mm_add_pd(fiy3
,ty
);
2281 fiz3
= _mm_add_pd(fiz3
,tz
);
2283 fjx2
= _mm_add_pd(fjx2
,tx
);
2284 fjy2
= _mm_add_pd(fjy2
,ty
);
2285 fjz2
= _mm_add_pd(fjz2
,tz
);
2289 /**************************
2290 * CALCULATE INTERACTIONS *
2291 **************************/
2293 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2296 r33
= _mm_mul_pd(rsq33
,rinv33
);
2298 /* EWALD ELECTROSTATICS */
2300 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2301 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2302 ewitab
= _mm_cvttpd_epi32(ewrt
);
2303 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2304 ewitab
= _mm_slli_epi32(ewitab
,2);
2305 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2306 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
2307 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2308 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2309 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
2310 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2311 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2312 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2313 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
2314 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2316 d
= _mm_sub_pd(r33
,rswitch
);
2317 d
= _mm_max_pd(d
,_mm_setzero_pd());
2318 d2
= _mm_mul_pd(d
,d
);
2319 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2321 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2323 /* Evaluate switch function */
2324 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2325 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
2326 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2330 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2332 /* Calculate temporary vectorial force */
2333 tx
= _mm_mul_pd(fscal
,dx33
);
2334 ty
= _mm_mul_pd(fscal
,dy33
);
2335 tz
= _mm_mul_pd(fscal
,dz33
);
2337 /* Update vectorial force */
2338 fix3
= _mm_add_pd(fix3
,tx
);
2339 fiy3
= _mm_add_pd(fiy3
,ty
);
2340 fiz3
= _mm_add_pd(fiz3
,tz
);
2342 fjx3
= _mm_add_pd(fjx3
,tx
);
2343 fjy3
= _mm_add_pd(fjy3
,ty
);
2344 fjz3
= _mm_add_pd(fjz3
,tz
);
2348 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
+DIM
,f
+j_coord_offsetB
+DIM
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2350 /* Inner loop uses 558 flops */
2353 if(jidx
<j_index_end
)
2357 j_coord_offsetA
= DIM
*jnrA
;
2359 /* load j atom coordinates */
2360 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
+DIM
,
2361 &jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2363 /* Calculate displacement vector */
2364 dx11
= _mm_sub_pd(ix1
,jx1
);
2365 dy11
= _mm_sub_pd(iy1
,jy1
);
2366 dz11
= _mm_sub_pd(iz1
,jz1
);
2367 dx12
= _mm_sub_pd(ix1
,jx2
);
2368 dy12
= _mm_sub_pd(iy1
,jy2
);
2369 dz12
= _mm_sub_pd(iz1
,jz2
);
2370 dx13
= _mm_sub_pd(ix1
,jx3
);
2371 dy13
= _mm_sub_pd(iy1
,jy3
);
2372 dz13
= _mm_sub_pd(iz1
,jz3
);
2373 dx21
= _mm_sub_pd(ix2
,jx1
);
2374 dy21
= _mm_sub_pd(iy2
,jy1
);
2375 dz21
= _mm_sub_pd(iz2
,jz1
);
2376 dx22
= _mm_sub_pd(ix2
,jx2
);
2377 dy22
= _mm_sub_pd(iy2
,jy2
);
2378 dz22
= _mm_sub_pd(iz2
,jz2
);
2379 dx23
= _mm_sub_pd(ix2
,jx3
);
2380 dy23
= _mm_sub_pd(iy2
,jy3
);
2381 dz23
= _mm_sub_pd(iz2
,jz3
);
2382 dx31
= _mm_sub_pd(ix3
,jx1
);
2383 dy31
= _mm_sub_pd(iy3
,jy1
);
2384 dz31
= _mm_sub_pd(iz3
,jz1
);
2385 dx32
= _mm_sub_pd(ix3
,jx2
);
2386 dy32
= _mm_sub_pd(iy3
,jy2
);
2387 dz32
= _mm_sub_pd(iz3
,jz2
);
2388 dx33
= _mm_sub_pd(ix3
,jx3
);
2389 dy33
= _mm_sub_pd(iy3
,jy3
);
2390 dz33
= _mm_sub_pd(iz3
,jz3
);
2392 /* Calculate squared distance and things based on it */
2393 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2394 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2395 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
2396 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2397 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2398 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
2399 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
2400 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
2401 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
2403 rinv11
= sse2_invsqrt_d(rsq11
);
2404 rinv12
= sse2_invsqrt_d(rsq12
);
2405 rinv13
= sse2_invsqrt_d(rsq13
);
2406 rinv21
= sse2_invsqrt_d(rsq21
);
2407 rinv22
= sse2_invsqrt_d(rsq22
);
2408 rinv23
= sse2_invsqrt_d(rsq23
);
2409 rinv31
= sse2_invsqrt_d(rsq31
);
2410 rinv32
= sse2_invsqrt_d(rsq32
);
2411 rinv33
= sse2_invsqrt_d(rsq33
);
2413 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2414 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2415 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
2416 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2417 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2418 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
2419 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
2420 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
2421 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
2423 fjx1
= _mm_setzero_pd();
2424 fjy1
= _mm_setzero_pd();
2425 fjz1
= _mm_setzero_pd();
2426 fjx2
= _mm_setzero_pd();
2427 fjy2
= _mm_setzero_pd();
2428 fjz2
= _mm_setzero_pd();
2429 fjx3
= _mm_setzero_pd();
2430 fjy3
= _mm_setzero_pd();
2431 fjz3
= _mm_setzero_pd();
2433 /**************************
2434 * CALCULATE INTERACTIONS *
2435 **************************/
2437 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2440 r11
= _mm_mul_pd(rsq11
,rinv11
);
2442 /* EWALD ELECTROSTATICS */
2444 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2445 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2446 ewitab
= _mm_cvttpd_epi32(ewrt
);
2447 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2448 ewitab
= _mm_slli_epi32(ewitab
,2);
2449 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2450 ewtabD
= _mm_setzero_pd();
2451 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2452 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2453 ewtabFn
= _mm_setzero_pd();
2454 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2455 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2456 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2457 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
2458 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2460 d
= _mm_sub_pd(r11
,rswitch
);
2461 d
= _mm_max_pd(d
,_mm_setzero_pd());
2462 d2
= _mm_mul_pd(d
,d
);
2463 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2465 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2467 /* Evaluate switch function */
2468 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2469 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv11
,_mm_mul_pd(velec
,dsw
)) );
2470 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2474 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2476 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2478 /* Calculate temporary vectorial force */
2479 tx
= _mm_mul_pd(fscal
,dx11
);
2480 ty
= _mm_mul_pd(fscal
,dy11
);
2481 tz
= _mm_mul_pd(fscal
,dz11
);
2483 /* Update vectorial force */
2484 fix1
= _mm_add_pd(fix1
,tx
);
2485 fiy1
= _mm_add_pd(fiy1
,ty
);
2486 fiz1
= _mm_add_pd(fiz1
,tz
);
2488 fjx1
= _mm_add_pd(fjx1
,tx
);
2489 fjy1
= _mm_add_pd(fjy1
,ty
);
2490 fjz1
= _mm_add_pd(fjz1
,tz
);
2494 /**************************
2495 * CALCULATE INTERACTIONS *
2496 **************************/
2498 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2501 r12
= _mm_mul_pd(rsq12
,rinv12
);
2503 /* EWALD ELECTROSTATICS */
2505 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2506 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2507 ewitab
= _mm_cvttpd_epi32(ewrt
);
2508 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2509 ewitab
= _mm_slli_epi32(ewitab
,2);
2510 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2511 ewtabD
= _mm_setzero_pd();
2512 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2513 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2514 ewtabFn
= _mm_setzero_pd();
2515 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2516 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2517 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2518 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
2519 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2521 d
= _mm_sub_pd(r12
,rswitch
);
2522 d
= _mm_max_pd(d
,_mm_setzero_pd());
2523 d2
= _mm_mul_pd(d
,d
);
2524 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2526 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2528 /* Evaluate switch function */
2529 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2530 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv12
,_mm_mul_pd(velec
,dsw
)) );
2531 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2535 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2537 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2539 /* Calculate temporary vectorial force */
2540 tx
= _mm_mul_pd(fscal
,dx12
);
2541 ty
= _mm_mul_pd(fscal
,dy12
);
2542 tz
= _mm_mul_pd(fscal
,dz12
);
2544 /* Update vectorial force */
2545 fix1
= _mm_add_pd(fix1
,tx
);
2546 fiy1
= _mm_add_pd(fiy1
,ty
);
2547 fiz1
= _mm_add_pd(fiz1
,tz
);
2549 fjx2
= _mm_add_pd(fjx2
,tx
);
2550 fjy2
= _mm_add_pd(fjy2
,ty
);
2551 fjz2
= _mm_add_pd(fjz2
,tz
);
2555 /**************************
2556 * CALCULATE INTERACTIONS *
2557 **************************/
2559 if (gmx_mm_any_lt(rsq13
,rcutoff2
))
2562 r13
= _mm_mul_pd(rsq13
,rinv13
);
2564 /* EWALD ELECTROSTATICS */
2566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2567 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2568 ewitab
= _mm_cvttpd_epi32(ewrt
);
2569 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2570 ewitab
= _mm_slli_epi32(ewitab
,2);
2571 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2572 ewtabD
= _mm_setzero_pd();
2573 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2574 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2575 ewtabFn
= _mm_setzero_pd();
2576 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2577 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2578 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2579 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
2580 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2582 d
= _mm_sub_pd(r13
,rswitch
);
2583 d
= _mm_max_pd(d
,_mm_setzero_pd());
2584 d2
= _mm_mul_pd(d
,d
);
2585 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2587 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2589 /* Evaluate switch function */
2590 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2591 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv13
,_mm_mul_pd(velec
,dsw
)) );
2592 cutoff_mask
= _mm_cmplt_pd(rsq13
,rcutoff2
);
2596 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2598 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2600 /* Calculate temporary vectorial force */
2601 tx
= _mm_mul_pd(fscal
,dx13
);
2602 ty
= _mm_mul_pd(fscal
,dy13
);
2603 tz
= _mm_mul_pd(fscal
,dz13
);
2605 /* Update vectorial force */
2606 fix1
= _mm_add_pd(fix1
,tx
);
2607 fiy1
= _mm_add_pd(fiy1
,ty
);
2608 fiz1
= _mm_add_pd(fiz1
,tz
);
2610 fjx3
= _mm_add_pd(fjx3
,tx
);
2611 fjy3
= _mm_add_pd(fjy3
,ty
);
2612 fjz3
= _mm_add_pd(fjz3
,tz
);
2616 /**************************
2617 * CALCULATE INTERACTIONS *
2618 **************************/
2620 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2623 r21
= _mm_mul_pd(rsq21
,rinv21
);
2625 /* EWALD ELECTROSTATICS */
2627 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2628 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2629 ewitab
= _mm_cvttpd_epi32(ewrt
);
2630 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2631 ewitab
= _mm_slli_epi32(ewitab
,2);
2632 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2633 ewtabD
= _mm_setzero_pd();
2634 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2635 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2636 ewtabFn
= _mm_setzero_pd();
2637 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2638 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2639 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2640 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
2641 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2643 d
= _mm_sub_pd(r21
,rswitch
);
2644 d
= _mm_max_pd(d
,_mm_setzero_pd());
2645 d2
= _mm_mul_pd(d
,d
);
2646 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2648 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2650 /* Evaluate switch function */
2651 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2652 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv21
,_mm_mul_pd(velec
,dsw
)) );
2653 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2657 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2659 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2661 /* Calculate temporary vectorial force */
2662 tx
= _mm_mul_pd(fscal
,dx21
);
2663 ty
= _mm_mul_pd(fscal
,dy21
);
2664 tz
= _mm_mul_pd(fscal
,dz21
);
2666 /* Update vectorial force */
2667 fix2
= _mm_add_pd(fix2
,tx
);
2668 fiy2
= _mm_add_pd(fiy2
,ty
);
2669 fiz2
= _mm_add_pd(fiz2
,tz
);
2671 fjx1
= _mm_add_pd(fjx1
,tx
);
2672 fjy1
= _mm_add_pd(fjy1
,ty
);
2673 fjz1
= _mm_add_pd(fjz1
,tz
);
2677 /**************************
2678 * CALCULATE INTERACTIONS *
2679 **************************/
2681 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2684 r22
= _mm_mul_pd(rsq22
,rinv22
);
2686 /* EWALD ELECTROSTATICS */
2688 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2689 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2690 ewitab
= _mm_cvttpd_epi32(ewrt
);
2691 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2692 ewitab
= _mm_slli_epi32(ewitab
,2);
2693 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2694 ewtabD
= _mm_setzero_pd();
2695 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2696 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2697 ewtabFn
= _mm_setzero_pd();
2698 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2699 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2700 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2701 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
2702 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2704 d
= _mm_sub_pd(r22
,rswitch
);
2705 d
= _mm_max_pd(d
,_mm_setzero_pd());
2706 d2
= _mm_mul_pd(d
,d
);
2707 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2709 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2711 /* Evaluate switch function */
2712 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2713 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv22
,_mm_mul_pd(velec
,dsw
)) );
2714 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2718 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2720 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2722 /* Calculate temporary vectorial force */
2723 tx
= _mm_mul_pd(fscal
,dx22
);
2724 ty
= _mm_mul_pd(fscal
,dy22
);
2725 tz
= _mm_mul_pd(fscal
,dz22
);
2727 /* Update vectorial force */
2728 fix2
= _mm_add_pd(fix2
,tx
);
2729 fiy2
= _mm_add_pd(fiy2
,ty
);
2730 fiz2
= _mm_add_pd(fiz2
,tz
);
2732 fjx2
= _mm_add_pd(fjx2
,tx
);
2733 fjy2
= _mm_add_pd(fjy2
,ty
);
2734 fjz2
= _mm_add_pd(fjz2
,tz
);
2738 /**************************
2739 * CALCULATE INTERACTIONS *
2740 **************************/
2742 if (gmx_mm_any_lt(rsq23
,rcutoff2
))
2745 r23
= _mm_mul_pd(rsq23
,rinv23
);
2747 /* EWALD ELECTROSTATICS */
2749 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2750 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2751 ewitab
= _mm_cvttpd_epi32(ewrt
);
2752 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2753 ewitab
= _mm_slli_epi32(ewitab
,2);
2754 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2755 ewtabD
= _mm_setzero_pd();
2756 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2757 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2758 ewtabFn
= _mm_setzero_pd();
2759 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2760 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2761 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2762 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
2763 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2765 d
= _mm_sub_pd(r23
,rswitch
);
2766 d
= _mm_max_pd(d
,_mm_setzero_pd());
2767 d2
= _mm_mul_pd(d
,d
);
2768 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2770 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2772 /* Evaluate switch function */
2773 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2774 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv23
,_mm_mul_pd(velec
,dsw
)) );
2775 cutoff_mask
= _mm_cmplt_pd(rsq23
,rcutoff2
);
2779 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2781 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2783 /* Calculate temporary vectorial force */
2784 tx
= _mm_mul_pd(fscal
,dx23
);
2785 ty
= _mm_mul_pd(fscal
,dy23
);
2786 tz
= _mm_mul_pd(fscal
,dz23
);
2788 /* Update vectorial force */
2789 fix2
= _mm_add_pd(fix2
,tx
);
2790 fiy2
= _mm_add_pd(fiy2
,ty
);
2791 fiz2
= _mm_add_pd(fiz2
,tz
);
2793 fjx3
= _mm_add_pd(fjx3
,tx
);
2794 fjy3
= _mm_add_pd(fjy3
,ty
);
2795 fjz3
= _mm_add_pd(fjz3
,tz
);
2799 /**************************
2800 * CALCULATE INTERACTIONS *
2801 **************************/
2803 if (gmx_mm_any_lt(rsq31
,rcutoff2
))
2806 r31
= _mm_mul_pd(rsq31
,rinv31
);
2808 /* EWALD ELECTROSTATICS */
2810 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2811 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2812 ewitab
= _mm_cvttpd_epi32(ewrt
);
2813 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2814 ewitab
= _mm_slli_epi32(ewitab
,2);
2815 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2816 ewtabD
= _mm_setzero_pd();
2817 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2818 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2819 ewtabFn
= _mm_setzero_pd();
2820 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2821 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2822 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2823 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
2824 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2826 d
= _mm_sub_pd(r31
,rswitch
);
2827 d
= _mm_max_pd(d
,_mm_setzero_pd());
2828 d2
= _mm_mul_pd(d
,d
);
2829 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2831 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2833 /* Evaluate switch function */
2834 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2835 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv31
,_mm_mul_pd(velec
,dsw
)) );
2836 cutoff_mask
= _mm_cmplt_pd(rsq31
,rcutoff2
);
2840 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2842 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2844 /* Calculate temporary vectorial force */
2845 tx
= _mm_mul_pd(fscal
,dx31
);
2846 ty
= _mm_mul_pd(fscal
,dy31
);
2847 tz
= _mm_mul_pd(fscal
,dz31
);
2849 /* Update vectorial force */
2850 fix3
= _mm_add_pd(fix3
,tx
);
2851 fiy3
= _mm_add_pd(fiy3
,ty
);
2852 fiz3
= _mm_add_pd(fiz3
,tz
);
2854 fjx1
= _mm_add_pd(fjx1
,tx
);
2855 fjy1
= _mm_add_pd(fjy1
,ty
);
2856 fjz1
= _mm_add_pd(fjz1
,tz
);
2860 /**************************
2861 * CALCULATE INTERACTIONS *
2862 **************************/
2864 if (gmx_mm_any_lt(rsq32
,rcutoff2
))
2867 r32
= _mm_mul_pd(rsq32
,rinv32
);
2869 /* EWALD ELECTROSTATICS */
2871 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2872 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2873 ewitab
= _mm_cvttpd_epi32(ewrt
);
2874 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2875 ewitab
= _mm_slli_epi32(ewitab
,2);
2876 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2877 ewtabD
= _mm_setzero_pd();
2878 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2879 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2880 ewtabFn
= _mm_setzero_pd();
2881 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2882 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2883 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2884 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
2885 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2887 d
= _mm_sub_pd(r32
,rswitch
);
2888 d
= _mm_max_pd(d
,_mm_setzero_pd());
2889 d2
= _mm_mul_pd(d
,d
);
2890 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2892 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2894 /* Evaluate switch function */
2895 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2896 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv32
,_mm_mul_pd(velec
,dsw
)) );
2897 cutoff_mask
= _mm_cmplt_pd(rsq32
,rcutoff2
);
2901 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2903 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2905 /* Calculate temporary vectorial force */
2906 tx
= _mm_mul_pd(fscal
,dx32
);
2907 ty
= _mm_mul_pd(fscal
,dy32
);
2908 tz
= _mm_mul_pd(fscal
,dz32
);
2910 /* Update vectorial force */
2911 fix3
= _mm_add_pd(fix3
,tx
);
2912 fiy3
= _mm_add_pd(fiy3
,ty
);
2913 fiz3
= _mm_add_pd(fiz3
,tz
);
2915 fjx2
= _mm_add_pd(fjx2
,tx
);
2916 fjy2
= _mm_add_pd(fjy2
,ty
);
2917 fjz2
= _mm_add_pd(fjz2
,tz
);
2921 /**************************
2922 * CALCULATE INTERACTIONS *
2923 **************************/
2925 if (gmx_mm_any_lt(rsq33
,rcutoff2
))
2928 r33
= _mm_mul_pd(rsq33
,rinv33
);
2930 /* EWALD ELECTROSTATICS */
2932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2933 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2934 ewitab
= _mm_cvttpd_epi32(ewrt
);
2935 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2936 ewitab
= _mm_slli_epi32(ewitab
,2);
2937 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
2938 ewtabD
= _mm_setzero_pd();
2939 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
2940 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
2941 ewtabFn
= _mm_setzero_pd();
2942 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
2943 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
2944 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
2945 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
2946 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2948 d
= _mm_sub_pd(r33
,rswitch
);
2949 d
= _mm_max_pd(d
,_mm_setzero_pd());
2950 d2
= _mm_mul_pd(d
,d
);
2951 sw
= _mm_add_pd(one
,_mm_mul_pd(d2
,_mm_mul_pd(d
,_mm_add_pd(swV3
,_mm_mul_pd(d
,_mm_add_pd(swV4
,_mm_mul_pd(d
,swV5
)))))));
2953 dsw
= _mm_mul_pd(d2
,_mm_add_pd(swF2
,_mm_mul_pd(d
,_mm_add_pd(swF3
,_mm_mul_pd(d
,swF4
)))));
2955 /* Evaluate switch function */
2956 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957 felec
= _mm_sub_pd( _mm_mul_pd(felec
,sw
) , _mm_mul_pd(rinv33
,_mm_mul_pd(velec
,dsw
)) );
2958 cutoff_mask
= _mm_cmplt_pd(rsq33
,rcutoff2
);
2962 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2964 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2966 /* Calculate temporary vectorial force */
2967 tx
= _mm_mul_pd(fscal
,dx33
);
2968 ty
= _mm_mul_pd(fscal
,dy33
);
2969 tz
= _mm_mul_pd(fscal
,dz33
);
2971 /* Update vectorial force */
2972 fix3
= _mm_add_pd(fix3
,tx
);
2973 fiy3
= _mm_add_pd(fiy3
,ty
);
2974 fiz3
= _mm_add_pd(fiz3
,tz
);
2976 fjx3
= _mm_add_pd(fjx3
,tx
);
2977 fjy3
= _mm_add_pd(fjy3
,ty
);
2978 fjz3
= _mm_add_pd(fjz3
,tz
);
2982 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
+DIM
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2984 /* Inner loop uses 558 flops */
2987 /* End of innermost loop */
2989 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2990 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
2992 /* Increment number of inner iterations */
2993 inneriter
+= j_index_end
- j_index_start
;
2995 /* Outer loop uses 18 flops */
2998 /* Increment number of outer iterations */
3001 /* Update outer/inner flops */
3003 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4W4_F
,outeriter
*18 + inneriter
*558);