2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
85 int vdwjidx0A
,vdwjidx0B
;
86 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
87 int vdwjidx1A
,vdwjidx1B
;
88 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
89 int vdwjidx2A
,vdwjidx2B
;
90 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
91 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
92 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
93 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
94 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
95 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
96 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
97 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
98 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
99 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
100 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
103 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
106 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
107 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
109 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
111 __m128d dummy_mask
,cutoff_mask
;
112 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
113 __m128d one
= _mm_set1_pd(1.0);
114 __m128d two
= _mm_set1_pd(2.0);
120 jindex
= nlist
->jindex
;
122 shiftidx
= nlist
->shift
;
124 shiftvec
= fr
->shift_vec
[0];
125 fshift
= fr
->fshift
[0];
126 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
127 charge
= mdatoms
->chargeA
;
128 nvdwtype
= fr
->ntype
;
130 vdwtype
= mdatoms
->typeA
;
132 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
133 ewtab
= fr
->ic
->tabq_coul_FDV0
;
134 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
135 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
137 /* Setup water-specific parameters */
138 inr
= nlist
->iinr
[0];
139 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
140 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
141 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
142 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
144 jq0
= _mm_set1_pd(charge
[inr
+0]);
145 jq1
= _mm_set1_pd(charge
[inr
+1]);
146 jq2
= _mm_set1_pd(charge
[inr
+2]);
147 vdwjidx0A
= 2*vdwtype
[inr
+0];
148 qq00
= _mm_mul_pd(iq0
,jq0
);
149 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
150 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
151 qq01
= _mm_mul_pd(iq0
,jq1
);
152 qq02
= _mm_mul_pd(iq0
,jq2
);
153 qq10
= _mm_mul_pd(iq1
,jq0
);
154 qq11
= _mm_mul_pd(iq1
,jq1
);
155 qq12
= _mm_mul_pd(iq1
,jq2
);
156 qq20
= _mm_mul_pd(iq2
,jq0
);
157 qq21
= _mm_mul_pd(iq2
,jq1
);
158 qq22
= _mm_mul_pd(iq2
,jq2
);
160 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
161 rcutoff_scalar
= fr
->ic
->rcoulomb
;
162 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
163 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
165 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
166 rvdw
= _mm_set1_pd(fr
->ic
->rvdw
);
168 /* Avoid stupid compiler warnings */
176 /* Start outer loop over neighborlists */
177 for(iidx
=0; iidx
<nri
; iidx
++)
179 /* Load shift vector for this list */
180 i_shift_offset
= DIM
*shiftidx
[iidx
];
182 /* Load limits for loop over neighbors */
183 j_index_start
= jindex
[iidx
];
184 j_index_end
= jindex
[iidx
+1];
186 /* Get outer coordinate index */
188 i_coord_offset
= DIM
*inr
;
190 /* Load i particle coords and add shift vector */
191 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
192 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
194 fix0
= _mm_setzero_pd();
195 fiy0
= _mm_setzero_pd();
196 fiz0
= _mm_setzero_pd();
197 fix1
= _mm_setzero_pd();
198 fiy1
= _mm_setzero_pd();
199 fiz1
= _mm_setzero_pd();
200 fix2
= _mm_setzero_pd();
201 fiy2
= _mm_setzero_pd();
202 fiz2
= _mm_setzero_pd();
204 /* Reset potential sums */
205 velecsum
= _mm_setzero_pd();
206 vvdwsum
= _mm_setzero_pd();
208 /* Start inner kernel loop */
209 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
212 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA
= DIM
*jnrA
;
216 j_coord_offsetB
= DIM
*jnrB
;
218 /* load j atom coordinates */
219 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
220 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
222 /* Calculate displacement vector */
223 dx00
= _mm_sub_pd(ix0
,jx0
);
224 dy00
= _mm_sub_pd(iy0
,jy0
);
225 dz00
= _mm_sub_pd(iz0
,jz0
);
226 dx01
= _mm_sub_pd(ix0
,jx1
);
227 dy01
= _mm_sub_pd(iy0
,jy1
);
228 dz01
= _mm_sub_pd(iz0
,jz1
);
229 dx02
= _mm_sub_pd(ix0
,jx2
);
230 dy02
= _mm_sub_pd(iy0
,jy2
);
231 dz02
= _mm_sub_pd(iz0
,jz2
);
232 dx10
= _mm_sub_pd(ix1
,jx0
);
233 dy10
= _mm_sub_pd(iy1
,jy0
);
234 dz10
= _mm_sub_pd(iz1
,jz0
);
235 dx11
= _mm_sub_pd(ix1
,jx1
);
236 dy11
= _mm_sub_pd(iy1
,jy1
);
237 dz11
= _mm_sub_pd(iz1
,jz1
);
238 dx12
= _mm_sub_pd(ix1
,jx2
);
239 dy12
= _mm_sub_pd(iy1
,jy2
);
240 dz12
= _mm_sub_pd(iz1
,jz2
);
241 dx20
= _mm_sub_pd(ix2
,jx0
);
242 dy20
= _mm_sub_pd(iy2
,jy0
);
243 dz20
= _mm_sub_pd(iz2
,jz0
);
244 dx21
= _mm_sub_pd(ix2
,jx1
);
245 dy21
= _mm_sub_pd(iy2
,jy1
);
246 dz21
= _mm_sub_pd(iz2
,jz1
);
247 dx22
= _mm_sub_pd(ix2
,jx2
);
248 dy22
= _mm_sub_pd(iy2
,jy2
);
249 dz22
= _mm_sub_pd(iz2
,jz2
);
251 /* Calculate squared distance and things based on it */
252 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
253 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
254 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
255 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
256 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
257 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
258 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
259 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
260 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
262 rinv00
= sse2_invsqrt_d(rsq00
);
263 rinv01
= sse2_invsqrt_d(rsq01
);
264 rinv02
= sse2_invsqrt_d(rsq02
);
265 rinv10
= sse2_invsqrt_d(rsq10
);
266 rinv11
= sse2_invsqrt_d(rsq11
);
267 rinv12
= sse2_invsqrt_d(rsq12
);
268 rinv20
= sse2_invsqrt_d(rsq20
);
269 rinv21
= sse2_invsqrt_d(rsq21
);
270 rinv22
= sse2_invsqrt_d(rsq22
);
272 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
273 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
274 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
275 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
276 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
277 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
278 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
279 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
280 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
282 fjx0
= _mm_setzero_pd();
283 fjy0
= _mm_setzero_pd();
284 fjz0
= _mm_setzero_pd();
285 fjx1
= _mm_setzero_pd();
286 fjy1
= _mm_setzero_pd();
287 fjz1
= _mm_setzero_pd();
288 fjx2
= _mm_setzero_pd();
289 fjy2
= _mm_setzero_pd();
290 fjz2
= _mm_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
299 r00
= _mm_mul_pd(rsq00
,rinv00
);
301 /* EWALD ELECTROSTATICS */
303 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
304 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
305 ewitab
= _mm_cvttpd_epi32(ewrt
);
306 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
307 ewitab
= _mm_slli_epi32(ewitab
,2);
308 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
309 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
310 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
311 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
312 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
313 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
314 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
315 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
316 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
317 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
319 /* LENNARD-JONES DISPERSION/REPULSION */
321 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
322 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
323 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
324 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
325 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
326 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
328 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
330 /* Update potential sum for this i atom from the interaction with this j atom. */
331 velec
= _mm_and_pd(velec
,cutoff_mask
);
332 velecsum
= _mm_add_pd(velecsum
,velec
);
333 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
334 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
336 fscal
= _mm_add_pd(felec
,fvdw
);
338 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
340 /* Calculate temporary vectorial force */
341 tx
= _mm_mul_pd(fscal
,dx00
);
342 ty
= _mm_mul_pd(fscal
,dy00
);
343 tz
= _mm_mul_pd(fscal
,dz00
);
345 /* Update vectorial force */
346 fix0
= _mm_add_pd(fix0
,tx
);
347 fiy0
= _mm_add_pd(fiy0
,ty
);
348 fiz0
= _mm_add_pd(fiz0
,tz
);
350 fjx0
= _mm_add_pd(fjx0
,tx
);
351 fjy0
= _mm_add_pd(fjy0
,ty
);
352 fjz0
= _mm_add_pd(fjz0
,tz
);
356 /**************************
357 * CALCULATE INTERACTIONS *
358 **************************/
360 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
363 r01
= _mm_mul_pd(rsq01
,rinv01
);
365 /* EWALD ELECTROSTATICS */
367 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
368 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
369 ewitab
= _mm_cvttpd_epi32(ewrt
);
370 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
371 ewitab
= _mm_slli_epi32(ewitab
,2);
372 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
373 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
374 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
375 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
376 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
377 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
378 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
379 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
380 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_sub_pd(rinv01
,sh_ewald
),velec
));
381 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
383 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 velec
= _mm_and_pd(velec
,cutoff_mask
);
387 velecsum
= _mm_add_pd(velecsum
,velec
);
391 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
393 /* Calculate temporary vectorial force */
394 tx
= _mm_mul_pd(fscal
,dx01
);
395 ty
= _mm_mul_pd(fscal
,dy01
);
396 tz
= _mm_mul_pd(fscal
,dz01
);
398 /* Update vectorial force */
399 fix0
= _mm_add_pd(fix0
,tx
);
400 fiy0
= _mm_add_pd(fiy0
,ty
);
401 fiz0
= _mm_add_pd(fiz0
,tz
);
403 fjx1
= _mm_add_pd(fjx1
,tx
);
404 fjy1
= _mm_add_pd(fjy1
,ty
);
405 fjz1
= _mm_add_pd(fjz1
,tz
);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
416 r02
= _mm_mul_pd(rsq02
,rinv02
);
418 /* EWALD ELECTROSTATICS */
420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
421 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
422 ewitab
= _mm_cvttpd_epi32(ewrt
);
423 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
424 ewitab
= _mm_slli_epi32(ewitab
,2);
425 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
426 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
427 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
428 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
429 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
430 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
431 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
432 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
433 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_sub_pd(rinv02
,sh_ewald
),velec
));
434 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
436 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
438 /* Update potential sum for this i atom from the interaction with this j atom. */
439 velec
= _mm_and_pd(velec
,cutoff_mask
);
440 velecsum
= _mm_add_pd(velecsum
,velec
);
444 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
446 /* Calculate temporary vectorial force */
447 tx
= _mm_mul_pd(fscal
,dx02
);
448 ty
= _mm_mul_pd(fscal
,dy02
);
449 tz
= _mm_mul_pd(fscal
,dz02
);
451 /* Update vectorial force */
452 fix0
= _mm_add_pd(fix0
,tx
);
453 fiy0
= _mm_add_pd(fiy0
,ty
);
454 fiz0
= _mm_add_pd(fiz0
,tz
);
456 fjx2
= _mm_add_pd(fjx2
,tx
);
457 fjy2
= _mm_add_pd(fjy2
,ty
);
458 fjz2
= _mm_add_pd(fjz2
,tz
);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
469 r10
= _mm_mul_pd(rsq10
,rinv10
);
471 /* EWALD ELECTROSTATICS */
473 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
474 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
475 ewitab
= _mm_cvttpd_epi32(ewrt
);
476 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
477 ewitab
= _mm_slli_epi32(ewitab
,2);
478 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
479 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
480 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
481 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
482 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
483 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
484 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
485 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
486 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
487 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
489 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
491 /* Update potential sum for this i atom from the interaction with this j atom. */
492 velec
= _mm_and_pd(velec
,cutoff_mask
);
493 velecsum
= _mm_add_pd(velecsum
,velec
);
497 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
499 /* Calculate temporary vectorial force */
500 tx
= _mm_mul_pd(fscal
,dx10
);
501 ty
= _mm_mul_pd(fscal
,dy10
);
502 tz
= _mm_mul_pd(fscal
,dz10
);
504 /* Update vectorial force */
505 fix1
= _mm_add_pd(fix1
,tx
);
506 fiy1
= _mm_add_pd(fiy1
,ty
);
507 fiz1
= _mm_add_pd(fiz1
,tz
);
509 fjx0
= _mm_add_pd(fjx0
,tx
);
510 fjy0
= _mm_add_pd(fjy0
,ty
);
511 fjz0
= _mm_add_pd(fjz0
,tz
);
515 /**************************
516 * CALCULATE INTERACTIONS *
517 **************************/
519 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
522 r11
= _mm_mul_pd(rsq11
,rinv11
);
524 /* EWALD ELECTROSTATICS */
526 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
527 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
528 ewitab
= _mm_cvttpd_epi32(ewrt
);
529 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
530 ewitab
= _mm_slli_epi32(ewitab
,2);
531 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
532 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
533 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
534 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
535 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
536 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
537 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
538 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
539 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
540 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
542 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velec
= _mm_and_pd(velec
,cutoff_mask
);
546 velecsum
= _mm_add_pd(velecsum
,velec
);
550 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
552 /* Calculate temporary vectorial force */
553 tx
= _mm_mul_pd(fscal
,dx11
);
554 ty
= _mm_mul_pd(fscal
,dy11
);
555 tz
= _mm_mul_pd(fscal
,dz11
);
557 /* Update vectorial force */
558 fix1
= _mm_add_pd(fix1
,tx
);
559 fiy1
= _mm_add_pd(fiy1
,ty
);
560 fiz1
= _mm_add_pd(fiz1
,tz
);
562 fjx1
= _mm_add_pd(fjx1
,tx
);
563 fjy1
= _mm_add_pd(fjy1
,ty
);
564 fjz1
= _mm_add_pd(fjz1
,tz
);
568 /**************************
569 * CALCULATE INTERACTIONS *
570 **************************/
572 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
575 r12
= _mm_mul_pd(rsq12
,rinv12
);
577 /* EWALD ELECTROSTATICS */
579 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
580 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
581 ewitab
= _mm_cvttpd_epi32(ewrt
);
582 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
583 ewitab
= _mm_slli_epi32(ewitab
,2);
584 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
585 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
586 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
587 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
588 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
589 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
590 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
591 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
592 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
593 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
595 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
597 /* Update potential sum for this i atom from the interaction with this j atom. */
598 velec
= _mm_and_pd(velec
,cutoff_mask
);
599 velecsum
= _mm_add_pd(velecsum
,velec
);
603 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
605 /* Calculate temporary vectorial force */
606 tx
= _mm_mul_pd(fscal
,dx12
);
607 ty
= _mm_mul_pd(fscal
,dy12
);
608 tz
= _mm_mul_pd(fscal
,dz12
);
610 /* Update vectorial force */
611 fix1
= _mm_add_pd(fix1
,tx
);
612 fiy1
= _mm_add_pd(fiy1
,ty
);
613 fiz1
= _mm_add_pd(fiz1
,tz
);
615 fjx2
= _mm_add_pd(fjx2
,tx
);
616 fjy2
= _mm_add_pd(fjy2
,ty
);
617 fjz2
= _mm_add_pd(fjz2
,tz
);
621 /**************************
622 * CALCULATE INTERACTIONS *
623 **************************/
625 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
628 r20
= _mm_mul_pd(rsq20
,rinv20
);
630 /* EWALD ELECTROSTATICS */
632 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
633 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
634 ewitab
= _mm_cvttpd_epi32(ewrt
);
635 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
636 ewitab
= _mm_slli_epi32(ewitab
,2);
637 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
638 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
639 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
640 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
641 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
642 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
643 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
644 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
645 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
646 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
648 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
650 /* Update potential sum for this i atom from the interaction with this j atom. */
651 velec
= _mm_and_pd(velec
,cutoff_mask
);
652 velecsum
= _mm_add_pd(velecsum
,velec
);
656 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
658 /* Calculate temporary vectorial force */
659 tx
= _mm_mul_pd(fscal
,dx20
);
660 ty
= _mm_mul_pd(fscal
,dy20
);
661 tz
= _mm_mul_pd(fscal
,dz20
);
663 /* Update vectorial force */
664 fix2
= _mm_add_pd(fix2
,tx
);
665 fiy2
= _mm_add_pd(fiy2
,ty
);
666 fiz2
= _mm_add_pd(fiz2
,tz
);
668 fjx0
= _mm_add_pd(fjx0
,tx
);
669 fjy0
= _mm_add_pd(fjy0
,ty
);
670 fjz0
= _mm_add_pd(fjz0
,tz
);
674 /**************************
675 * CALCULATE INTERACTIONS *
676 **************************/
678 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
681 r21
= _mm_mul_pd(rsq21
,rinv21
);
683 /* EWALD ELECTROSTATICS */
685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
686 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
687 ewitab
= _mm_cvttpd_epi32(ewrt
);
688 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
689 ewitab
= _mm_slli_epi32(ewitab
,2);
690 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
691 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
692 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
693 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
694 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
695 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
696 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
697 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
698 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
699 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
701 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
703 /* Update potential sum for this i atom from the interaction with this j atom. */
704 velec
= _mm_and_pd(velec
,cutoff_mask
);
705 velecsum
= _mm_add_pd(velecsum
,velec
);
709 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
711 /* Calculate temporary vectorial force */
712 tx
= _mm_mul_pd(fscal
,dx21
);
713 ty
= _mm_mul_pd(fscal
,dy21
);
714 tz
= _mm_mul_pd(fscal
,dz21
);
716 /* Update vectorial force */
717 fix2
= _mm_add_pd(fix2
,tx
);
718 fiy2
= _mm_add_pd(fiy2
,ty
);
719 fiz2
= _mm_add_pd(fiz2
,tz
);
721 fjx1
= _mm_add_pd(fjx1
,tx
);
722 fjy1
= _mm_add_pd(fjy1
,ty
);
723 fjz1
= _mm_add_pd(fjz1
,tz
);
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
731 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
734 r22
= _mm_mul_pd(rsq22
,rinv22
);
736 /* EWALD ELECTROSTATICS */
738 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
739 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
740 ewitab
= _mm_cvttpd_epi32(ewrt
);
741 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
742 ewitab
= _mm_slli_epi32(ewitab
,2);
743 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
744 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
745 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
746 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
747 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
748 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
749 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
750 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
751 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
752 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
754 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
756 /* Update potential sum for this i atom from the interaction with this j atom. */
757 velec
= _mm_and_pd(velec
,cutoff_mask
);
758 velecsum
= _mm_add_pd(velecsum
,velec
);
762 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
764 /* Calculate temporary vectorial force */
765 tx
= _mm_mul_pd(fscal
,dx22
);
766 ty
= _mm_mul_pd(fscal
,dy22
);
767 tz
= _mm_mul_pd(fscal
,dz22
);
769 /* Update vectorial force */
770 fix2
= _mm_add_pd(fix2
,tx
);
771 fiy2
= _mm_add_pd(fiy2
,ty
);
772 fiz2
= _mm_add_pd(fiz2
,tz
);
774 fjx2
= _mm_add_pd(fjx2
,tx
);
775 fjy2
= _mm_add_pd(fjy2
,ty
);
776 fjz2
= _mm_add_pd(fjz2
,tz
);
780 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
782 /* Inner loop uses 432 flops */
789 j_coord_offsetA
= DIM
*jnrA
;
791 /* load j atom coordinates */
792 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
793 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
795 /* Calculate displacement vector */
796 dx00
= _mm_sub_pd(ix0
,jx0
);
797 dy00
= _mm_sub_pd(iy0
,jy0
);
798 dz00
= _mm_sub_pd(iz0
,jz0
);
799 dx01
= _mm_sub_pd(ix0
,jx1
);
800 dy01
= _mm_sub_pd(iy0
,jy1
);
801 dz01
= _mm_sub_pd(iz0
,jz1
);
802 dx02
= _mm_sub_pd(ix0
,jx2
);
803 dy02
= _mm_sub_pd(iy0
,jy2
);
804 dz02
= _mm_sub_pd(iz0
,jz2
);
805 dx10
= _mm_sub_pd(ix1
,jx0
);
806 dy10
= _mm_sub_pd(iy1
,jy0
);
807 dz10
= _mm_sub_pd(iz1
,jz0
);
808 dx11
= _mm_sub_pd(ix1
,jx1
);
809 dy11
= _mm_sub_pd(iy1
,jy1
);
810 dz11
= _mm_sub_pd(iz1
,jz1
);
811 dx12
= _mm_sub_pd(ix1
,jx2
);
812 dy12
= _mm_sub_pd(iy1
,jy2
);
813 dz12
= _mm_sub_pd(iz1
,jz2
);
814 dx20
= _mm_sub_pd(ix2
,jx0
);
815 dy20
= _mm_sub_pd(iy2
,jy0
);
816 dz20
= _mm_sub_pd(iz2
,jz0
);
817 dx21
= _mm_sub_pd(ix2
,jx1
);
818 dy21
= _mm_sub_pd(iy2
,jy1
);
819 dz21
= _mm_sub_pd(iz2
,jz1
);
820 dx22
= _mm_sub_pd(ix2
,jx2
);
821 dy22
= _mm_sub_pd(iy2
,jy2
);
822 dz22
= _mm_sub_pd(iz2
,jz2
);
824 /* Calculate squared distance and things based on it */
825 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
826 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
827 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
828 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
829 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
830 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
831 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
832 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
833 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
835 rinv00
= sse2_invsqrt_d(rsq00
);
836 rinv01
= sse2_invsqrt_d(rsq01
);
837 rinv02
= sse2_invsqrt_d(rsq02
);
838 rinv10
= sse2_invsqrt_d(rsq10
);
839 rinv11
= sse2_invsqrt_d(rsq11
);
840 rinv12
= sse2_invsqrt_d(rsq12
);
841 rinv20
= sse2_invsqrt_d(rsq20
);
842 rinv21
= sse2_invsqrt_d(rsq21
);
843 rinv22
= sse2_invsqrt_d(rsq22
);
845 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
846 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
847 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
848 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
849 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
850 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
851 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
852 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
853 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
855 fjx0
= _mm_setzero_pd();
856 fjy0
= _mm_setzero_pd();
857 fjz0
= _mm_setzero_pd();
858 fjx1
= _mm_setzero_pd();
859 fjy1
= _mm_setzero_pd();
860 fjz1
= _mm_setzero_pd();
861 fjx2
= _mm_setzero_pd();
862 fjy2
= _mm_setzero_pd();
863 fjz2
= _mm_setzero_pd();
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
872 r00
= _mm_mul_pd(rsq00
,rinv00
);
874 /* EWALD ELECTROSTATICS */
876 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
877 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
878 ewitab
= _mm_cvttpd_epi32(ewrt
);
879 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
880 ewitab
= _mm_slli_epi32(ewitab
,2);
881 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
882 ewtabD
= _mm_setzero_pd();
883 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
884 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
885 ewtabFn
= _mm_setzero_pd();
886 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
887 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
888 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
889 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
890 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
892 /* LENNARD-JONES DISPERSION/REPULSION */
894 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
895 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
896 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
897 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
898 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
899 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
901 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
903 /* Update potential sum for this i atom from the interaction with this j atom. */
904 velec
= _mm_and_pd(velec
,cutoff_mask
);
905 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
906 velecsum
= _mm_add_pd(velecsum
,velec
);
907 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
908 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
909 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
911 fscal
= _mm_add_pd(felec
,fvdw
);
913 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
915 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
917 /* Calculate temporary vectorial force */
918 tx
= _mm_mul_pd(fscal
,dx00
);
919 ty
= _mm_mul_pd(fscal
,dy00
);
920 tz
= _mm_mul_pd(fscal
,dz00
);
922 /* Update vectorial force */
923 fix0
= _mm_add_pd(fix0
,tx
);
924 fiy0
= _mm_add_pd(fiy0
,ty
);
925 fiz0
= _mm_add_pd(fiz0
,tz
);
927 fjx0
= _mm_add_pd(fjx0
,tx
);
928 fjy0
= _mm_add_pd(fjy0
,ty
);
929 fjz0
= _mm_add_pd(fjz0
,tz
);
933 /**************************
934 * CALCULATE INTERACTIONS *
935 **************************/
937 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
940 r01
= _mm_mul_pd(rsq01
,rinv01
);
942 /* EWALD ELECTROSTATICS */
944 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
945 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
946 ewitab
= _mm_cvttpd_epi32(ewrt
);
947 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
948 ewitab
= _mm_slli_epi32(ewitab
,2);
949 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
950 ewtabD
= _mm_setzero_pd();
951 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
952 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
953 ewtabFn
= _mm_setzero_pd();
954 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
955 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
956 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
957 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_sub_pd(rinv01
,sh_ewald
),velec
));
958 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
960 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
962 /* Update potential sum for this i atom from the interaction with this j atom. */
963 velec
= _mm_and_pd(velec
,cutoff_mask
);
964 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
965 velecsum
= _mm_add_pd(velecsum
,velec
);
969 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
971 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
973 /* Calculate temporary vectorial force */
974 tx
= _mm_mul_pd(fscal
,dx01
);
975 ty
= _mm_mul_pd(fscal
,dy01
);
976 tz
= _mm_mul_pd(fscal
,dz01
);
978 /* Update vectorial force */
979 fix0
= _mm_add_pd(fix0
,tx
);
980 fiy0
= _mm_add_pd(fiy0
,ty
);
981 fiz0
= _mm_add_pd(fiz0
,tz
);
983 fjx1
= _mm_add_pd(fjx1
,tx
);
984 fjy1
= _mm_add_pd(fjy1
,ty
);
985 fjz1
= _mm_add_pd(fjz1
,tz
);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
996 r02
= _mm_mul_pd(rsq02
,rinv02
);
998 /* EWALD ELECTROSTATICS */
1000 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1001 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
1002 ewitab
= _mm_cvttpd_epi32(ewrt
);
1003 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1004 ewitab
= _mm_slli_epi32(ewitab
,2);
1005 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1006 ewtabD
= _mm_setzero_pd();
1007 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1008 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1009 ewtabFn
= _mm_setzero_pd();
1010 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1011 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1012 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1013 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_sub_pd(rinv02
,sh_ewald
),velec
));
1014 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
1016 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1018 /* Update potential sum for this i atom from the interaction with this j atom. */
1019 velec
= _mm_and_pd(velec
,cutoff_mask
);
1020 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1021 velecsum
= _mm_add_pd(velecsum
,velec
);
1025 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1027 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1029 /* Calculate temporary vectorial force */
1030 tx
= _mm_mul_pd(fscal
,dx02
);
1031 ty
= _mm_mul_pd(fscal
,dy02
);
1032 tz
= _mm_mul_pd(fscal
,dz02
);
1034 /* Update vectorial force */
1035 fix0
= _mm_add_pd(fix0
,tx
);
1036 fiy0
= _mm_add_pd(fiy0
,ty
);
1037 fiz0
= _mm_add_pd(fiz0
,tz
);
1039 fjx2
= _mm_add_pd(fjx2
,tx
);
1040 fjy2
= _mm_add_pd(fjy2
,ty
);
1041 fjz2
= _mm_add_pd(fjz2
,tz
);
1045 /**************************
1046 * CALCULATE INTERACTIONS *
1047 **************************/
1049 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1052 r10
= _mm_mul_pd(rsq10
,rinv10
);
1054 /* EWALD ELECTROSTATICS */
1056 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1057 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1058 ewitab
= _mm_cvttpd_epi32(ewrt
);
1059 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1060 ewitab
= _mm_slli_epi32(ewitab
,2);
1061 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1062 ewtabD
= _mm_setzero_pd();
1063 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1064 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1065 ewtabFn
= _mm_setzero_pd();
1066 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1067 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1068 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1069 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
1070 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1072 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1074 /* Update potential sum for this i atom from the interaction with this j atom. */
1075 velec
= _mm_and_pd(velec
,cutoff_mask
);
1076 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1077 velecsum
= _mm_add_pd(velecsum
,velec
);
1081 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1083 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1085 /* Calculate temporary vectorial force */
1086 tx
= _mm_mul_pd(fscal
,dx10
);
1087 ty
= _mm_mul_pd(fscal
,dy10
);
1088 tz
= _mm_mul_pd(fscal
,dz10
);
1090 /* Update vectorial force */
1091 fix1
= _mm_add_pd(fix1
,tx
);
1092 fiy1
= _mm_add_pd(fiy1
,ty
);
1093 fiz1
= _mm_add_pd(fiz1
,tz
);
1095 fjx0
= _mm_add_pd(fjx0
,tx
);
1096 fjy0
= _mm_add_pd(fjy0
,ty
);
1097 fjz0
= _mm_add_pd(fjz0
,tz
);
1101 /**************************
1102 * CALCULATE INTERACTIONS *
1103 **************************/
1105 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1108 r11
= _mm_mul_pd(rsq11
,rinv11
);
1110 /* EWALD ELECTROSTATICS */
1112 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1113 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1114 ewitab
= _mm_cvttpd_epi32(ewrt
);
1115 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1116 ewitab
= _mm_slli_epi32(ewitab
,2);
1117 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1118 ewtabD
= _mm_setzero_pd();
1119 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1120 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1121 ewtabFn
= _mm_setzero_pd();
1122 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1123 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1124 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1125 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
1126 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1128 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1130 /* Update potential sum for this i atom from the interaction with this j atom. */
1131 velec
= _mm_and_pd(velec
,cutoff_mask
);
1132 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1133 velecsum
= _mm_add_pd(velecsum
,velec
);
1137 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1139 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1141 /* Calculate temporary vectorial force */
1142 tx
= _mm_mul_pd(fscal
,dx11
);
1143 ty
= _mm_mul_pd(fscal
,dy11
);
1144 tz
= _mm_mul_pd(fscal
,dz11
);
1146 /* Update vectorial force */
1147 fix1
= _mm_add_pd(fix1
,tx
);
1148 fiy1
= _mm_add_pd(fiy1
,ty
);
1149 fiz1
= _mm_add_pd(fiz1
,tz
);
1151 fjx1
= _mm_add_pd(fjx1
,tx
);
1152 fjy1
= _mm_add_pd(fjy1
,ty
);
1153 fjz1
= _mm_add_pd(fjz1
,tz
);
1157 /**************************
1158 * CALCULATE INTERACTIONS *
1159 **************************/
1161 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1164 r12
= _mm_mul_pd(rsq12
,rinv12
);
1166 /* EWALD ELECTROSTATICS */
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1170 ewitab
= _mm_cvttpd_epi32(ewrt
);
1171 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1172 ewitab
= _mm_slli_epi32(ewitab
,2);
1173 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1174 ewtabD
= _mm_setzero_pd();
1175 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1176 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1177 ewtabFn
= _mm_setzero_pd();
1178 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1179 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1180 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1181 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
1182 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1184 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1186 /* Update potential sum for this i atom from the interaction with this j atom. */
1187 velec
= _mm_and_pd(velec
,cutoff_mask
);
1188 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1189 velecsum
= _mm_add_pd(velecsum
,velec
);
1193 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1195 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1197 /* Calculate temporary vectorial force */
1198 tx
= _mm_mul_pd(fscal
,dx12
);
1199 ty
= _mm_mul_pd(fscal
,dy12
);
1200 tz
= _mm_mul_pd(fscal
,dz12
);
1202 /* Update vectorial force */
1203 fix1
= _mm_add_pd(fix1
,tx
);
1204 fiy1
= _mm_add_pd(fiy1
,ty
);
1205 fiz1
= _mm_add_pd(fiz1
,tz
);
1207 fjx2
= _mm_add_pd(fjx2
,tx
);
1208 fjy2
= _mm_add_pd(fjy2
,ty
);
1209 fjz2
= _mm_add_pd(fjz2
,tz
);
1213 /**************************
1214 * CALCULATE INTERACTIONS *
1215 **************************/
1217 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1220 r20
= _mm_mul_pd(rsq20
,rinv20
);
1222 /* EWALD ELECTROSTATICS */
1224 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1225 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1226 ewitab
= _mm_cvttpd_epi32(ewrt
);
1227 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1228 ewitab
= _mm_slli_epi32(ewitab
,2);
1229 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1230 ewtabD
= _mm_setzero_pd();
1231 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1232 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1233 ewtabFn
= _mm_setzero_pd();
1234 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1235 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1236 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1237 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
1238 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1240 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1242 /* Update potential sum for this i atom from the interaction with this j atom. */
1243 velec
= _mm_and_pd(velec
,cutoff_mask
);
1244 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1245 velecsum
= _mm_add_pd(velecsum
,velec
);
1249 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1251 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1253 /* Calculate temporary vectorial force */
1254 tx
= _mm_mul_pd(fscal
,dx20
);
1255 ty
= _mm_mul_pd(fscal
,dy20
);
1256 tz
= _mm_mul_pd(fscal
,dz20
);
1258 /* Update vectorial force */
1259 fix2
= _mm_add_pd(fix2
,tx
);
1260 fiy2
= _mm_add_pd(fiy2
,ty
);
1261 fiz2
= _mm_add_pd(fiz2
,tz
);
1263 fjx0
= _mm_add_pd(fjx0
,tx
);
1264 fjy0
= _mm_add_pd(fjy0
,ty
);
1265 fjz0
= _mm_add_pd(fjz0
,tz
);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1276 r21
= _mm_mul_pd(rsq21
,rinv21
);
1278 /* EWALD ELECTROSTATICS */
1280 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1281 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1282 ewitab
= _mm_cvttpd_epi32(ewrt
);
1283 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1284 ewitab
= _mm_slli_epi32(ewitab
,2);
1285 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1286 ewtabD
= _mm_setzero_pd();
1287 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1288 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1289 ewtabFn
= _mm_setzero_pd();
1290 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1291 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1292 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1293 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
1294 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1296 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1298 /* Update potential sum for this i atom from the interaction with this j atom. */
1299 velec
= _mm_and_pd(velec
,cutoff_mask
);
1300 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1301 velecsum
= _mm_add_pd(velecsum
,velec
);
1305 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1307 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1309 /* Calculate temporary vectorial force */
1310 tx
= _mm_mul_pd(fscal
,dx21
);
1311 ty
= _mm_mul_pd(fscal
,dy21
);
1312 tz
= _mm_mul_pd(fscal
,dz21
);
1314 /* Update vectorial force */
1315 fix2
= _mm_add_pd(fix2
,tx
);
1316 fiy2
= _mm_add_pd(fiy2
,ty
);
1317 fiz2
= _mm_add_pd(fiz2
,tz
);
1319 fjx1
= _mm_add_pd(fjx1
,tx
);
1320 fjy1
= _mm_add_pd(fjy1
,ty
);
1321 fjz1
= _mm_add_pd(fjz1
,tz
);
1325 /**************************
1326 * CALCULATE INTERACTIONS *
1327 **************************/
1329 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1332 r22
= _mm_mul_pd(rsq22
,rinv22
);
1334 /* EWALD ELECTROSTATICS */
1336 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1337 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1338 ewitab
= _mm_cvttpd_epi32(ewrt
);
1339 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1340 ewitab
= _mm_slli_epi32(ewitab
,2);
1341 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1342 ewtabD
= _mm_setzero_pd();
1343 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1344 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1345 ewtabFn
= _mm_setzero_pd();
1346 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1347 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1348 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1349 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
1350 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1352 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1354 /* Update potential sum for this i atom from the interaction with this j atom. */
1355 velec
= _mm_and_pd(velec
,cutoff_mask
);
1356 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1357 velecsum
= _mm_add_pd(velecsum
,velec
);
1361 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1363 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1365 /* Calculate temporary vectorial force */
1366 tx
= _mm_mul_pd(fscal
,dx22
);
1367 ty
= _mm_mul_pd(fscal
,dy22
);
1368 tz
= _mm_mul_pd(fscal
,dz22
);
1370 /* Update vectorial force */
1371 fix2
= _mm_add_pd(fix2
,tx
);
1372 fiy2
= _mm_add_pd(fiy2
,ty
);
1373 fiz2
= _mm_add_pd(fiz2
,tz
);
1375 fjx2
= _mm_add_pd(fjx2
,tx
);
1376 fjy2
= _mm_add_pd(fjy2
,ty
);
1377 fjz2
= _mm_add_pd(fjz2
,tz
);
1381 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1383 /* Inner loop uses 432 flops */
1386 /* End of innermost loop */
1388 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1389 f
+i_coord_offset
,fshift
+i_shift_offset
);
1392 /* Update potential energies */
1393 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1394 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1396 /* Increment number of inner iterations */
1397 inneriter
+= j_index_end
- j_index_start
;
1399 /* Outer loop uses 20 flops */
1402 /* Increment number of outer iterations */
1405 /* Update outer/inner flops */
1407 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*432);
1410 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double
1411 * Electrostatics interaction: Ewald
1412 * VdW interaction: LennardJones
1413 * Geometry: Water3-Water3
1414 * Calculate force/pot: Force
1417 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_double
1418 (t_nblist
* gmx_restrict nlist
,
1419 rvec
* gmx_restrict xx
,
1420 rvec
* gmx_restrict ff
,
1421 struct t_forcerec
* gmx_restrict fr
,
1422 t_mdatoms
* gmx_restrict mdatoms
,
1423 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1424 t_nrnb
* gmx_restrict nrnb
)
1426 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1427 * just 0 for non-waters.
1428 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1429 * jnr indices corresponding to data put in the four positions in the SIMD register.
1431 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1432 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1434 int j_coord_offsetA
,j_coord_offsetB
;
1435 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1436 real rcutoff_scalar
;
1437 real
*shiftvec
,*fshift
,*x
,*f
;
1438 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1440 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1442 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1444 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1445 int vdwjidx0A
,vdwjidx0B
;
1446 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1447 int vdwjidx1A
,vdwjidx1B
;
1448 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1449 int vdwjidx2A
,vdwjidx2B
;
1450 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1451 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1452 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1453 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1454 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1455 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1456 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1457 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1458 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1459 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1460 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1463 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1466 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1467 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1469 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1471 __m128d dummy_mask
,cutoff_mask
;
1472 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1473 __m128d one
= _mm_set1_pd(1.0);
1474 __m128d two
= _mm_set1_pd(2.0);
1480 jindex
= nlist
->jindex
;
1482 shiftidx
= nlist
->shift
;
1484 shiftvec
= fr
->shift_vec
[0];
1485 fshift
= fr
->fshift
[0];
1486 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
1487 charge
= mdatoms
->chargeA
;
1488 nvdwtype
= fr
->ntype
;
1489 vdwparam
= fr
->nbfp
;
1490 vdwtype
= mdatoms
->typeA
;
1492 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1493 ewtab
= fr
->ic
->tabq_coul_F
;
1494 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1495 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1497 /* Setup water-specific parameters */
1498 inr
= nlist
->iinr
[0];
1499 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1500 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1501 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1502 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1504 jq0
= _mm_set1_pd(charge
[inr
+0]);
1505 jq1
= _mm_set1_pd(charge
[inr
+1]);
1506 jq2
= _mm_set1_pd(charge
[inr
+2]);
1507 vdwjidx0A
= 2*vdwtype
[inr
+0];
1508 qq00
= _mm_mul_pd(iq0
,jq0
);
1509 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1510 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1511 qq01
= _mm_mul_pd(iq0
,jq1
);
1512 qq02
= _mm_mul_pd(iq0
,jq2
);
1513 qq10
= _mm_mul_pd(iq1
,jq0
);
1514 qq11
= _mm_mul_pd(iq1
,jq1
);
1515 qq12
= _mm_mul_pd(iq1
,jq2
);
1516 qq20
= _mm_mul_pd(iq2
,jq0
);
1517 qq21
= _mm_mul_pd(iq2
,jq1
);
1518 qq22
= _mm_mul_pd(iq2
,jq2
);
1520 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1521 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1522 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1523 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1525 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
1526 rvdw
= _mm_set1_pd(fr
->ic
->rvdw
);
1528 /* Avoid stupid compiler warnings */
1530 j_coord_offsetA
= 0;
1531 j_coord_offsetB
= 0;
1536 /* Start outer loop over neighborlists */
1537 for(iidx
=0; iidx
<nri
; iidx
++)
1539 /* Load shift vector for this list */
1540 i_shift_offset
= DIM
*shiftidx
[iidx
];
1542 /* Load limits for loop over neighbors */
1543 j_index_start
= jindex
[iidx
];
1544 j_index_end
= jindex
[iidx
+1];
1546 /* Get outer coordinate index */
1548 i_coord_offset
= DIM
*inr
;
1550 /* Load i particle coords and add shift vector */
1551 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1552 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1554 fix0
= _mm_setzero_pd();
1555 fiy0
= _mm_setzero_pd();
1556 fiz0
= _mm_setzero_pd();
1557 fix1
= _mm_setzero_pd();
1558 fiy1
= _mm_setzero_pd();
1559 fiz1
= _mm_setzero_pd();
1560 fix2
= _mm_setzero_pd();
1561 fiy2
= _mm_setzero_pd();
1562 fiz2
= _mm_setzero_pd();
1564 /* Start inner kernel loop */
1565 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1568 /* Get j neighbor index, and coordinate index */
1570 jnrB
= jjnr
[jidx
+1];
1571 j_coord_offsetA
= DIM
*jnrA
;
1572 j_coord_offsetB
= DIM
*jnrB
;
1574 /* load j atom coordinates */
1575 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1576 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1578 /* Calculate displacement vector */
1579 dx00
= _mm_sub_pd(ix0
,jx0
);
1580 dy00
= _mm_sub_pd(iy0
,jy0
);
1581 dz00
= _mm_sub_pd(iz0
,jz0
);
1582 dx01
= _mm_sub_pd(ix0
,jx1
);
1583 dy01
= _mm_sub_pd(iy0
,jy1
);
1584 dz01
= _mm_sub_pd(iz0
,jz1
);
1585 dx02
= _mm_sub_pd(ix0
,jx2
);
1586 dy02
= _mm_sub_pd(iy0
,jy2
);
1587 dz02
= _mm_sub_pd(iz0
,jz2
);
1588 dx10
= _mm_sub_pd(ix1
,jx0
);
1589 dy10
= _mm_sub_pd(iy1
,jy0
);
1590 dz10
= _mm_sub_pd(iz1
,jz0
);
1591 dx11
= _mm_sub_pd(ix1
,jx1
);
1592 dy11
= _mm_sub_pd(iy1
,jy1
);
1593 dz11
= _mm_sub_pd(iz1
,jz1
);
1594 dx12
= _mm_sub_pd(ix1
,jx2
);
1595 dy12
= _mm_sub_pd(iy1
,jy2
);
1596 dz12
= _mm_sub_pd(iz1
,jz2
);
1597 dx20
= _mm_sub_pd(ix2
,jx0
);
1598 dy20
= _mm_sub_pd(iy2
,jy0
);
1599 dz20
= _mm_sub_pd(iz2
,jz0
);
1600 dx21
= _mm_sub_pd(ix2
,jx1
);
1601 dy21
= _mm_sub_pd(iy2
,jy1
);
1602 dz21
= _mm_sub_pd(iz2
,jz1
);
1603 dx22
= _mm_sub_pd(ix2
,jx2
);
1604 dy22
= _mm_sub_pd(iy2
,jy2
);
1605 dz22
= _mm_sub_pd(iz2
,jz2
);
1607 /* Calculate squared distance and things based on it */
1608 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1609 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1610 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1611 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1612 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1613 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1614 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1615 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1616 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1618 rinv00
= sse2_invsqrt_d(rsq00
);
1619 rinv01
= sse2_invsqrt_d(rsq01
);
1620 rinv02
= sse2_invsqrt_d(rsq02
);
1621 rinv10
= sse2_invsqrt_d(rsq10
);
1622 rinv11
= sse2_invsqrt_d(rsq11
);
1623 rinv12
= sse2_invsqrt_d(rsq12
);
1624 rinv20
= sse2_invsqrt_d(rsq20
);
1625 rinv21
= sse2_invsqrt_d(rsq21
);
1626 rinv22
= sse2_invsqrt_d(rsq22
);
1628 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1629 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1630 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1631 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1632 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1633 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1634 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1635 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1636 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1638 fjx0
= _mm_setzero_pd();
1639 fjy0
= _mm_setzero_pd();
1640 fjz0
= _mm_setzero_pd();
1641 fjx1
= _mm_setzero_pd();
1642 fjy1
= _mm_setzero_pd();
1643 fjz1
= _mm_setzero_pd();
1644 fjx2
= _mm_setzero_pd();
1645 fjy2
= _mm_setzero_pd();
1646 fjz2
= _mm_setzero_pd();
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1655 r00
= _mm_mul_pd(rsq00
,rinv00
);
1657 /* EWALD ELECTROSTATICS */
1659 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1660 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1661 ewitab
= _mm_cvttpd_epi32(ewrt
);
1662 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1663 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1665 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1666 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1668 /* LENNARD-JONES DISPERSION/REPULSION */
1670 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1671 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1673 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1675 fscal
= _mm_add_pd(felec
,fvdw
);
1677 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1679 /* Calculate temporary vectorial force */
1680 tx
= _mm_mul_pd(fscal
,dx00
);
1681 ty
= _mm_mul_pd(fscal
,dy00
);
1682 tz
= _mm_mul_pd(fscal
,dz00
);
1684 /* Update vectorial force */
1685 fix0
= _mm_add_pd(fix0
,tx
);
1686 fiy0
= _mm_add_pd(fiy0
,ty
);
1687 fiz0
= _mm_add_pd(fiz0
,tz
);
1689 fjx0
= _mm_add_pd(fjx0
,tx
);
1690 fjy0
= _mm_add_pd(fjy0
,ty
);
1691 fjz0
= _mm_add_pd(fjz0
,tz
);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1702 r01
= _mm_mul_pd(rsq01
,rinv01
);
1704 /* EWALD ELECTROSTATICS */
1706 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1707 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
1708 ewitab
= _mm_cvttpd_epi32(ewrt
);
1709 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1710 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1712 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1713 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
1715 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
1719 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1721 /* Calculate temporary vectorial force */
1722 tx
= _mm_mul_pd(fscal
,dx01
);
1723 ty
= _mm_mul_pd(fscal
,dy01
);
1724 tz
= _mm_mul_pd(fscal
,dz01
);
1726 /* Update vectorial force */
1727 fix0
= _mm_add_pd(fix0
,tx
);
1728 fiy0
= _mm_add_pd(fiy0
,ty
);
1729 fiz0
= _mm_add_pd(fiz0
,tz
);
1731 fjx1
= _mm_add_pd(fjx1
,tx
);
1732 fjy1
= _mm_add_pd(fjy1
,ty
);
1733 fjz1
= _mm_add_pd(fjz1
,tz
);
1737 /**************************
1738 * CALCULATE INTERACTIONS *
1739 **************************/
1741 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1744 r02
= _mm_mul_pd(rsq02
,rinv02
);
1746 /* EWALD ELECTROSTATICS */
1748 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1749 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
1750 ewitab
= _mm_cvttpd_epi32(ewrt
);
1751 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1752 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1754 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1755 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
1757 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1761 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1763 /* Calculate temporary vectorial force */
1764 tx
= _mm_mul_pd(fscal
,dx02
);
1765 ty
= _mm_mul_pd(fscal
,dy02
);
1766 tz
= _mm_mul_pd(fscal
,dz02
);
1768 /* Update vectorial force */
1769 fix0
= _mm_add_pd(fix0
,tx
);
1770 fiy0
= _mm_add_pd(fiy0
,ty
);
1771 fiz0
= _mm_add_pd(fiz0
,tz
);
1773 fjx2
= _mm_add_pd(fjx2
,tx
);
1774 fjy2
= _mm_add_pd(fjy2
,ty
);
1775 fjz2
= _mm_add_pd(fjz2
,tz
);
1779 /**************************
1780 * CALCULATE INTERACTIONS *
1781 **************************/
1783 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1786 r10
= _mm_mul_pd(rsq10
,rinv10
);
1788 /* EWALD ELECTROSTATICS */
1790 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1791 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1792 ewitab
= _mm_cvttpd_epi32(ewrt
);
1793 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1794 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1796 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1797 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1799 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1803 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1805 /* Calculate temporary vectorial force */
1806 tx
= _mm_mul_pd(fscal
,dx10
);
1807 ty
= _mm_mul_pd(fscal
,dy10
);
1808 tz
= _mm_mul_pd(fscal
,dz10
);
1810 /* Update vectorial force */
1811 fix1
= _mm_add_pd(fix1
,tx
);
1812 fiy1
= _mm_add_pd(fiy1
,ty
);
1813 fiz1
= _mm_add_pd(fiz1
,tz
);
1815 fjx0
= _mm_add_pd(fjx0
,tx
);
1816 fjy0
= _mm_add_pd(fjy0
,ty
);
1817 fjz0
= _mm_add_pd(fjz0
,tz
);
1821 /**************************
1822 * CALCULATE INTERACTIONS *
1823 **************************/
1825 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1828 r11
= _mm_mul_pd(rsq11
,rinv11
);
1830 /* EWALD ELECTROSTATICS */
1832 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1833 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1834 ewitab
= _mm_cvttpd_epi32(ewrt
);
1835 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1836 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1838 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1839 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1841 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1845 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1847 /* Calculate temporary vectorial force */
1848 tx
= _mm_mul_pd(fscal
,dx11
);
1849 ty
= _mm_mul_pd(fscal
,dy11
);
1850 tz
= _mm_mul_pd(fscal
,dz11
);
1852 /* Update vectorial force */
1853 fix1
= _mm_add_pd(fix1
,tx
);
1854 fiy1
= _mm_add_pd(fiy1
,ty
);
1855 fiz1
= _mm_add_pd(fiz1
,tz
);
1857 fjx1
= _mm_add_pd(fjx1
,tx
);
1858 fjy1
= _mm_add_pd(fjy1
,ty
);
1859 fjz1
= _mm_add_pd(fjz1
,tz
);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1870 r12
= _mm_mul_pd(rsq12
,rinv12
);
1872 /* EWALD ELECTROSTATICS */
1874 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1875 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1876 ewitab
= _mm_cvttpd_epi32(ewrt
);
1877 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1878 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1880 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1881 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1883 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1887 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1889 /* Calculate temporary vectorial force */
1890 tx
= _mm_mul_pd(fscal
,dx12
);
1891 ty
= _mm_mul_pd(fscal
,dy12
);
1892 tz
= _mm_mul_pd(fscal
,dz12
);
1894 /* Update vectorial force */
1895 fix1
= _mm_add_pd(fix1
,tx
);
1896 fiy1
= _mm_add_pd(fiy1
,ty
);
1897 fiz1
= _mm_add_pd(fiz1
,tz
);
1899 fjx2
= _mm_add_pd(fjx2
,tx
);
1900 fjy2
= _mm_add_pd(fjy2
,ty
);
1901 fjz2
= _mm_add_pd(fjz2
,tz
);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1912 r20
= _mm_mul_pd(rsq20
,rinv20
);
1914 /* EWALD ELECTROSTATICS */
1916 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1917 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1918 ewitab
= _mm_cvttpd_epi32(ewrt
);
1919 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1920 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1922 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1923 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1925 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1929 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1931 /* Calculate temporary vectorial force */
1932 tx
= _mm_mul_pd(fscal
,dx20
);
1933 ty
= _mm_mul_pd(fscal
,dy20
);
1934 tz
= _mm_mul_pd(fscal
,dz20
);
1936 /* Update vectorial force */
1937 fix2
= _mm_add_pd(fix2
,tx
);
1938 fiy2
= _mm_add_pd(fiy2
,ty
);
1939 fiz2
= _mm_add_pd(fiz2
,tz
);
1941 fjx0
= _mm_add_pd(fjx0
,tx
);
1942 fjy0
= _mm_add_pd(fjy0
,ty
);
1943 fjz0
= _mm_add_pd(fjz0
,tz
);
1947 /**************************
1948 * CALCULATE INTERACTIONS *
1949 **************************/
1951 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1954 r21
= _mm_mul_pd(rsq21
,rinv21
);
1956 /* EWALD ELECTROSTATICS */
1958 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1959 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1960 ewitab
= _mm_cvttpd_epi32(ewrt
);
1961 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1962 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1964 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1965 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1967 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1971 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1973 /* Calculate temporary vectorial force */
1974 tx
= _mm_mul_pd(fscal
,dx21
);
1975 ty
= _mm_mul_pd(fscal
,dy21
);
1976 tz
= _mm_mul_pd(fscal
,dz21
);
1978 /* Update vectorial force */
1979 fix2
= _mm_add_pd(fix2
,tx
);
1980 fiy2
= _mm_add_pd(fiy2
,ty
);
1981 fiz2
= _mm_add_pd(fiz2
,tz
);
1983 fjx1
= _mm_add_pd(fjx1
,tx
);
1984 fjy1
= _mm_add_pd(fjy1
,ty
);
1985 fjz1
= _mm_add_pd(fjz1
,tz
);
1989 /**************************
1990 * CALCULATE INTERACTIONS *
1991 **************************/
1993 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1996 r22
= _mm_mul_pd(rsq22
,rinv22
);
1998 /* EWALD ELECTROSTATICS */
2000 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2001 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2002 ewitab
= _mm_cvttpd_epi32(ewrt
);
2003 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2004 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2006 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2007 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2009 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2013 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2015 /* Calculate temporary vectorial force */
2016 tx
= _mm_mul_pd(fscal
,dx22
);
2017 ty
= _mm_mul_pd(fscal
,dy22
);
2018 tz
= _mm_mul_pd(fscal
,dz22
);
2020 /* Update vectorial force */
2021 fix2
= _mm_add_pd(fix2
,tx
);
2022 fiy2
= _mm_add_pd(fiy2
,ty
);
2023 fiz2
= _mm_add_pd(fiz2
,tz
);
2025 fjx2
= _mm_add_pd(fjx2
,tx
);
2026 fjy2
= _mm_add_pd(fjy2
,ty
);
2027 fjz2
= _mm_add_pd(fjz2
,tz
);
2031 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2033 /* Inner loop uses 358 flops */
2036 if(jidx
<j_index_end
)
2040 j_coord_offsetA
= DIM
*jnrA
;
2042 /* load j atom coordinates */
2043 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
2044 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
2046 /* Calculate displacement vector */
2047 dx00
= _mm_sub_pd(ix0
,jx0
);
2048 dy00
= _mm_sub_pd(iy0
,jy0
);
2049 dz00
= _mm_sub_pd(iz0
,jz0
);
2050 dx01
= _mm_sub_pd(ix0
,jx1
);
2051 dy01
= _mm_sub_pd(iy0
,jy1
);
2052 dz01
= _mm_sub_pd(iz0
,jz1
);
2053 dx02
= _mm_sub_pd(ix0
,jx2
);
2054 dy02
= _mm_sub_pd(iy0
,jy2
);
2055 dz02
= _mm_sub_pd(iz0
,jz2
);
2056 dx10
= _mm_sub_pd(ix1
,jx0
);
2057 dy10
= _mm_sub_pd(iy1
,jy0
);
2058 dz10
= _mm_sub_pd(iz1
,jz0
);
2059 dx11
= _mm_sub_pd(ix1
,jx1
);
2060 dy11
= _mm_sub_pd(iy1
,jy1
);
2061 dz11
= _mm_sub_pd(iz1
,jz1
);
2062 dx12
= _mm_sub_pd(ix1
,jx2
);
2063 dy12
= _mm_sub_pd(iy1
,jy2
);
2064 dz12
= _mm_sub_pd(iz1
,jz2
);
2065 dx20
= _mm_sub_pd(ix2
,jx0
);
2066 dy20
= _mm_sub_pd(iy2
,jy0
);
2067 dz20
= _mm_sub_pd(iz2
,jz0
);
2068 dx21
= _mm_sub_pd(ix2
,jx1
);
2069 dy21
= _mm_sub_pd(iy2
,jy1
);
2070 dz21
= _mm_sub_pd(iz2
,jz1
);
2071 dx22
= _mm_sub_pd(ix2
,jx2
);
2072 dy22
= _mm_sub_pd(iy2
,jy2
);
2073 dz22
= _mm_sub_pd(iz2
,jz2
);
2075 /* Calculate squared distance and things based on it */
2076 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2077 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
2078 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
2079 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
2080 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2081 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2082 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
2083 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2084 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2086 rinv00
= sse2_invsqrt_d(rsq00
);
2087 rinv01
= sse2_invsqrt_d(rsq01
);
2088 rinv02
= sse2_invsqrt_d(rsq02
);
2089 rinv10
= sse2_invsqrt_d(rsq10
);
2090 rinv11
= sse2_invsqrt_d(rsq11
);
2091 rinv12
= sse2_invsqrt_d(rsq12
);
2092 rinv20
= sse2_invsqrt_d(rsq20
);
2093 rinv21
= sse2_invsqrt_d(rsq21
);
2094 rinv22
= sse2_invsqrt_d(rsq22
);
2096 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
2097 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
2098 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
2099 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
2100 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2101 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2102 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
2103 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2104 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2106 fjx0
= _mm_setzero_pd();
2107 fjy0
= _mm_setzero_pd();
2108 fjz0
= _mm_setzero_pd();
2109 fjx1
= _mm_setzero_pd();
2110 fjy1
= _mm_setzero_pd();
2111 fjz1
= _mm_setzero_pd();
2112 fjx2
= _mm_setzero_pd();
2113 fjy2
= _mm_setzero_pd();
2114 fjz2
= _mm_setzero_pd();
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2123 r00
= _mm_mul_pd(rsq00
,rinv00
);
2125 /* EWALD ELECTROSTATICS */
2127 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2128 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
2129 ewitab
= _mm_cvttpd_epi32(ewrt
);
2130 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2131 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2132 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2133 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
2135 /* LENNARD-JONES DISPERSION/REPULSION */
2137 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2138 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
2140 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2142 fscal
= _mm_add_pd(felec
,fvdw
);
2144 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2146 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2148 /* Calculate temporary vectorial force */
2149 tx
= _mm_mul_pd(fscal
,dx00
);
2150 ty
= _mm_mul_pd(fscal
,dy00
);
2151 tz
= _mm_mul_pd(fscal
,dz00
);
2153 /* Update vectorial force */
2154 fix0
= _mm_add_pd(fix0
,tx
);
2155 fiy0
= _mm_add_pd(fiy0
,ty
);
2156 fiz0
= _mm_add_pd(fiz0
,tz
);
2158 fjx0
= _mm_add_pd(fjx0
,tx
);
2159 fjy0
= _mm_add_pd(fjy0
,ty
);
2160 fjz0
= _mm_add_pd(fjz0
,tz
);
2164 /**************************
2165 * CALCULATE INTERACTIONS *
2166 **************************/
2168 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
2171 r01
= _mm_mul_pd(rsq01
,rinv01
);
2173 /* EWALD ELECTROSTATICS */
2175 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2176 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
2177 ewitab
= _mm_cvttpd_epi32(ewrt
);
2178 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2179 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2180 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2181 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
2183 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
2187 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2189 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2191 /* Calculate temporary vectorial force */
2192 tx
= _mm_mul_pd(fscal
,dx01
);
2193 ty
= _mm_mul_pd(fscal
,dy01
);
2194 tz
= _mm_mul_pd(fscal
,dz01
);
2196 /* Update vectorial force */
2197 fix0
= _mm_add_pd(fix0
,tx
);
2198 fiy0
= _mm_add_pd(fiy0
,ty
);
2199 fiz0
= _mm_add_pd(fiz0
,tz
);
2201 fjx1
= _mm_add_pd(fjx1
,tx
);
2202 fjy1
= _mm_add_pd(fjy1
,ty
);
2203 fjz1
= _mm_add_pd(fjz1
,tz
);
2207 /**************************
2208 * CALCULATE INTERACTIONS *
2209 **************************/
2211 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2214 r02
= _mm_mul_pd(rsq02
,rinv02
);
2216 /* EWALD ELECTROSTATICS */
2218 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2219 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
2220 ewitab
= _mm_cvttpd_epi32(ewrt
);
2221 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2222 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2223 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2224 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
2226 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
2230 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2232 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2234 /* Calculate temporary vectorial force */
2235 tx
= _mm_mul_pd(fscal
,dx02
);
2236 ty
= _mm_mul_pd(fscal
,dy02
);
2237 tz
= _mm_mul_pd(fscal
,dz02
);
2239 /* Update vectorial force */
2240 fix0
= _mm_add_pd(fix0
,tx
);
2241 fiy0
= _mm_add_pd(fiy0
,ty
);
2242 fiz0
= _mm_add_pd(fiz0
,tz
);
2244 fjx2
= _mm_add_pd(fjx2
,tx
);
2245 fjy2
= _mm_add_pd(fjy2
,ty
);
2246 fjz2
= _mm_add_pd(fjz2
,tz
);
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2254 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2257 r10
= _mm_mul_pd(rsq10
,rinv10
);
2259 /* EWALD ELECTROSTATICS */
2261 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2262 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
2263 ewitab
= _mm_cvttpd_epi32(ewrt
);
2264 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2265 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2266 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2267 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
2269 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
2273 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2275 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2277 /* Calculate temporary vectorial force */
2278 tx
= _mm_mul_pd(fscal
,dx10
);
2279 ty
= _mm_mul_pd(fscal
,dy10
);
2280 tz
= _mm_mul_pd(fscal
,dz10
);
2282 /* Update vectorial force */
2283 fix1
= _mm_add_pd(fix1
,tx
);
2284 fiy1
= _mm_add_pd(fiy1
,ty
);
2285 fiz1
= _mm_add_pd(fiz1
,tz
);
2287 fjx0
= _mm_add_pd(fjx0
,tx
);
2288 fjy0
= _mm_add_pd(fjy0
,ty
);
2289 fjz0
= _mm_add_pd(fjz0
,tz
);
2293 /**************************
2294 * CALCULATE INTERACTIONS *
2295 **************************/
2297 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2300 r11
= _mm_mul_pd(rsq11
,rinv11
);
2302 /* EWALD ELECTROSTATICS */
2304 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2306 ewitab
= _mm_cvttpd_epi32(ewrt
);
2307 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2308 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2309 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2310 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2312 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2316 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2318 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2320 /* Calculate temporary vectorial force */
2321 tx
= _mm_mul_pd(fscal
,dx11
);
2322 ty
= _mm_mul_pd(fscal
,dy11
);
2323 tz
= _mm_mul_pd(fscal
,dz11
);
2325 /* Update vectorial force */
2326 fix1
= _mm_add_pd(fix1
,tx
);
2327 fiy1
= _mm_add_pd(fiy1
,ty
);
2328 fiz1
= _mm_add_pd(fiz1
,tz
);
2330 fjx1
= _mm_add_pd(fjx1
,tx
);
2331 fjy1
= _mm_add_pd(fjy1
,ty
);
2332 fjz1
= _mm_add_pd(fjz1
,tz
);
2336 /**************************
2337 * CALCULATE INTERACTIONS *
2338 **************************/
2340 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2343 r12
= _mm_mul_pd(rsq12
,rinv12
);
2345 /* EWALD ELECTROSTATICS */
2347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2348 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2349 ewitab
= _mm_cvttpd_epi32(ewrt
);
2350 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2351 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2352 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2353 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2355 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2359 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2361 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2363 /* Calculate temporary vectorial force */
2364 tx
= _mm_mul_pd(fscal
,dx12
);
2365 ty
= _mm_mul_pd(fscal
,dy12
);
2366 tz
= _mm_mul_pd(fscal
,dz12
);
2368 /* Update vectorial force */
2369 fix1
= _mm_add_pd(fix1
,tx
);
2370 fiy1
= _mm_add_pd(fiy1
,ty
);
2371 fiz1
= _mm_add_pd(fiz1
,tz
);
2373 fjx2
= _mm_add_pd(fjx2
,tx
);
2374 fjy2
= _mm_add_pd(fjy2
,ty
);
2375 fjz2
= _mm_add_pd(fjz2
,tz
);
2379 /**************************
2380 * CALCULATE INTERACTIONS *
2381 **************************/
2383 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2386 r20
= _mm_mul_pd(rsq20
,rinv20
);
2388 /* EWALD ELECTROSTATICS */
2390 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2391 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
2392 ewitab
= _mm_cvttpd_epi32(ewrt
);
2393 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2394 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2395 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2396 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
2398 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
2402 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2404 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2406 /* Calculate temporary vectorial force */
2407 tx
= _mm_mul_pd(fscal
,dx20
);
2408 ty
= _mm_mul_pd(fscal
,dy20
);
2409 tz
= _mm_mul_pd(fscal
,dz20
);
2411 /* Update vectorial force */
2412 fix2
= _mm_add_pd(fix2
,tx
);
2413 fiy2
= _mm_add_pd(fiy2
,ty
);
2414 fiz2
= _mm_add_pd(fiz2
,tz
);
2416 fjx0
= _mm_add_pd(fjx0
,tx
);
2417 fjy0
= _mm_add_pd(fjy0
,ty
);
2418 fjz0
= _mm_add_pd(fjz0
,tz
);
2422 /**************************
2423 * CALCULATE INTERACTIONS *
2424 **************************/
2426 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2429 r21
= _mm_mul_pd(rsq21
,rinv21
);
2431 /* EWALD ELECTROSTATICS */
2433 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2434 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2435 ewitab
= _mm_cvttpd_epi32(ewrt
);
2436 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2437 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2438 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2439 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2441 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2445 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2447 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2449 /* Calculate temporary vectorial force */
2450 tx
= _mm_mul_pd(fscal
,dx21
);
2451 ty
= _mm_mul_pd(fscal
,dy21
);
2452 tz
= _mm_mul_pd(fscal
,dz21
);
2454 /* Update vectorial force */
2455 fix2
= _mm_add_pd(fix2
,tx
);
2456 fiy2
= _mm_add_pd(fiy2
,ty
);
2457 fiz2
= _mm_add_pd(fiz2
,tz
);
2459 fjx1
= _mm_add_pd(fjx1
,tx
);
2460 fjy1
= _mm_add_pd(fjy1
,ty
);
2461 fjz1
= _mm_add_pd(fjz1
,tz
);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2472 r22
= _mm_mul_pd(rsq22
,rinv22
);
2474 /* EWALD ELECTROSTATICS */
2476 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2477 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2478 ewitab
= _mm_cvttpd_epi32(ewrt
);
2479 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2480 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2481 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2482 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2484 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2488 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2490 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2492 /* Calculate temporary vectorial force */
2493 tx
= _mm_mul_pd(fscal
,dx22
);
2494 ty
= _mm_mul_pd(fscal
,dy22
);
2495 tz
= _mm_mul_pd(fscal
,dz22
);
2497 /* Update vectorial force */
2498 fix2
= _mm_add_pd(fix2
,tx
);
2499 fiy2
= _mm_add_pd(fiy2
,ty
);
2500 fiz2
= _mm_add_pd(fiz2
,tz
);
2502 fjx2
= _mm_add_pd(fjx2
,tx
);
2503 fjy2
= _mm_add_pd(fjy2
,ty
);
2504 fjz2
= _mm_add_pd(fjz2
,tz
);
2508 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2510 /* Inner loop uses 358 flops */
2513 /* End of innermost loop */
2515 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2516 f
+i_coord_offset
,fshift
+i_shift_offset
);
2518 /* Increment number of inner iterations */
2519 inneriter
+= j_index_end
- j_index_start
;
2521 /* Outer loop uses 18 flops */
2524 /* Increment number of outer iterations */
2527 /* Update outer/inner flops */
2529 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*358);