2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse4_1_double.h"
49 #include "kernelutil_x86_sse4_1_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 int vdwjidx1A
,vdwjidx1B
;
90 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
91 int vdwjidx2A
,vdwjidx2B
;
92 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
93 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
94 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
95 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
96 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
97 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
98 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
99 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
100 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
101 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
102 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
105 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
108 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
109 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
111 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
113 __m128d dummy_mask
,cutoff_mask
;
114 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one
= _mm_set1_pd(1.0);
116 __m128d two
= _mm_set1_pd(2.0);
122 jindex
= nlist
->jindex
;
124 shiftidx
= nlist
->shift
;
126 shiftvec
= fr
->shift_vec
[0];
127 fshift
= fr
->fshift
[0];
128 facel
= _mm_set1_pd(fr
->epsfac
);
129 charge
= mdatoms
->chargeA
;
130 nvdwtype
= fr
->ntype
;
132 vdwtype
= mdatoms
->typeA
;
134 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
135 ewtab
= fr
->ic
->tabq_coul_FDV0
;
136 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
137 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
139 /* Setup water-specific parameters */
140 inr
= nlist
->iinr
[0];
141 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
142 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
143 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
144 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
146 jq0
= _mm_set1_pd(charge
[inr
+0]);
147 jq1
= _mm_set1_pd(charge
[inr
+1]);
148 jq2
= _mm_set1_pd(charge
[inr
+2]);
149 vdwjidx0A
= 2*vdwtype
[inr
+0];
150 qq00
= _mm_mul_pd(iq0
,jq0
);
151 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
152 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
153 qq01
= _mm_mul_pd(iq0
,jq1
);
154 qq02
= _mm_mul_pd(iq0
,jq2
);
155 qq10
= _mm_mul_pd(iq1
,jq0
);
156 qq11
= _mm_mul_pd(iq1
,jq1
);
157 qq12
= _mm_mul_pd(iq1
,jq2
);
158 qq20
= _mm_mul_pd(iq2
,jq0
);
159 qq21
= _mm_mul_pd(iq2
,jq1
);
160 qq22
= _mm_mul_pd(iq2
,jq2
);
162 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163 rcutoff_scalar
= fr
->rcoulomb
;
164 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
165 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
167 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
168 rvdw
= _mm_set1_pd(fr
->rvdw
);
170 /* Avoid stupid compiler warnings */
178 /* Start outer loop over neighborlists */
179 for(iidx
=0; iidx
<nri
; iidx
++)
181 /* Load shift vector for this list */
182 i_shift_offset
= DIM
*shiftidx
[iidx
];
184 /* Load limits for loop over neighbors */
185 j_index_start
= jindex
[iidx
];
186 j_index_end
= jindex
[iidx
+1];
188 /* Get outer coordinate index */
190 i_coord_offset
= DIM
*inr
;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
194 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
196 fix0
= _mm_setzero_pd();
197 fiy0
= _mm_setzero_pd();
198 fiz0
= _mm_setzero_pd();
199 fix1
= _mm_setzero_pd();
200 fiy1
= _mm_setzero_pd();
201 fiz1
= _mm_setzero_pd();
202 fix2
= _mm_setzero_pd();
203 fiy2
= _mm_setzero_pd();
204 fiz2
= _mm_setzero_pd();
206 /* Reset potential sums */
207 velecsum
= _mm_setzero_pd();
208 vvdwsum
= _mm_setzero_pd();
210 /* Start inner kernel loop */
211 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
214 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA
= DIM
*jnrA
;
218 j_coord_offsetB
= DIM
*jnrB
;
220 /* load j atom coordinates */
221 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
222 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
224 /* Calculate displacement vector */
225 dx00
= _mm_sub_pd(ix0
,jx0
);
226 dy00
= _mm_sub_pd(iy0
,jy0
);
227 dz00
= _mm_sub_pd(iz0
,jz0
);
228 dx01
= _mm_sub_pd(ix0
,jx1
);
229 dy01
= _mm_sub_pd(iy0
,jy1
);
230 dz01
= _mm_sub_pd(iz0
,jz1
);
231 dx02
= _mm_sub_pd(ix0
,jx2
);
232 dy02
= _mm_sub_pd(iy0
,jy2
);
233 dz02
= _mm_sub_pd(iz0
,jz2
);
234 dx10
= _mm_sub_pd(ix1
,jx0
);
235 dy10
= _mm_sub_pd(iy1
,jy0
);
236 dz10
= _mm_sub_pd(iz1
,jz0
);
237 dx11
= _mm_sub_pd(ix1
,jx1
);
238 dy11
= _mm_sub_pd(iy1
,jy1
);
239 dz11
= _mm_sub_pd(iz1
,jz1
);
240 dx12
= _mm_sub_pd(ix1
,jx2
);
241 dy12
= _mm_sub_pd(iy1
,jy2
);
242 dz12
= _mm_sub_pd(iz1
,jz2
);
243 dx20
= _mm_sub_pd(ix2
,jx0
);
244 dy20
= _mm_sub_pd(iy2
,jy0
);
245 dz20
= _mm_sub_pd(iz2
,jz0
);
246 dx21
= _mm_sub_pd(ix2
,jx1
);
247 dy21
= _mm_sub_pd(iy2
,jy1
);
248 dz21
= _mm_sub_pd(iz2
,jz1
);
249 dx22
= _mm_sub_pd(ix2
,jx2
);
250 dy22
= _mm_sub_pd(iy2
,jy2
);
251 dz22
= _mm_sub_pd(iz2
,jz2
);
253 /* Calculate squared distance and things based on it */
254 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
255 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
256 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
257 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
258 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
259 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
260 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
261 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
262 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
264 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
265 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
266 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
267 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
268 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
269 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
270 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
271 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
272 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
274 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
275 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
276 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
277 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
278 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
279 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
280 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
281 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
282 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
284 fjx0
= _mm_setzero_pd();
285 fjy0
= _mm_setzero_pd();
286 fjz0
= _mm_setzero_pd();
287 fjx1
= _mm_setzero_pd();
288 fjy1
= _mm_setzero_pd();
289 fjz1
= _mm_setzero_pd();
290 fjx2
= _mm_setzero_pd();
291 fjy2
= _mm_setzero_pd();
292 fjz2
= _mm_setzero_pd();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
301 r00
= _mm_mul_pd(rsq00
,rinv00
);
303 /* EWALD ELECTROSTATICS */
305 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
306 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
307 ewitab
= _mm_cvttpd_epi32(ewrt
);
308 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
309 ewitab
= _mm_slli_epi32(ewitab
,2);
310 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
311 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
312 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
313 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
314 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
315 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
316 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
317 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
318 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
319 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
321 /* LENNARD-JONES DISPERSION/REPULSION */
323 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
324 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
325 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
326 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
327 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
328 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
330 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
332 /* Update potential sum for this i atom from the interaction with this j atom. */
333 velec
= _mm_and_pd(velec
,cutoff_mask
);
334 velecsum
= _mm_add_pd(velecsum
,velec
);
335 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
336 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
338 fscal
= _mm_add_pd(felec
,fvdw
);
340 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
342 /* Calculate temporary vectorial force */
343 tx
= _mm_mul_pd(fscal
,dx00
);
344 ty
= _mm_mul_pd(fscal
,dy00
);
345 tz
= _mm_mul_pd(fscal
,dz00
);
347 /* Update vectorial force */
348 fix0
= _mm_add_pd(fix0
,tx
);
349 fiy0
= _mm_add_pd(fiy0
,ty
);
350 fiz0
= _mm_add_pd(fiz0
,tz
);
352 fjx0
= _mm_add_pd(fjx0
,tx
);
353 fjy0
= _mm_add_pd(fjy0
,ty
);
354 fjz0
= _mm_add_pd(fjz0
,tz
);
358 /**************************
359 * CALCULATE INTERACTIONS *
360 **************************/
362 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
365 r01
= _mm_mul_pd(rsq01
,rinv01
);
367 /* EWALD ELECTROSTATICS */
369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
370 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
371 ewitab
= _mm_cvttpd_epi32(ewrt
);
372 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
373 ewitab
= _mm_slli_epi32(ewitab
,2);
374 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
375 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
376 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
377 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
378 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
379 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
380 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
381 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
382 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_sub_pd(rinv01
,sh_ewald
),velec
));
383 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
385 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velec
= _mm_and_pd(velec
,cutoff_mask
);
389 velecsum
= _mm_add_pd(velecsum
,velec
);
393 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
395 /* Calculate temporary vectorial force */
396 tx
= _mm_mul_pd(fscal
,dx01
);
397 ty
= _mm_mul_pd(fscal
,dy01
);
398 tz
= _mm_mul_pd(fscal
,dz01
);
400 /* Update vectorial force */
401 fix0
= _mm_add_pd(fix0
,tx
);
402 fiy0
= _mm_add_pd(fiy0
,ty
);
403 fiz0
= _mm_add_pd(fiz0
,tz
);
405 fjx1
= _mm_add_pd(fjx1
,tx
);
406 fjy1
= _mm_add_pd(fjy1
,ty
);
407 fjz1
= _mm_add_pd(fjz1
,tz
);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
418 r02
= _mm_mul_pd(rsq02
,rinv02
);
420 /* EWALD ELECTROSTATICS */
422 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
423 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
424 ewitab
= _mm_cvttpd_epi32(ewrt
);
425 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
426 ewitab
= _mm_slli_epi32(ewitab
,2);
427 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
428 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
429 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
430 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
431 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
432 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
433 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
434 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
435 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_sub_pd(rinv02
,sh_ewald
),velec
));
436 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
438 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
440 /* Update potential sum for this i atom from the interaction with this j atom. */
441 velec
= _mm_and_pd(velec
,cutoff_mask
);
442 velecsum
= _mm_add_pd(velecsum
,velec
);
446 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
448 /* Calculate temporary vectorial force */
449 tx
= _mm_mul_pd(fscal
,dx02
);
450 ty
= _mm_mul_pd(fscal
,dy02
);
451 tz
= _mm_mul_pd(fscal
,dz02
);
453 /* Update vectorial force */
454 fix0
= _mm_add_pd(fix0
,tx
);
455 fiy0
= _mm_add_pd(fiy0
,ty
);
456 fiz0
= _mm_add_pd(fiz0
,tz
);
458 fjx2
= _mm_add_pd(fjx2
,tx
);
459 fjy2
= _mm_add_pd(fjy2
,ty
);
460 fjz2
= _mm_add_pd(fjz2
,tz
);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
471 r10
= _mm_mul_pd(rsq10
,rinv10
);
473 /* EWALD ELECTROSTATICS */
475 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
476 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
477 ewitab
= _mm_cvttpd_epi32(ewrt
);
478 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
479 ewitab
= _mm_slli_epi32(ewitab
,2);
480 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
481 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
482 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
483 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
484 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
485 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
486 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
487 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
488 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
489 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
491 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velec
= _mm_and_pd(velec
,cutoff_mask
);
495 velecsum
= _mm_add_pd(velecsum
,velec
);
499 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
501 /* Calculate temporary vectorial force */
502 tx
= _mm_mul_pd(fscal
,dx10
);
503 ty
= _mm_mul_pd(fscal
,dy10
);
504 tz
= _mm_mul_pd(fscal
,dz10
);
506 /* Update vectorial force */
507 fix1
= _mm_add_pd(fix1
,tx
);
508 fiy1
= _mm_add_pd(fiy1
,ty
);
509 fiz1
= _mm_add_pd(fiz1
,tz
);
511 fjx0
= _mm_add_pd(fjx0
,tx
);
512 fjy0
= _mm_add_pd(fjy0
,ty
);
513 fjz0
= _mm_add_pd(fjz0
,tz
);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
524 r11
= _mm_mul_pd(rsq11
,rinv11
);
526 /* EWALD ELECTROSTATICS */
528 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
529 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
530 ewitab
= _mm_cvttpd_epi32(ewrt
);
531 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
532 ewitab
= _mm_slli_epi32(ewitab
,2);
533 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
534 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
535 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
536 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
537 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
538 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
539 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
540 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
541 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
542 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
544 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
546 /* Update potential sum for this i atom from the interaction with this j atom. */
547 velec
= _mm_and_pd(velec
,cutoff_mask
);
548 velecsum
= _mm_add_pd(velecsum
,velec
);
552 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
554 /* Calculate temporary vectorial force */
555 tx
= _mm_mul_pd(fscal
,dx11
);
556 ty
= _mm_mul_pd(fscal
,dy11
);
557 tz
= _mm_mul_pd(fscal
,dz11
);
559 /* Update vectorial force */
560 fix1
= _mm_add_pd(fix1
,tx
);
561 fiy1
= _mm_add_pd(fiy1
,ty
);
562 fiz1
= _mm_add_pd(fiz1
,tz
);
564 fjx1
= _mm_add_pd(fjx1
,tx
);
565 fjy1
= _mm_add_pd(fjy1
,ty
);
566 fjz1
= _mm_add_pd(fjz1
,tz
);
570 /**************************
571 * CALCULATE INTERACTIONS *
572 **************************/
574 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
577 r12
= _mm_mul_pd(rsq12
,rinv12
);
579 /* EWALD ELECTROSTATICS */
581 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
582 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
583 ewitab
= _mm_cvttpd_epi32(ewrt
);
584 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
585 ewitab
= _mm_slli_epi32(ewitab
,2);
586 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
587 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
588 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
589 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
590 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
591 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
592 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
593 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
594 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
595 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
597 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
599 /* Update potential sum for this i atom from the interaction with this j atom. */
600 velec
= _mm_and_pd(velec
,cutoff_mask
);
601 velecsum
= _mm_add_pd(velecsum
,velec
);
605 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
607 /* Calculate temporary vectorial force */
608 tx
= _mm_mul_pd(fscal
,dx12
);
609 ty
= _mm_mul_pd(fscal
,dy12
);
610 tz
= _mm_mul_pd(fscal
,dz12
);
612 /* Update vectorial force */
613 fix1
= _mm_add_pd(fix1
,tx
);
614 fiy1
= _mm_add_pd(fiy1
,ty
);
615 fiz1
= _mm_add_pd(fiz1
,tz
);
617 fjx2
= _mm_add_pd(fjx2
,tx
);
618 fjy2
= _mm_add_pd(fjy2
,ty
);
619 fjz2
= _mm_add_pd(fjz2
,tz
);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
630 r20
= _mm_mul_pd(rsq20
,rinv20
);
632 /* EWALD ELECTROSTATICS */
634 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
635 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
636 ewitab
= _mm_cvttpd_epi32(ewrt
);
637 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
638 ewitab
= _mm_slli_epi32(ewitab
,2);
639 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
640 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
641 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
642 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
643 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
644 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
645 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
646 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
647 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
648 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
650 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velec
= _mm_and_pd(velec
,cutoff_mask
);
654 velecsum
= _mm_add_pd(velecsum
,velec
);
658 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
660 /* Calculate temporary vectorial force */
661 tx
= _mm_mul_pd(fscal
,dx20
);
662 ty
= _mm_mul_pd(fscal
,dy20
);
663 tz
= _mm_mul_pd(fscal
,dz20
);
665 /* Update vectorial force */
666 fix2
= _mm_add_pd(fix2
,tx
);
667 fiy2
= _mm_add_pd(fiy2
,ty
);
668 fiz2
= _mm_add_pd(fiz2
,tz
);
670 fjx0
= _mm_add_pd(fjx0
,tx
);
671 fjy0
= _mm_add_pd(fjy0
,ty
);
672 fjz0
= _mm_add_pd(fjz0
,tz
);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
683 r21
= _mm_mul_pd(rsq21
,rinv21
);
685 /* EWALD ELECTROSTATICS */
687 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
688 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
689 ewitab
= _mm_cvttpd_epi32(ewrt
);
690 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
691 ewitab
= _mm_slli_epi32(ewitab
,2);
692 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
693 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
694 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
695 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
696 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
697 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
698 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
699 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
700 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
701 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
703 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
705 /* Update potential sum for this i atom from the interaction with this j atom. */
706 velec
= _mm_and_pd(velec
,cutoff_mask
);
707 velecsum
= _mm_add_pd(velecsum
,velec
);
711 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
713 /* Calculate temporary vectorial force */
714 tx
= _mm_mul_pd(fscal
,dx21
);
715 ty
= _mm_mul_pd(fscal
,dy21
);
716 tz
= _mm_mul_pd(fscal
,dz21
);
718 /* Update vectorial force */
719 fix2
= _mm_add_pd(fix2
,tx
);
720 fiy2
= _mm_add_pd(fiy2
,ty
);
721 fiz2
= _mm_add_pd(fiz2
,tz
);
723 fjx1
= _mm_add_pd(fjx1
,tx
);
724 fjy1
= _mm_add_pd(fjy1
,ty
);
725 fjz1
= _mm_add_pd(fjz1
,tz
);
729 /**************************
730 * CALCULATE INTERACTIONS *
731 **************************/
733 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
736 r22
= _mm_mul_pd(rsq22
,rinv22
);
738 /* EWALD ELECTROSTATICS */
740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
741 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
742 ewitab
= _mm_cvttpd_epi32(ewrt
);
743 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
744 ewitab
= _mm_slli_epi32(ewitab
,2);
745 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
746 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
747 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
748 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
749 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
750 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
751 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
752 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
753 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
754 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
756 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
758 /* Update potential sum for this i atom from the interaction with this j atom. */
759 velec
= _mm_and_pd(velec
,cutoff_mask
);
760 velecsum
= _mm_add_pd(velecsum
,velec
);
764 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
766 /* Calculate temporary vectorial force */
767 tx
= _mm_mul_pd(fscal
,dx22
);
768 ty
= _mm_mul_pd(fscal
,dy22
);
769 tz
= _mm_mul_pd(fscal
,dz22
);
771 /* Update vectorial force */
772 fix2
= _mm_add_pd(fix2
,tx
);
773 fiy2
= _mm_add_pd(fiy2
,ty
);
774 fiz2
= _mm_add_pd(fiz2
,tz
);
776 fjx2
= _mm_add_pd(fjx2
,tx
);
777 fjy2
= _mm_add_pd(fjy2
,ty
);
778 fjz2
= _mm_add_pd(fjz2
,tz
);
782 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
784 /* Inner loop uses 432 flops */
791 j_coord_offsetA
= DIM
*jnrA
;
793 /* load j atom coordinates */
794 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
795 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
797 /* Calculate displacement vector */
798 dx00
= _mm_sub_pd(ix0
,jx0
);
799 dy00
= _mm_sub_pd(iy0
,jy0
);
800 dz00
= _mm_sub_pd(iz0
,jz0
);
801 dx01
= _mm_sub_pd(ix0
,jx1
);
802 dy01
= _mm_sub_pd(iy0
,jy1
);
803 dz01
= _mm_sub_pd(iz0
,jz1
);
804 dx02
= _mm_sub_pd(ix0
,jx2
);
805 dy02
= _mm_sub_pd(iy0
,jy2
);
806 dz02
= _mm_sub_pd(iz0
,jz2
);
807 dx10
= _mm_sub_pd(ix1
,jx0
);
808 dy10
= _mm_sub_pd(iy1
,jy0
);
809 dz10
= _mm_sub_pd(iz1
,jz0
);
810 dx11
= _mm_sub_pd(ix1
,jx1
);
811 dy11
= _mm_sub_pd(iy1
,jy1
);
812 dz11
= _mm_sub_pd(iz1
,jz1
);
813 dx12
= _mm_sub_pd(ix1
,jx2
);
814 dy12
= _mm_sub_pd(iy1
,jy2
);
815 dz12
= _mm_sub_pd(iz1
,jz2
);
816 dx20
= _mm_sub_pd(ix2
,jx0
);
817 dy20
= _mm_sub_pd(iy2
,jy0
);
818 dz20
= _mm_sub_pd(iz2
,jz0
);
819 dx21
= _mm_sub_pd(ix2
,jx1
);
820 dy21
= _mm_sub_pd(iy2
,jy1
);
821 dz21
= _mm_sub_pd(iz2
,jz1
);
822 dx22
= _mm_sub_pd(ix2
,jx2
);
823 dy22
= _mm_sub_pd(iy2
,jy2
);
824 dz22
= _mm_sub_pd(iz2
,jz2
);
826 /* Calculate squared distance and things based on it */
827 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
828 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
829 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
830 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
831 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
832 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
833 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
834 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
835 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
837 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
838 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
839 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
840 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
841 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
842 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
843 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
844 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
845 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
847 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
848 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
849 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
850 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
851 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
852 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
853 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
854 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
855 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
857 fjx0
= _mm_setzero_pd();
858 fjy0
= _mm_setzero_pd();
859 fjz0
= _mm_setzero_pd();
860 fjx1
= _mm_setzero_pd();
861 fjy1
= _mm_setzero_pd();
862 fjz1
= _mm_setzero_pd();
863 fjx2
= _mm_setzero_pd();
864 fjy2
= _mm_setzero_pd();
865 fjz2
= _mm_setzero_pd();
867 /**************************
868 * CALCULATE INTERACTIONS *
869 **************************/
871 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
874 r00
= _mm_mul_pd(rsq00
,rinv00
);
876 /* EWALD ELECTROSTATICS */
878 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
879 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
880 ewitab
= _mm_cvttpd_epi32(ewrt
);
881 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
882 ewitab
= _mm_slli_epi32(ewitab
,2);
883 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
884 ewtabD
= _mm_setzero_pd();
885 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
886 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
887 ewtabFn
= _mm_setzero_pd();
888 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
889 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
890 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
891 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_sub_pd(rinv00
,sh_ewald
),velec
));
892 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
894 /* LENNARD-JONES DISPERSION/REPULSION */
896 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
897 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
898 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
899 vvdw
= _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12
, _mm_mul_pd(c12_00
,_mm_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
900 _mm_mul_pd( _mm_sub_pd(vvdw6
,_mm_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
901 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
903 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
905 /* Update potential sum for this i atom from the interaction with this j atom. */
906 velec
= _mm_and_pd(velec
,cutoff_mask
);
907 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
908 velecsum
= _mm_add_pd(velecsum
,velec
);
909 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
910 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
911 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
913 fscal
= _mm_add_pd(felec
,fvdw
);
915 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
917 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
919 /* Calculate temporary vectorial force */
920 tx
= _mm_mul_pd(fscal
,dx00
);
921 ty
= _mm_mul_pd(fscal
,dy00
);
922 tz
= _mm_mul_pd(fscal
,dz00
);
924 /* Update vectorial force */
925 fix0
= _mm_add_pd(fix0
,tx
);
926 fiy0
= _mm_add_pd(fiy0
,ty
);
927 fiz0
= _mm_add_pd(fiz0
,tz
);
929 fjx0
= _mm_add_pd(fjx0
,tx
);
930 fjy0
= _mm_add_pd(fjy0
,ty
);
931 fjz0
= _mm_add_pd(fjz0
,tz
);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
942 r01
= _mm_mul_pd(rsq01
,rinv01
);
944 /* EWALD ELECTROSTATICS */
946 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
947 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
948 ewitab
= _mm_cvttpd_epi32(ewrt
);
949 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
950 ewitab
= _mm_slli_epi32(ewitab
,2);
951 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
952 ewtabD
= _mm_setzero_pd();
953 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
954 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
955 ewtabFn
= _mm_setzero_pd();
956 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
957 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
958 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
959 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_sub_pd(rinv01
,sh_ewald
),velec
));
960 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
962 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec
= _mm_and_pd(velec
,cutoff_mask
);
966 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
967 velecsum
= _mm_add_pd(velecsum
,velec
);
971 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
973 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
975 /* Calculate temporary vectorial force */
976 tx
= _mm_mul_pd(fscal
,dx01
);
977 ty
= _mm_mul_pd(fscal
,dy01
);
978 tz
= _mm_mul_pd(fscal
,dz01
);
980 /* Update vectorial force */
981 fix0
= _mm_add_pd(fix0
,tx
);
982 fiy0
= _mm_add_pd(fiy0
,ty
);
983 fiz0
= _mm_add_pd(fiz0
,tz
);
985 fjx1
= _mm_add_pd(fjx1
,tx
);
986 fjy1
= _mm_add_pd(fjy1
,ty
);
987 fjz1
= _mm_add_pd(fjz1
,tz
);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
998 r02
= _mm_mul_pd(rsq02
,rinv02
);
1000 /* EWALD ELECTROSTATICS */
1002 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1003 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
1004 ewitab
= _mm_cvttpd_epi32(ewrt
);
1005 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1006 ewitab
= _mm_slli_epi32(ewitab
,2);
1007 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1008 ewtabD
= _mm_setzero_pd();
1009 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1010 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1011 ewtabFn
= _mm_setzero_pd();
1012 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1013 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1014 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1015 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_sub_pd(rinv02
,sh_ewald
),velec
));
1016 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
1018 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1020 /* Update potential sum for this i atom from the interaction with this j atom. */
1021 velec
= _mm_and_pd(velec
,cutoff_mask
);
1022 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1023 velecsum
= _mm_add_pd(velecsum
,velec
);
1027 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1029 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1031 /* Calculate temporary vectorial force */
1032 tx
= _mm_mul_pd(fscal
,dx02
);
1033 ty
= _mm_mul_pd(fscal
,dy02
);
1034 tz
= _mm_mul_pd(fscal
,dz02
);
1036 /* Update vectorial force */
1037 fix0
= _mm_add_pd(fix0
,tx
);
1038 fiy0
= _mm_add_pd(fiy0
,ty
);
1039 fiz0
= _mm_add_pd(fiz0
,tz
);
1041 fjx2
= _mm_add_pd(fjx2
,tx
);
1042 fjy2
= _mm_add_pd(fjy2
,ty
);
1043 fjz2
= _mm_add_pd(fjz2
,tz
);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1054 r10
= _mm_mul_pd(rsq10
,rinv10
);
1056 /* EWALD ELECTROSTATICS */
1058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1059 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1060 ewitab
= _mm_cvttpd_epi32(ewrt
);
1061 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1062 ewitab
= _mm_slli_epi32(ewitab
,2);
1063 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1064 ewtabD
= _mm_setzero_pd();
1065 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1066 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1067 ewtabFn
= _mm_setzero_pd();
1068 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1069 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1070 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1071 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_sub_pd(rinv10
,sh_ewald
),velec
));
1072 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1074 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec
= _mm_and_pd(velec
,cutoff_mask
);
1078 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1079 velecsum
= _mm_add_pd(velecsum
,velec
);
1083 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1085 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1087 /* Calculate temporary vectorial force */
1088 tx
= _mm_mul_pd(fscal
,dx10
);
1089 ty
= _mm_mul_pd(fscal
,dy10
);
1090 tz
= _mm_mul_pd(fscal
,dz10
);
1092 /* Update vectorial force */
1093 fix1
= _mm_add_pd(fix1
,tx
);
1094 fiy1
= _mm_add_pd(fiy1
,ty
);
1095 fiz1
= _mm_add_pd(fiz1
,tz
);
1097 fjx0
= _mm_add_pd(fjx0
,tx
);
1098 fjy0
= _mm_add_pd(fjy0
,ty
);
1099 fjz0
= _mm_add_pd(fjz0
,tz
);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1110 r11
= _mm_mul_pd(rsq11
,rinv11
);
1112 /* EWALD ELECTROSTATICS */
1114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1115 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1116 ewitab
= _mm_cvttpd_epi32(ewrt
);
1117 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1118 ewitab
= _mm_slli_epi32(ewitab
,2);
1119 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1120 ewtabD
= _mm_setzero_pd();
1121 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1122 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1123 ewtabFn
= _mm_setzero_pd();
1124 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1125 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1126 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1127 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_sub_pd(rinv11
,sh_ewald
),velec
));
1128 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1130 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec
= _mm_and_pd(velec
,cutoff_mask
);
1134 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1135 velecsum
= _mm_add_pd(velecsum
,velec
);
1139 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1141 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1143 /* Calculate temporary vectorial force */
1144 tx
= _mm_mul_pd(fscal
,dx11
);
1145 ty
= _mm_mul_pd(fscal
,dy11
);
1146 tz
= _mm_mul_pd(fscal
,dz11
);
1148 /* Update vectorial force */
1149 fix1
= _mm_add_pd(fix1
,tx
);
1150 fiy1
= _mm_add_pd(fiy1
,ty
);
1151 fiz1
= _mm_add_pd(fiz1
,tz
);
1153 fjx1
= _mm_add_pd(fjx1
,tx
);
1154 fjy1
= _mm_add_pd(fjy1
,ty
);
1155 fjz1
= _mm_add_pd(fjz1
,tz
);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1166 r12
= _mm_mul_pd(rsq12
,rinv12
);
1168 /* EWALD ELECTROSTATICS */
1170 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1171 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1172 ewitab
= _mm_cvttpd_epi32(ewrt
);
1173 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1174 ewitab
= _mm_slli_epi32(ewitab
,2);
1175 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1176 ewtabD
= _mm_setzero_pd();
1177 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1178 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1179 ewtabFn
= _mm_setzero_pd();
1180 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1181 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1182 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1183 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_sub_pd(rinv12
,sh_ewald
),velec
));
1184 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1186 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec
= _mm_and_pd(velec
,cutoff_mask
);
1190 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1191 velecsum
= _mm_add_pd(velecsum
,velec
);
1195 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1197 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1199 /* Calculate temporary vectorial force */
1200 tx
= _mm_mul_pd(fscal
,dx12
);
1201 ty
= _mm_mul_pd(fscal
,dy12
);
1202 tz
= _mm_mul_pd(fscal
,dz12
);
1204 /* Update vectorial force */
1205 fix1
= _mm_add_pd(fix1
,tx
);
1206 fiy1
= _mm_add_pd(fiy1
,ty
);
1207 fiz1
= _mm_add_pd(fiz1
,tz
);
1209 fjx2
= _mm_add_pd(fjx2
,tx
);
1210 fjy2
= _mm_add_pd(fjy2
,ty
);
1211 fjz2
= _mm_add_pd(fjz2
,tz
);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1222 r20
= _mm_mul_pd(rsq20
,rinv20
);
1224 /* EWALD ELECTROSTATICS */
1226 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1227 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1228 ewitab
= _mm_cvttpd_epi32(ewrt
);
1229 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1230 ewitab
= _mm_slli_epi32(ewitab
,2);
1231 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1232 ewtabD
= _mm_setzero_pd();
1233 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1234 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1235 ewtabFn
= _mm_setzero_pd();
1236 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1237 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1238 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1239 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_sub_pd(rinv20
,sh_ewald
),velec
));
1240 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1242 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec
= _mm_and_pd(velec
,cutoff_mask
);
1246 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1247 velecsum
= _mm_add_pd(velecsum
,velec
);
1251 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1253 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1255 /* Calculate temporary vectorial force */
1256 tx
= _mm_mul_pd(fscal
,dx20
);
1257 ty
= _mm_mul_pd(fscal
,dy20
);
1258 tz
= _mm_mul_pd(fscal
,dz20
);
1260 /* Update vectorial force */
1261 fix2
= _mm_add_pd(fix2
,tx
);
1262 fiy2
= _mm_add_pd(fiy2
,ty
);
1263 fiz2
= _mm_add_pd(fiz2
,tz
);
1265 fjx0
= _mm_add_pd(fjx0
,tx
);
1266 fjy0
= _mm_add_pd(fjy0
,ty
);
1267 fjz0
= _mm_add_pd(fjz0
,tz
);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1278 r21
= _mm_mul_pd(rsq21
,rinv21
);
1280 /* EWALD ELECTROSTATICS */
1282 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1283 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1284 ewitab
= _mm_cvttpd_epi32(ewrt
);
1285 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1286 ewitab
= _mm_slli_epi32(ewitab
,2);
1287 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1288 ewtabD
= _mm_setzero_pd();
1289 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1290 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1291 ewtabFn
= _mm_setzero_pd();
1292 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1293 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1294 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1295 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_sub_pd(rinv21
,sh_ewald
),velec
));
1296 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1298 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1300 /* Update potential sum for this i atom from the interaction with this j atom. */
1301 velec
= _mm_and_pd(velec
,cutoff_mask
);
1302 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1303 velecsum
= _mm_add_pd(velecsum
,velec
);
1307 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1309 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1311 /* Calculate temporary vectorial force */
1312 tx
= _mm_mul_pd(fscal
,dx21
);
1313 ty
= _mm_mul_pd(fscal
,dy21
);
1314 tz
= _mm_mul_pd(fscal
,dz21
);
1316 /* Update vectorial force */
1317 fix2
= _mm_add_pd(fix2
,tx
);
1318 fiy2
= _mm_add_pd(fiy2
,ty
);
1319 fiz2
= _mm_add_pd(fiz2
,tz
);
1321 fjx1
= _mm_add_pd(fjx1
,tx
);
1322 fjy1
= _mm_add_pd(fjy1
,ty
);
1323 fjz1
= _mm_add_pd(fjz1
,tz
);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1334 r22
= _mm_mul_pd(rsq22
,rinv22
);
1336 /* EWALD ELECTROSTATICS */
1338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1339 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1340 ewitab
= _mm_cvttpd_epi32(ewrt
);
1341 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1342 ewitab
= _mm_slli_epi32(ewitab
,2);
1343 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1344 ewtabD
= _mm_setzero_pd();
1345 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1346 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1347 ewtabFn
= _mm_setzero_pd();
1348 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1349 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1350 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1351 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_sub_pd(rinv22
,sh_ewald
),velec
));
1352 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1354 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1356 /* Update potential sum for this i atom from the interaction with this j atom. */
1357 velec
= _mm_and_pd(velec
,cutoff_mask
);
1358 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1359 velecsum
= _mm_add_pd(velecsum
,velec
);
1363 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1365 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1367 /* Calculate temporary vectorial force */
1368 tx
= _mm_mul_pd(fscal
,dx22
);
1369 ty
= _mm_mul_pd(fscal
,dy22
);
1370 tz
= _mm_mul_pd(fscal
,dz22
);
1372 /* Update vectorial force */
1373 fix2
= _mm_add_pd(fix2
,tx
);
1374 fiy2
= _mm_add_pd(fiy2
,ty
);
1375 fiz2
= _mm_add_pd(fiz2
,tz
);
1377 fjx2
= _mm_add_pd(fjx2
,tx
);
1378 fjy2
= _mm_add_pd(fjy2
,ty
);
1379 fjz2
= _mm_add_pd(fjz2
,tz
);
1383 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1385 /* Inner loop uses 432 flops */
1388 /* End of innermost loop */
1390 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1391 f
+i_coord_offset
,fshift
+i_shift_offset
);
1394 /* Update potential energies */
1395 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1396 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1398 /* Increment number of inner iterations */
1399 inneriter
+= j_index_end
- j_index_start
;
1401 /* Outer loop uses 20 flops */
1404 /* Increment number of outer iterations */
1407 /* Update outer/inner flops */
1409 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*432);
1412 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double
1413 * Electrostatics interaction: Ewald
1414 * VdW interaction: LennardJones
1415 * Geometry: Water3-Water3
1416 * Calculate force/pot: Force
1419 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double
1420 (t_nblist
* gmx_restrict nlist
,
1421 rvec
* gmx_restrict xx
,
1422 rvec
* gmx_restrict ff
,
1423 t_forcerec
* gmx_restrict fr
,
1424 t_mdatoms
* gmx_restrict mdatoms
,
1425 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1426 t_nrnb
* gmx_restrict nrnb
)
1428 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1429 * just 0 for non-waters.
1430 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1431 * jnr indices corresponding to data put in the four positions in the SIMD register.
1433 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1434 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1436 int j_coord_offsetA
,j_coord_offsetB
;
1437 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1438 real rcutoff_scalar
;
1439 real
*shiftvec
,*fshift
,*x
,*f
;
1440 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1442 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1444 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1446 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1447 int vdwjidx0A
,vdwjidx0B
;
1448 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1449 int vdwjidx1A
,vdwjidx1B
;
1450 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1451 int vdwjidx2A
,vdwjidx2B
;
1452 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1453 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1454 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1455 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1456 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1457 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1458 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1459 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1460 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1461 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1462 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1465 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1468 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1469 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1471 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1473 __m128d dummy_mask
,cutoff_mask
;
1474 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1475 __m128d one
= _mm_set1_pd(1.0);
1476 __m128d two
= _mm_set1_pd(2.0);
1482 jindex
= nlist
->jindex
;
1484 shiftidx
= nlist
->shift
;
1486 shiftvec
= fr
->shift_vec
[0];
1487 fshift
= fr
->fshift
[0];
1488 facel
= _mm_set1_pd(fr
->epsfac
);
1489 charge
= mdatoms
->chargeA
;
1490 nvdwtype
= fr
->ntype
;
1491 vdwparam
= fr
->nbfp
;
1492 vdwtype
= mdatoms
->typeA
;
1494 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1495 ewtab
= fr
->ic
->tabq_coul_F
;
1496 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1497 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1499 /* Setup water-specific parameters */
1500 inr
= nlist
->iinr
[0];
1501 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1502 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1503 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1504 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1506 jq0
= _mm_set1_pd(charge
[inr
+0]);
1507 jq1
= _mm_set1_pd(charge
[inr
+1]);
1508 jq2
= _mm_set1_pd(charge
[inr
+2]);
1509 vdwjidx0A
= 2*vdwtype
[inr
+0];
1510 qq00
= _mm_mul_pd(iq0
,jq0
);
1511 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1512 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1513 qq01
= _mm_mul_pd(iq0
,jq1
);
1514 qq02
= _mm_mul_pd(iq0
,jq2
);
1515 qq10
= _mm_mul_pd(iq1
,jq0
);
1516 qq11
= _mm_mul_pd(iq1
,jq1
);
1517 qq12
= _mm_mul_pd(iq1
,jq2
);
1518 qq20
= _mm_mul_pd(iq2
,jq0
);
1519 qq21
= _mm_mul_pd(iq2
,jq1
);
1520 qq22
= _mm_mul_pd(iq2
,jq2
);
1522 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1523 rcutoff_scalar
= fr
->rcoulomb
;
1524 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1525 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1527 sh_vdw_invrcut6
= _mm_set1_pd(fr
->ic
->sh_invrc6
);
1528 rvdw
= _mm_set1_pd(fr
->rvdw
);
1530 /* Avoid stupid compiler warnings */
1532 j_coord_offsetA
= 0;
1533 j_coord_offsetB
= 0;
1538 /* Start outer loop over neighborlists */
1539 for(iidx
=0; iidx
<nri
; iidx
++)
1541 /* Load shift vector for this list */
1542 i_shift_offset
= DIM
*shiftidx
[iidx
];
1544 /* Load limits for loop over neighbors */
1545 j_index_start
= jindex
[iidx
];
1546 j_index_end
= jindex
[iidx
+1];
1548 /* Get outer coordinate index */
1550 i_coord_offset
= DIM
*inr
;
1552 /* Load i particle coords and add shift vector */
1553 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1554 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1556 fix0
= _mm_setzero_pd();
1557 fiy0
= _mm_setzero_pd();
1558 fiz0
= _mm_setzero_pd();
1559 fix1
= _mm_setzero_pd();
1560 fiy1
= _mm_setzero_pd();
1561 fiz1
= _mm_setzero_pd();
1562 fix2
= _mm_setzero_pd();
1563 fiy2
= _mm_setzero_pd();
1564 fiz2
= _mm_setzero_pd();
1566 /* Start inner kernel loop */
1567 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1570 /* Get j neighbor index, and coordinate index */
1572 jnrB
= jjnr
[jidx
+1];
1573 j_coord_offsetA
= DIM
*jnrA
;
1574 j_coord_offsetB
= DIM
*jnrB
;
1576 /* load j atom coordinates */
1577 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1578 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1580 /* Calculate displacement vector */
1581 dx00
= _mm_sub_pd(ix0
,jx0
);
1582 dy00
= _mm_sub_pd(iy0
,jy0
);
1583 dz00
= _mm_sub_pd(iz0
,jz0
);
1584 dx01
= _mm_sub_pd(ix0
,jx1
);
1585 dy01
= _mm_sub_pd(iy0
,jy1
);
1586 dz01
= _mm_sub_pd(iz0
,jz1
);
1587 dx02
= _mm_sub_pd(ix0
,jx2
);
1588 dy02
= _mm_sub_pd(iy0
,jy2
);
1589 dz02
= _mm_sub_pd(iz0
,jz2
);
1590 dx10
= _mm_sub_pd(ix1
,jx0
);
1591 dy10
= _mm_sub_pd(iy1
,jy0
);
1592 dz10
= _mm_sub_pd(iz1
,jz0
);
1593 dx11
= _mm_sub_pd(ix1
,jx1
);
1594 dy11
= _mm_sub_pd(iy1
,jy1
);
1595 dz11
= _mm_sub_pd(iz1
,jz1
);
1596 dx12
= _mm_sub_pd(ix1
,jx2
);
1597 dy12
= _mm_sub_pd(iy1
,jy2
);
1598 dz12
= _mm_sub_pd(iz1
,jz2
);
1599 dx20
= _mm_sub_pd(ix2
,jx0
);
1600 dy20
= _mm_sub_pd(iy2
,jy0
);
1601 dz20
= _mm_sub_pd(iz2
,jz0
);
1602 dx21
= _mm_sub_pd(ix2
,jx1
);
1603 dy21
= _mm_sub_pd(iy2
,jy1
);
1604 dz21
= _mm_sub_pd(iz2
,jz1
);
1605 dx22
= _mm_sub_pd(ix2
,jx2
);
1606 dy22
= _mm_sub_pd(iy2
,jy2
);
1607 dz22
= _mm_sub_pd(iz2
,jz2
);
1609 /* Calculate squared distance and things based on it */
1610 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1611 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1612 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1613 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1614 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1615 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1616 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1617 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1618 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1620 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1621 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1622 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1623 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1624 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1625 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1626 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1627 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1628 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1630 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1631 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1632 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1633 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1634 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1635 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1636 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1637 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1638 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1640 fjx0
= _mm_setzero_pd();
1641 fjy0
= _mm_setzero_pd();
1642 fjz0
= _mm_setzero_pd();
1643 fjx1
= _mm_setzero_pd();
1644 fjy1
= _mm_setzero_pd();
1645 fjz1
= _mm_setzero_pd();
1646 fjx2
= _mm_setzero_pd();
1647 fjy2
= _mm_setzero_pd();
1648 fjz2
= _mm_setzero_pd();
1650 /**************************
1651 * CALCULATE INTERACTIONS *
1652 **************************/
1654 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1657 r00
= _mm_mul_pd(rsq00
,rinv00
);
1659 /* EWALD ELECTROSTATICS */
1661 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1662 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
1663 ewitab
= _mm_cvttpd_epi32(ewrt
);
1664 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1665 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1667 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1668 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
1670 /* LENNARD-JONES DISPERSION/REPULSION */
1672 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1673 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1675 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1677 fscal
= _mm_add_pd(felec
,fvdw
);
1679 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1681 /* Calculate temporary vectorial force */
1682 tx
= _mm_mul_pd(fscal
,dx00
);
1683 ty
= _mm_mul_pd(fscal
,dy00
);
1684 tz
= _mm_mul_pd(fscal
,dz00
);
1686 /* Update vectorial force */
1687 fix0
= _mm_add_pd(fix0
,tx
);
1688 fiy0
= _mm_add_pd(fiy0
,ty
);
1689 fiz0
= _mm_add_pd(fiz0
,tz
);
1691 fjx0
= _mm_add_pd(fjx0
,tx
);
1692 fjy0
= _mm_add_pd(fjy0
,ty
);
1693 fjz0
= _mm_add_pd(fjz0
,tz
);
1697 /**************************
1698 * CALCULATE INTERACTIONS *
1699 **************************/
1701 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1704 r01
= _mm_mul_pd(rsq01
,rinv01
);
1706 /* EWALD ELECTROSTATICS */
1708 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1709 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
1710 ewitab
= _mm_cvttpd_epi32(ewrt
);
1711 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1712 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1714 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1715 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
1717 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
1721 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1723 /* Calculate temporary vectorial force */
1724 tx
= _mm_mul_pd(fscal
,dx01
);
1725 ty
= _mm_mul_pd(fscal
,dy01
);
1726 tz
= _mm_mul_pd(fscal
,dz01
);
1728 /* Update vectorial force */
1729 fix0
= _mm_add_pd(fix0
,tx
);
1730 fiy0
= _mm_add_pd(fiy0
,ty
);
1731 fiz0
= _mm_add_pd(fiz0
,tz
);
1733 fjx1
= _mm_add_pd(fjx1
,tx
);
1734 fjy1
= _mm_add_pd(fjy1
,ty
);
1735 fjz1
= _mm_add_pd(fjz1
,tz
);
1739 /**************************
1740 * CALCULATE INTERACTIONS *
1741 **************************/
1743 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1746 r02
= _mm_mul_pd(rsq02
,rinv02
);
1748 /* EWALD ELECTROSTATICS */
1750 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1751 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
1752 ewitab
= _mm_cvttpd_epi32(ewrt
);
1753 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1754 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1756 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1757 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
1759 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1763 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1765 /* Calculate temporary vectorial force */
1766 tx
= _mm_mul_pd(fscal
,dx02
);
1767 ty
= _mm_mul_pd(fscal
,dy02
);
1768 tz
= _mm_mul_pd(fscal
,dz02
);
1770 /* Update vectorial force */
1771 fix0
= _mm_add_pd(fix0
,tx
);
1772 fiy0
= _mm_add_pd(fiy0
,ty
);
1773 fiz0
= _mm_add_pd(fiz0
,tz
);
1775 fjx2
= _mm_add_pd(fjx2
,tx
);
1776 fjy2
= _mm_add_pd(fjy2
,ty
);
1777 fjz2
= _mm_add_pd(fjz2
,tz
);
1781 /**************************
1782 * CALCULATE INTERACTIONS *
1783 **************************/
1785 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1788 r10
= _mm_mul_pd(rsq10
,rinv10
);
1790 /* EWALD ELECTROSTATICS */
1792 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1793 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
1794 ewitab
= _mm_cvttpd_epi32(ewrt
);
1795 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1796 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1798 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1799 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
1801 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1805 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1807 /* Calculate temporary vectorial force */
1808 tx
= _mm_mul_pd(fscal
,dx10
);
1809 ty
= _mm_mul_pd(fscal
,dy10
);
1810 tz
= _mm_mul_pd(fscal
,dz10
);
1812 /* Update vectorial force */
1813 fix1
= _mm_add_pd(fix1
,tx
);
1814 fiy1
= _mm_add_pd(fiy1
,ty
);
1815 fiz1
= _mm_add_pd(fiz1
,tz
);
1817 fjx0
= _mm_add_pd(fjx0
,tx
);
1818 fjy0
= _mm_add_pd(fjy0
,ty
);
1819 fjz0
= _mm_add_pd(fjz0
,tz
);
1823 /**************************
1824 * CALCULATE INTERACTIONS *
1825 **************************/
1827 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1830 r11
= _mm_mul_pd(rsq11
,rinv11
);
1832 /* EWALD ELECTROSTATICS */
1834 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1835 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1836 ewitab
= _mm_cvttpd_epi32(ewrt
);
1837 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1838 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1840 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1841 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1843 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1847 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1849 /* Calculate temporary vectorial force */
1850 tx
= _mm_mul_pd(fscal
,dx11
);
1851 ty
= _mm_mul_pd(fscal
,dy11
);
1852 tz
= _mm_mul_pd(fscal
,dz11
);
1854 /* Update vectorial force */
1855 fix1
= _mm_add_pd(fix1
,tx
);
1856 fiy1
= _mm_add_pd(fiy1
,ty
);
1857 fiz1
= _mm_add_pd(fiz1
,tz
);
1859 fjx1
= _mm_add_pd(fjx1
,tx
);
1860 fjy1
= _mm_add_pd(fjy1
,ty
);
1861 fjz1
= _mm_add_pd(fjz1
,tz
);
1865 /**************************
1866 * CALCULATE INTERACTIONS *
1867 **************************/
1869 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1872 r12
= _mm_mul_pd(rsq12
,rinv12
);
1874 /* EWALD ELECTROSTATICS */
1876 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1877 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1878 ewitab
= _mm_cvttpd_epi32(ewrt
);
1879 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1880 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1882 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1883 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1885 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1889 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1891 /* Calculate temporary vectorial force */
1892 tx
= _mm_mul_pd(fscal
,dx12
);
1893 ty
= _mm_mul_pd(fscal
,dy12
);
1894 tz
= _mm_mul_pd(fscal
,dz12
);
1896 /* Update vectorial force */
1897 fix1
= _mm_add_pd(fix1
,tx
);
1898 fiy1
= _mm_add_pd(fiy1
,ty
);
1899 fiz1
= _mm_add_pd(fiz1
,tz
);
1901 fjx2
= _mm_add_pd(fjx2
,tx
);
1902 fjy2
= _mm_add_pd(fjy2
,ty
);
1903 fjz2
= _mm_add_pd(fjz2
,tz
);
1907 /**************************
1908 * CALCULATE INTERACTIONS *
1909 **************************/
1911 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1914 r20
= _mm_mul_pd(rsq20
,rinv20
);
1916 /* EWALD ELECTROSTATICS */
1918 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1919 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
1920 ewitab
= _mm_cvttpd_epi32(ewrt
);
1921 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1922 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1924 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1925 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
1927 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1931 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1933 /* Calculate temporary vectorial force */
1934 tx
= _mm_mul_pd(fscal
,dx20
);
1935 ty
= _mm_mul_pd(fscal
,dy20
);
1936 tz
= _mm_mul_pd(fscal
,dz20
);
1938 /* Update vectorial force */
1939 fix2
= _mm_add_pd(fix2
,tx
);
1940 fiy2
= _mm_add_pd(fiy2
,ty
);
1941 fiz2
= _mm_add_pd(fiz2
,tz
);
1943 fjx0
= _mm_add_pd(fjx0
,tx
);
1944 fjy0
= _mm_add_pd(fjy0
,ty
);
1945 fjz0
= _mm_add_pd(fjz0
,tz
);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1956 r21
= _mm_mul_pd(rsq21
,rinv21
);
1958 /* EWALD ELECTROSTATICS */
1960 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1961 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1962 ewitab
= _mm_cvttpd_epi32(ewrt
);
1963 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1964 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1966 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1967 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1969 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1973 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1975 /* Calculate temporary vectorial force */
1976 tx
= _mm_mul_pd(fscal
,dx21
);
1977 ty
= _mm_mul_pd(fscal
,dy21
);
1978 tz
= _mm_mul_pd(fscal
,dz21
);
1980 /* Update vectorial force */
1981 fix2
= _mm_add_pd(fix2
,tx
);
1982 fiy2
= _mm_add_pd(fiy2
,ty
);
1983 fiz2
= _mm_add_pd(fiz2
,tz
);
1985 fjx1
= _mm_add_pd(fjx1
,tx
);
1986 fjy1
= _mm_add_pd(fjy1
,ty
);
1987 fjz1
= _mm_add_pd(fjz1
,tz
);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1998 r22
= _mm_mul_pd(rsq22
,rinv22
);
2000 /* EWALD ELECTROSTATICS */
2002 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2003 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2004 ewitab
= _mm_cvttpd_epi32(ewrt
);
2005 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2006 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
2008 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2009 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2011 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2015 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2017 /* Calculate temporary vectorial force */
2018 tx
= _mm_mul_pd(fscal
,dx22
);
2019 ty
= _mm_mul_pd(fscal
,dy22
);
2020 tz
= _mm_mul_pd(fscal
,dz22
);
2022 /* Update vectorial force */
2023 fix2
= _mm_add_pd(fix2
,tx
);
2024 fiy2
= _mm_add_pd(fiy2
,ty
);
2025 fiz2
= _mm_add_pd(fiz2
,tz
);
2027 fjx2
= _mm_add_pd(fjx2
,tx
);
2028 fjy2
= _mm_add_pd(fjy2
,ty
);
2029 fjz2
= _mm_add_pd(fjz2
,tz
);
2033 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2035 /* Inner loop uses 358 flops */
2038 if(jidx
<j_index_end
)
2042 j_coord_offsetA
= DIM
*jnrA
;
2044 /* load j atom coordinates */
2045 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
2046 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
2048 /* Calculate displacement vector */
2049 dx00
= _mm_sub_pd(ix0
,jx0
);
2050 dy00
= _mm_sub_pd(iy0
,jy0
);
2051 dz00
= _mm_sub_pd(iz0
,jz0
);
2052 dx01
= _mm_sub_pd(ix0
,jx1
);
2053 dy01
= _mm_sub_pd(iy0
,jy1
);
2054 dz01
= _mm_sub_pd(iz0
,jz1
);
2055 dx02
= _mm_sub_pd(ix0
,jx2
);
2056 dy02
= _mm_sub_pd(iy0
,jy2
);
2057 dz02
= _mm_sub_pd(iz0
,jz2
);
2058 dx10
= _mm_sub_pd(ix1
,jx0
);
2059 dy10
= _mm_sub_pd(iy1
,jy0
);
2060 dz10
= _mm_sub_pd(iz1
,jz0
);
2061 dx11
= _mm_sub_pd(ix1
,jx1
);
2062 dy11
= _mm_sub_pd(iy1
,jy1
);
2063 dz11
= _mm_sub_pd(iz1
,jz1
);
2064 dx12
= _mm_sub_pd(ix1
,jx2
);
2065 dy12
= _mm_sub_pd(iy1
,jy2
);
2066 dz12
= _mm_sub_pd(iz1
,jz2
);
2067 dx20
= _mm_sub_pd(ix2
,jx0
);
2068 dy20
= _mm_sub_pd(iy2
,jy0
);
2069 dz20
= _mm_sub_pd(iz2
,jz0
);
2070 dx21
= _mm_sub_pd(ix2
,jx1
);
2071 dy21
= _mm_sub_pd(iy2
,jy1
);
2072 dz21
= _mm_sub_pd(iz2
,jz1
);
2073 dx22
= _mm_sub_pd(ix2
,jx2
);
2074 dy22
= _mm_sub_pd(iy2
,jy2
);
2075 dz22
= _mm_sub_pd(iz2
,jz2
);
2077 /* Calculate squared distance and things based on it */
2078 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
2079 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
2080 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
2081 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
2082 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
2083 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
2084 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
2085 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
2086 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
2088 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
2089 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
2090 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
2091 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
2092 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
2093 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
2094 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
2095 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
2096 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
2098 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
2099 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
2100 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
2101 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
2102 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
2103 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
2104 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
2105 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
2106 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
2108 fjx0
= _mm_setzero_pd();
2109 fjy0
= _mm_setzero_pd();
2110 fjz0
= _mm_setzero_pd();
2111 fjx1
= _mm_setzero_pd();
2112 fjy1
= _mm_setzero_pd();
2113 fjz1
= _mm_setzero_pd();
2114 fjx2
= _mm_setzero_pd();
2115 fjy2
= _mm_setzero_pd();
2116 fjz2
= _mm_setzero_pd();
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
2125 r00
= _mm_mul_pd(rsq00
,rinv00
);
2127 /* EWALD ELECTROSTATICS */
2129 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2130 ewrt
= _mm_mul_pd(r00
,ewtabscale
);
2131 ewitab
= _mm_cvttpd_epi32(ewrt
);
2132 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2133 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2134 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2135 felec
= _mm_mul_pd(_mm_mul_pd(qq00
,rinv00
),_mm_sub_pd(rinvsq00
,felec
));
2137 /* LENNARD-JONES DISPERSION/REPULSION */
2139 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2140 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
2142 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
2144 fscal
= _mm_add_pd(felec
,fvdw
);
2146 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2148 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2150 /* Calculate temporary vectorial force */
2151 tx
= _mm_mul_pd(fscal
,dx00
);
2152 ty
= _mm_mul_pd(fscal
,dy00
);
2153 tz
= _mm_mul_pd(fscal
,dz00
);
2155 /* Update vectorial force */
2156 fix0
= _mm_add_pd(fix0
,tx
);
2157 fiy0
= _mm_add_pd(fiy0
,ty
);
2158 fiz0
= _mm_add_pd(fiz0
,tz
);
2160 fjx0
= _mm_add_pd(fjx0
,tx
);
2161 fjy0
= _mm_add_pd(fjy0
,ty
);
2162 fjz0
= _mm_add_pd(fjz0
,tz
);
2166 /**************************
2167 * CALCULATE INTERACTIONS *
2168 **************************/
2170 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
2173 r01
= _mm_mul_pd(rsq01
,rinv01
);
2175 /* EWALD ELECTROSTATICS */
2177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2178 ewrt
= _mm_mul_pd(r01
,ewtabscale
);
2179 ewitab
= _mm_cvttpd_epi32(ewrt
);
2180 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2181 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2182 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2183 felec
= _mm_mul_pd(_mm_mul_pd(qq01
,rinv01
),_mm_sub_pd(rinvsq01
,felec
));
2185 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
2189 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2191 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2193 /* Calculate temporary vectorial force */
2194 tx
= _mm_mul_pd(fscal
,dx01
);
2195 ty
= _mm_mul_pd(fscal
,dy01
);
2196 tz
= _mm_mul_pd(fscal
,dz01
);
2198 /* Update vectorial force */
2199 fix0
= _mm_add_pd(fix0
,tx
);
2200 fiy0
= _mm_add_pd(fiy0
,ty
);
2201 fiz0
= _mm_add_pd(fiz0
,tz
);
2203 fjx1
= _mm_add_pd(fjx1
,tx
);
2204 fjy1
= _mm_add_pd(fjy1
,ty
);
2205 fjz1
= _mm_add_pd(fjz1
,tz
);
2209 /**************************
2210 * CALCULATE INTERACTIONS *
2211 **************************/
2213 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
2216 r02
= _mm_mul_pd(rsq02
,rinv02
);
2218 /* EWALD ELECTROSTATICS */
2220 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2221 ewrt
= _mm_mul_pd(r02
,ewtabscale
);
2222 ewitab
= _mm_cvttpd_epi32(ewrt
);
2223 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2224 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2225 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2226 felec
= _mm_mul_pd(_mm_mul_pd(qq02
,rinv02
),_mm_sub_pd(rinvsq02
,felec
));
2228 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
2232 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2234 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2236 /* Calculate temporary vectorial force */
2237 tx
= _mm_mul_pd(fscal
,dx02
);
2238 ty
= _mm_mul_pd(fscal
,dy02
);
2239 tz
= _mm_mul_pd(fscal
,dz02
);
2241 /* Update vectorial force */
2242 fix0
= _mm_add_pd(fix0
,tx
);
2243 fiy0
= _mm_add_pd(fiy0
,ty
);
2244 fiz0
= _mm_add_pd(fiz0
,tz
);
2246 fjx2
= _mm_add_pd(fjx2
,tx
);
2247 fjy2
= _mm_add_pd(fjy2
,ty
);
2248 fjz2
= _mm_add_pd(fjz2
,tz
);
2252 /**************************
2253 * CALCULATE INTERACTIONS *
2254 **************************/
2256 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
2259 r10
= _mm_mul_pd(rsq10
,rinv10
);
2261 /* EWALD ELECTROSTATICS */
2263 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2264 ewrt
= _mm_mul_pd(r10
,ewtabscale
);
2265 ewitab
= _mm_cvttpd_epi32(ewrt
);
2266 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2267 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2268 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2269 felec
= _mm_mul_pd(_mm_mul_pd(qq10
,rinv10
),_mm_sub_pd(rinvsq10
,felec
));
2271 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
2275 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2277 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2279 /* Calculate temporary vectorial force */
2280 tx
= _mm_mul_pd(fscal
,dx10
);
2281 ty
= _mm_mul_pd(fscal
,dy10
);
2282 tz
= _mm_mul_pd(fscal
,dz10
);
2284 /* Update vectorial force */
2285 fix1
= _mm_add_pd(fix1
,tx
);
2286 fiy1
= _mm_add_pd(fiy1
,ty
);
2287 fiz1
= _mm_add_pd(fiz1
,tz
);
2289 fjx0
= _mm_add_pd(fjx0
,tx
);
2290 fjy0
= _mm_add_pd(fjy0
,ty
);
2291 fjz0
= _mm_add_pd(fjz0
,tz
);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
2302 r11
= _mm_mul_pd(rsq11
,rinv11
);
2304 /* EWALD ELECTROSTATICS */
2306 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2307 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
2308 ewitab
= _mm_cvttpd_epi32(ewrt
);
2309 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2310 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2311 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2312 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
2314 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
2318 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2320 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2322 /* Calculate temporary vectorial force */
2323 tx
= _mm_mul_pd(fscal
,dx11
);
2324 ty
= _mm_mul_pd(fscal
,dy11
);
2325 tz
= _mm_mul_pd(fscal
,dz11
);
2327 /* Update vectorial force */
2328 fix1
= _mm_add_pd(fix1
,tx
);
2329 fiy1
= _mm_add_pd(fiy1
,ty
);
2330 fiz1
= _mm_add_pd(fiz1
,tz
);
2332 fjx1
= _mm_add_pd(fjx1
,tx
);
2333 fjy1
= _mm_add_pd(fjy1
,ty
);
2334 fjz1
= _mm_add_pd(fjz1
,tz
);
2338 /**************************
2339 * CALCULATE INTERACTIONS *
2340 **************************/
2342 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
2345 r12
= _mm_mul_pd(rsq12
,rinv12
);
2347 /* EWALD ELECTROSTATICS */
2349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2350 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2351 ewitab
= _mm_cvttpd_epi32(ewrt
);
2352 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2353 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2354 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2355 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2357 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
2361 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2363 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2365 /* Calculate temporary vectorial force */
2366 tx
= _mm_mul_pd(fscal
,dx12
);
2367 ty
= _mm_mul_pd(fscal
,dy12
);
2368 tz
= _mm_mul_pd(fscal
,dz12
);
2370 /* Update vectorial force */
2371 fix1
= _mm_add_pd(fix1
,tx
);
2372 fiy1
= _mm_add_pd(fiy1
,ty
);
2373 fiz1
= _mm_add_pd(fiz1
,tz
);
2375 fjx2
= _mm_add_pd(fjx2
,tx
);
2376 fjy2
= _mm_add_pd(fjy2
,ty
);
2377 fjz2
= _mm_add_pd(fjz2
,tz
);
2381 /**************************
2382 * CALCULATE INTERACTIONS *
2383 **************************/
2385 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
2388 r20
= _mm_mul_pd(rsq20
,rinv20
);
2390 /* EWALD ELECTROSTATICS */
2392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2393 ewrt
= _mm_mul_pd(r20
,ewtabscale
);
2394 ewitab
= _mm_cvttpd_epi32(ewrt
);
2395 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2396 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2397 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2398 felec
= _mm_mul_pd(_mm_mul_pd(qq20
,rinv20
),_mm_sub_pd(rinvsq20
,felec
));
2400 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
2404 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2406 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2408 /* Calculate temporary vectorial force */
2409 tx
= _mm_mul_pd(fscal
,dx20
);
2410 ty
= _mm_mul_pd(fscal
,dy20
);
2411 tz
= _mm_mul_pd(fscal
,dz20
);
2413 /* Update vectorial force */
2414 fix2
= _mm_add_pd(fix2
,tx
);
2415 fiy2
= _mm_add_pd(fiy2
,ty
);
2416 fiz2
= _mm_add_pd(fiz2
,tz
);
2418 fjx0
= _mm_add_pd(fjx0
,tx
);
2419 fjy0
= _mm_add_pd(fjy0
,ty
);
2420 fjz0
= _mm_add_pd(fjz0
,tz
);
2424 /**************************
2425 * CALCULATE INTERACTIONS *
2426 **************************/
2428 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
2431 r21
= _mm_mul_pd(rsq21
,rinv21
);
2433 /* EWALD ELECTROSTATICS */
2435 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2436 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2437 ewitab
= _mm_cvttpd_epi32(ewrt
);
2438 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2439 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2440 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2441 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2443 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
2447 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2449 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2451 /* Calculate temporary vectorial force */
2452 tx
= _mm_mul_pd(fscal
,dx21
);
2453 ty
= _mm_mul_pd(fscal
,dy21
);
2454 tz
= _mm_mul_pd(fscal
,dz21
);
2456 /* Update vectorial force */
2457 fix2
= _mm_add_pd(fix2
,tx
);
2458 fiy2
= _mm_add_pd(fiy2
,ty
);
2459 fiz2
= _mm_add_pd(fiz2
,tz
);
2461 fjx1
= _mm_add_pd(fjx1
,tx
);
2462 fjy1
= _mm_add_pd(fjy1
,ty
);
2463 fjz1
= _mm_add_pd(fjz1
,tz
);
2467 /**************************
2468 * CALCULATE INTERACTIONS *
2469 **************************/
2471 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
2474 r22
= _mm_mul_pd(rsq22
,rinv22
);
2476 /* EWALD ELECTROSTATICS */
2478 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2479 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2480 ewitab
= _mm_cvttpd_epi32(ewrt
);
2481 eweps
= _mm_sub_pd(ewrt
,_mm_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2482 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2483 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2484 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2486 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
2490 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
2492 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2494 /* Calculate temporary vectorial force */
2495 tx
= _mm_mul_pd(fscal
,dx22
);
2496 ty
= _mm_mul_pd(fscal
,dy22
);
2497 tz
= _mm_mul_pd(fscal
,dz22
);
2499 /* Update vectorial force */
2500 fix2
= _mm_add_pd(fix2
,tx
);
2501 fiy2
= _mm_add_pd(fiy2
,ty
);
2502 fiz2
= _mm_add_pd(fiz2
,tz
);
2504 fjx2
= _mm_add_pd(fjx2
,tx
);
2505 fjy2
= _mm_add_pd(fjy2
,ty
);
2506 fjz2
= _mm_add_pd(fjz2
,tz
);
2510 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2512 /* Inner loop uses 358 flops */
2515 /* End of innermost loop */
2517 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2518 f
+i_coord_offset
,fshift
+i_shift_offset
);
2520 /* Increment number of inner iterations */
2521 inneriter
+= j_index_end
- j_index_start
;
2523 /* Outer loop uses 18 flops */
2526 /* Increment number of outer iterations */
2529 /* Update outer/inner flops */
2531 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*358);