2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
52 * Electrostatics interaction: ReactionField
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the two positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int j_coord_offsetA
,j_coord_offsetB
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
81 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
83 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
85 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 int vdwjidx0A
,vdwjidx0B
;
87 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
88 int vdwjidx1A
,vdwjidx1B
;
89 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
90 int vdwjidx2A
,vdwjidx2B
;
91 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
92 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
93 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
94 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
95 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
96 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
97 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
98 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
99 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
100 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
101 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
104 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
107 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
110 __m128i ifour
= _mm_set1_epi32(4);
111 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
,twovfeps
;
113 __m128d dummy_mask
,cutoff_mask
;
114 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one
= _mm_set1_pd(1.0);
116 __m128d two
= _mm_set1_pd(2.0);
122 jindex
= nlist
->jindex
;
124 shiftidx
= nlist
->shift
;
126 shiftvec
= fr
->shift_vec
[0];
127 fshift
= fr
->fshift
[0];
128 facel
= _mm_set1_pd(fr
->epsfac
);
129 charge
= mdatoms
->chargeA
;
130 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
131 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
132 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
133 nvdwtype
= fr
->ntype
;
135 vdwtype
= mdatoms
->typeA
;
137 vftab
= kernel_data
->table_vdw
->data
;
138 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
140 /* Setup water-specific parameters */
141 inr
= nlist
->iinr
[0];
142 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
143 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
144 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
145 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
147 jq0
= _mm_set1_pd(charge
[inr
+0]);
148 jq1
= _mm_set1_pd(charge
[inr
+1]);
149 jq2
= _mm_set1_pd(charge
[inr
+2]);
150 vdwjidx0A
= 2*vdwtype
[inr
+0];
151 qq00
= _mm_mul_pd(iq0
,jq0
);
152 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
153 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
154 qq01
= _mm_mul_pd(iq0
,jq1
);
155 qq02
= _mm_mul_pd(iq0
,jq2
);
156 qq10
= _mm_mul_pd(iq1
,jq0
);
157 qq11
= _mm_mul_pd(iq1
,jq1
);
158 qq12
= _mm_mul_pd(iq1
,jq2
);
159 qq20
= _mm_mul_pd(iq2
,jq0
);
160 qq21
= _mm_mul_pd(iq2
,jq1
);
161 qq22
= _mm_mul_pd(iq2
,jq2
);
163 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
164 rcutoff_scalar
= fr
->rcoulomb
;
165 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
166 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
168 /* Avoid stupid compiler warnings */
176 /* Start outer loop over neighborlists */
177 for(iidx
=0; iidx
<nri
; iidx
++)
179 /* Load shift vector for this list */
180 i_shift_offset
= DIM
*shiftidx
[iidx
];
182 /* Load limits for loop over neighbors */
183 j_index_start
= jindex
[iidx
];
184 j_index_end
= jindex
[iidx
+1];
186 /* Get outer coordinate index */
188 i_coord_offset
= DIM
*inr
;
190 /* Load i particle coords and add shift vector */
191 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
192 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
194 fix0
= _mm_setzero_pd();
195 fiy0
= _mm_setzero_pd();
196 fiz0
= _mm_setzero_pd();
197 fix1
= _mm_setzero_pd();
198 fiy1
= _mm_setzero_pd();
199 fiz1
= _mm_setzero_pd();
200 fix2
= _mm_setzero_pd();
201 fiy2
= _mm_setzero_pd();
202 fiz2
= _mm_setzero_pd();
204 /* Reset potential sums */
205 velecsum
= _mm_setzero_pd();
206 vvdwsum
= _mm_setzero_pd();
208 /* Start inner kernel loop */
209 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
212 /* Get j neighbor index, and coordinate index */
215 j_coord_offsetA
= DIM
*jnrA
;
216 j_coord_offsetB
= DIM
*jnrB
;
218 /* load j atom coordinates */
219 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
220 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
222 /* Calculate displacement vector */
223 dx00
= _mm_sub_pd(ix0
,jx0
);
224 dy00
= _mm_sub_pd(iy0
,jy0
);
225 dz00
= _mm_sub_pd(iz0
,jz0
);
226 dx01
= _mm_sub_pd(ix0
,jx1
);
227 dy01
= _mm_sub_pd(iy0
,jy1
);
228 dz01
= _mm_sub_pd(iz0
,jz1
);
229 dx02
= _mm_sub_pd(ix0
,jx2
);
230 dy02
= _mm_sub_pd(iy0
,jy2
);
231 dz02
= _mm_sub_pd(iz0
,jz2
);
232 dx10
= _mm_sub_pd(ix1
,jx0
);
233 dy10
= _mm_sub_pd(iy1
,jy0
);
234 dz10
= _mm_sub_pd(iz1
,jz0
);
235 dx11
= _mm_sub_pd(ix1
,jx1
);
236 dy11
= _mm_sub_pd(iy1
,jy1
);
237 dz11
= _mm_sub_pd(iz1
,jz1
);
238 dx12
= _mm_sub_pd(ix1
,jx2
);
239 dy12
= _mm_sub_pd(iy1
,jy2
);
240 dz12
= _mm_sub_pd(iz1
,jz2
);
241 dx20
= _mm_sub_pd(ix2
,jx0
);
242 dy20
= _mm_sub_pd(iy2
,jy0
);
243 dz20
= _mm_sub_pd(iz2
,jz0
);
244 dx21
= _mm_sub_pd(ix2
,jx1
);
245 dy21
= _mm_sub_pd(iy2
,jy1
);
246 dz21
= _mm_sub_pd(iz2
,jz1
);
247 dx22
= _mm_sub_pd(ix2
,jx2
);
248 dy22
= _mm_sub_pd(iy2
,jy2
);
249 dz22
= _mm_sub_pd(iz2
,jz2
);
251 /* Calculate squared distance and things based on it */
252 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
253 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
254 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
255 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
256 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
257 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
258 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
259 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
260 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
262 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
263 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
264 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
265 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
266 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
267 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
268 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
269 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
270 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
272 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
273 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
274 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
275 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
276 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
277 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
278 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
279 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
280 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
282 fjx0
= _mm_setzero_pd();
283 fjy0
= _mm_setzero_pd();
284 fjz0
= _mm_setzero_pd();
285 fjx1
= _mm_setzero_pd();
286 fjy1
= _mm_setzero_pd();
287 fjz1
= _mm_setzero_pd();
288 fjx2
= _mm_setzero_pd();
289 fjy2
= _mm_setzero_pd();
290 fjz2
= _mm_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
299 r00
= _mm_mul_pd(rsq00
,rinv00
);
301 /* Calculate table index by multiplying r with table scale and truncate to integer */
302 rt
= _mm_mul_pd(r00
,vftabscale
);
303 vfitab
= _mm_cvttpd_epi32(rt
);
305 vfeps
= _mm_frcz_pd(rt
);
307 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
309 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
310 vfitab
= _mm_slli_epi32(vfitab
,3);
312 /* REACTION-FIELD ELECTROSTATICS */
313 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_macc_pd(krf
,rsq00
,rinv00
),crf
));
314 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
316 /* CUBIC SPLINE TABLE DISPERSION */
317 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
318 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
319 GMX_MM_TRANSPOSE2_PD(Y
,F
);
320 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
321 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
322 GMX_MM_TRANSPOSE2_PD(G
,H
);
323 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
324 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
325 vvdw6
= _mm_mul_pd(c6_00
,VV
);
326 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
327 fvdw6
= _mm_mul_pd(c6_00
,FF
);
329 /* CUBIC SPLINE TABLE REPULSION */
330 vfitab
= _mm_add_epi32(vfitab
,ifour
);
331 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
332 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
333 GMX_MM_TRANSPOSE2_PD(Y
,F
);
334 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
335 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
336 GMX_MM_TRANSPOSE2_PD(G
,H
);
337 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
338 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
339 vvdw12
= _mm_mul_pd(c12_00
,VV
);
340 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
341 fvdw12
= _mm_mul_pd(c12_00
,FF
);
342 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
343 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
345 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
347 /* Update potential sum for this i atom from the interaction with this j atom. */
348 velec
= _mm_and_pd(velec
,cutoff_mask
);
349 velecsum
= _mm_add_pd(velecsum
,velec
);
350 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
351 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
353 fscal
= _mm_add_pd(felec
,fvdw
);
355 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
357 /* Update vectorial force */
358 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
359 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
360 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
362 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
363 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
364 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
375 /* REACTION-FIELD ELECTROSTATICS */
376 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_macc_pd(krf
,rsq01
,rinv01
),crf
));
377 felec
= _mm_mul_pd(qq01
,_mm_msub_pd(rinv01
,rinvsq01
,krf2
));
379 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velec
= _mm_and_pd(velec
,cutoff_mask
);
383 velecsum
= _mm_add_pd(velecsum
,velec
);
387 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
389 /* Update vectorial force */
390 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
391 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
392 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
394 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
395 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
396 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_macc_pd(krf
,rsq02
,rinv02
),crf
));
409 felec
= _mm_mul_pd(qq02
,_mm_msub_pd(rinv02
,rinvsq02
,krf2
));
411 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec
= _mm_and_pd(velec
,cutoff_mask
);
415 velecsum
= _mm_add_pd(velecsum
,velec
);
419 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
421 /* Update vectorial force */
422 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
423 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
424 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
426 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
427 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
428 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
432 /**************************
433 * CALCULATE INTERACTIONS *
434 **************************/
436 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
439 /* REACTION-FIELD ELECTROSTATICS */
440 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_macc_pd(krf
,rsq10
,rinv10
),crf
));
441 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
443 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
445 /* Update potential sum for this i atom from the interaction with this j atom. */
446 velec
= _mm_and_pd(velec
,cutoff_mask
);
447 velecsum
= _mm_add_pd(velecsum
,velec
);
451 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
453 /* Update vectorial force */
454 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
455 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
456 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
458 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
459 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
460 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
471 /* REACTION-FIELD ELECTROSTATICS */
472 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_macc_pd(krf
,rsq11
,rinv11
),crf
));
473 felec
= _mm_mul_pd(qq11
,_mm_msub_pd(rinv11
,rinvsq11
,krf2
));
475 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
477 /* Update potential sum for this i atom from the interaction with this j atom. */
478 velec
= _mm_and_pd(velec
,cutoff_mask
);
479 velecsum
= _mm_add_pd(velecsum
,velec
);
483 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
485 /* Update vectorial force */
486 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
487 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
488 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
490 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
491 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
492 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
496 /**************************
497 * CALCULATE INTERACTIONS *
498 **************************/
500 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
503 /* REACTION-FIELD ELECTROSTATICS */
504 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_macc_pd(krf
,rsq12
,rinv12
),crf
));
505 felec
= _mm_mul_pd(qq12
,_mm_msub_pd(rinv12
,rinvsq12
,krf2
));
507 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velec
= _mm_and_pd(velec
,cutoff_mask
);
511 velecsum
= _mm_add_pd(velecsum
,velec
);
515 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
517 /* Update vectorial force */
518 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
519 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
520 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
522 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
523 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
524 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
528 /**************************
529 * CALCULATE INTERACTIONS *
530 **************************/
532 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
535 /* REACTION-FIELD ELECTROSTATICS */
536 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_macc_pd(krf
,rsq20
,rinv20
),crf
));
537 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
539 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
541 /* Update potential sum for this i atom from the interaction with this j atom. */
542 velec
= _mm_and_pd(velec
,cutoff_mask
);
543 velecsum
= _mm_add_pd(velecsum
,velec
);
547 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
549 /* Update vectorial force */
550 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
551 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
552 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
554 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
555 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
556 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
560 /**************************
561 * CALCULATE INTERACTIONS *
562 **************************/
564 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
567 /* REACTION-FIELD ELECTROSTATICS */
568 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_macc_pd(krf
,rsq21
,rinv21
),crf
));
569 felec
= _mm_mul_pd(qq21
,_mm_msub_pd(rinv21
,rinvsq21
,krf2
));
571 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
573 /* Update potential sum for this i atom from the interaction with this j atom. */
574 velec
= _mm_and_pd(velec
,cutoff_mask
);
575 velecsum
= _mm_add_pd(velecsum
,velec
);
579 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
581 /* Update vectorial force */
582 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
583 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
584 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
586 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
587 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
588 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
592 /**************************
593 * CALCULATE INTERACTIONS *
594 **************************/
596 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
599 /* REACTION-FIELD ELECTROSTATICS */
600 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_macc_pd(krf
,rsq22
,rinv22
),crf
));
601 felec
= _mm_mul_pd(qq22
,_mm_msub_pd(rinv22
,rinvsq22
,krf2
));
603 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
605 /* Update potential sum for this i atom from the interaction with this j atom. */
606 velec
= _mm_and_pd(velec
,cutoff_mask
);
607 velecsum
= _mm_add_pd(velecsum
,velec
);
611 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
613 /* Update vectorial force */
614 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
615 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
616 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
618 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
619 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
620 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
624 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
626 /* Inner loop uses 387 flops */
633 j_coord_offsetA
= DIM
*jnrA
;
635 /* load j atom coordinates */
636 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
637 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
639 /* Calculate displacement vector */
640 dx00
= _mm_sub_pd(ix0
,jx0
);
641 dy00
= _mm_sub_pd(iy0
,jy0
);
642 dz00
= _mm_sub_pd(iz0
,jz0
);
643 dx01
= _mm_sub_pd(ix0
,jx1
);
644 dy01
= _mm_sub_pd(iy0
,jy1
);
645 dz01
= _mm_sub_pd(iz0
,jz1
);
646 dx02
= _mm_sub_pd(ix0
,jx2
);
647 dy02
= _mm_sub_pd(iy0
,jy2
);
648 dz02
= _mm_sub_pd(iz0
,jz2
);
649 dx10
= _mm_sub_pd(ix1
,jx0
);
650 dy10
= _mm_sub_pd(iy1
,jy0
);
651 dz10
= _mm_sub_pd(iz1
,jz0
);
652 dx11
= _mm_sub_pd(ix1
,jx1
);
653 dy11
= _mm_sub_pd(iy1
,jy1
);
654 dz11
= _mm_sub_pd(iz1
,jz1
);
655 dx12
= _mm_sub_pd(ix1
,jx2
);
656 dy12
= _mm_sub_pd(iy1
,jy2
);
657 dz12
= _mm_sub_pd(iz1
,jz2
);
658 dx20
= _mm_sub_pd(ix2
,jx0
);
659 dy20
= _mm_sub_pd(iy2
,jy0
);
660 dz20
= _mm_sub_pd(iz2
,jz0
);
661 dx21
= _mm_sub_pd(ix2
,jx1
);
662 dy21
= _mm_sub_pd(iy2
,jy1
);
663 dz21
= _mm_sub_pd(iz2
,jz1
);
664 dx22
= _mm_sub_pd(ix2
,jx2
);
665 dy22
= _mm_sub_pd(iy2
,jy2
);
666 dz22
= _mm_sub_pd(iz2
,jz2
);
668 /* Calculate squared distance and things based on it */
669 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
670 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
671 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
672 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
673 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
674 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
675 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
676 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
677 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
679 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
680 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
681 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
682 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
683 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
684 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
685 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
686 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
687 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
689 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
690 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
691 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
692 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
693 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
694 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
695 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
696 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
697 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
699 fjx0
= _mm_setzero_pd();
700 fjy0
= _mm_setzero_pd();
701 fjz0
= _mm_setzero_pd();
702 fjx1
= _mm_setzero_pd();
703 fjy1
= _mm_setzero_pd();
704 fjz1
= _mm_setzero_pd();
705 fjx2
= _mm_setzero_pd();
706 fjy2
= _mm_setzero_pd();
707 fjz2
= _mm_setzero_pd();
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
716 r00
= _mm_mul_pd(rsq00
,rinv00
);
718 /* Calculate table index by multiplying r with table scale and truncate to integer */
719 rt
= _mm_mul_pd(r00
,vftabscale
);
720 vfitab
= _mm_cvttpd_epi32(rt
);
722 vfeps
= _mm_frcz_pd(rt
);
724 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
726 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
727 vfitab
= _mm_slli_epi32(vfitab
,3);
729 /* REACTION-FIELD ELECTROSTATICS */
730 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_macc_pd(krf
,rsq00
,rinv00
),crf
));
731 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
733 /* CUBIC SPLINE TABLE DISPERSION */
734 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
735 F
= _mm_setzero_pd();
736 GMX_MM_TRANSPOSE2_PD(Y
,F
);
737 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
738 H
= _mm_setzero_pd();
739 GMX_MM_TRANSPOSE2_PD(G
,H
);
740 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
741 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
742 vvdw6
= _mm_mul_pd(c6_00
,VV
);
743 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
744 fvdw6
= _mm_mul_pd(c6_00
,FF
);
746 /* CUBIC SPLINE TABLE REPULSION */
747 vfitab
= _mm_add_epi32(vfitab
,ifour
);
748 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
749 F
= _mm_setzero_pd();
750 GMX_MM_TRANSPOSE2_PD(Y
,F
);
751 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
752 H
= _mm_setzero_pd();
753 GMX_MM_TRANSPOSE2_PD(G
,H
);
754 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
755 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
756 vvdw12
= _mm_mul_pd(c12_00
,VV
);
757 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
758 fvdw12
= _mm_mul_pd(c12_00
,FF
);
759 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
760 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
762 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
764 /* Update potential sum for this i atom from the interaction with this j atom. */
765 velec
= _mm_and_pd(velec
,cutoff_mask
);
766 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
767 velecsum
= _mm_add_pd(velecsum
,velec
);
768 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
769 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
770 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
772 fscal
= _mm_add_pd(felec
,fvdw
);
774 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
776 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
778 /* Update vectorial force */
779 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
780 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
781 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
783 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
784 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
785 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
789 /**************************
790 * CALCULATE INTERACTIONS *
791 **************************/
793 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
796 /* REACTION-FIELD ELECTROSTATICS */
797 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_macc_pd(krf
,rsq01
,rinv01
),crf
));
798 felec
= _mm_mul_pd(qq01
,_mm_msub_pd(rinv01
,rinvsq01
,krf2
));
800 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
802 /* Update potential sum for this i atom from the interaction with this j atom. */
803 velec
= _mm_and_pd(velec
,cutoff_mask
);
804 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
805 velecsum
= _mm_add_pd(velecsum
,velec
);
809 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
811 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
813 /* Update vectorial force */
814 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
815 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
816 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
818 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
819 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
820 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
831 /* REACTION-FIELD ELECTROSTATICS */
832 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_macc_pd(krf
,rsq02
,rinv02
),crf
));
833 felec
= _mm_mul_pd(qq02
,_mm_msub_pd(rinv02
,rinvsq02
,krf2
));
835 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 velec
= _mm_and_pd(velec
,cutoff_mask
);
839 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
840 velecsum
= _mm_add_pd(velecsum
,velec
);
844 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
846 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
848 /* Update vectorial force */
849 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
850 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
851 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
853 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
854 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
855 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
863 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
866 /* REACTION-FIELD ELECTROSTATICS */
867 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_macc_pd(krf
,rsq10
,rinv10
),crf
));
868 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
870 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
872 /* Update potential sum for this i atom from the interaction with this j atom. */
873 velec
= _mm_and_pd(velec
,cutoff_mask
);
874 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
875 velecsum
= _mm_add_pd(velecsum
,velec
);
879 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
881 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
883 /* Update vectorial force */
884 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
885 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
886 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
888 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
889 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
890 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
894 /**************************
895 * CALCULATE INTERACTIONS *
896 **************************/
898 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
901 /* REACTION-FIELD ELECTROSTATICS */
902 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_macc_pd(krf
,rsq11
,rinv11
),crf
));
903 felec
= _mm_mul_pd(qq11
,_mm_msub_pd(rinv11
,rinvsq11
,krf2
));
905 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec
= _mm_and_pd(velec
,cutoff_mask
);
909 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
910 velecsum
= _mm_add_pd(velecsum
,velec
);
914 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
916 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
918 /* Update vectorial force */
919 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
920 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
921 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
923 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
924 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
925 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
936 /* REACTION-FIELD ELECTROSTATICS */
937 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_macc_pd(krf
,rsq12
,rinv12
),crf
));
938 felec
= _mm_mul_pd(qq12
,_mm_msub_pd(rinv12
,rinvsq12
,krf2
));
940 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
942 /* Update potential sum for this i atom from the interaction with this j atom. */
943 velec
= _mm_and_pd(velec
,cutoff_mask
);
944 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
945 velecsum
= _mm_add_pd(velecsum
,velec
);
949 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
951 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
953 /* Update vectorial force */
954 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
955 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
956 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
958 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
959 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
960 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
964 /**************************
965 * CALCULATE INTERACTIONS *
966 **************************/
968 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
971 /* REACTION-FIELD ELECTROSTATICS */
972 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_macc_pd(krf
,rsq20
,rinv20
),crf
));
973 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
975 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
977 /* Update potential sum for this i atom from the interaction with this j atom. */
978 velec
= _mm_and_pd(velec
,cutoff_mask
);
979 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
980 velecsum
= _mm_add_pd(velecsum
,velec
);
984 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
986 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
988 /* Update vectorial force */
989 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
990 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
991 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
993 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
994 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
995 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
999 /**************************
1000 * CALCULATE INTERACTIONS *
1001 **************************/
1003 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1006 /* REACTION-FIELD ELECTROSTATICS */
1007 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_macc_pd(krf
,rsq21
,rinv21
),crf
));
1008 felec
= _mm_mul_pd(qq21
,_mm_msub_pd(rinv21
,rinvsq21
,krf2
));
1010 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1012 /* Update potential sum for this i atom from the interaction with this j atom. */
1013 velec
= _mm_and_pd(velec
,cutoff_mask
);
1014 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1015 velecsum
= _mm_add_pd(velecsum
,velec
);
1019 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1021 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1023 /* Update vectorial force */
1024 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
1025 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
1026 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
1028 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
1029 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
1030 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1041 /* REACTION-FIELD ELECTROSTATICS */
1042 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_macc_pd(krf
,rsq22
,rinv22
),crf
));
1043 felec
= _mm_mul_pd(qq22
,_mm_msub_pd(rinv22
,rinvsq22
,krf2
));
1045 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1047 /* Update potential sum for this i atom from the interaction with this j atom. */
1048 velec
= _mm_and_pd(velec
,cutoff_mask
);
1049 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1050 velecsum
= _mm_add_pd(velecsum
,velec
);
1054 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1056 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1058 /* Update vectorial force */
1059 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
1060 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
1061 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
1063 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
1064 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
1065 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
1069 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1071 /* Inner loop uses 387 flops */
1074 /* End of innermost loop */
1076 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1077 f
+i_coord_offset
,fshift
+i_shift_offset
);
1080 /* Update potential energies */
1081 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1082 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1084 /* Increment number of inner iterations */
1085 inneriter
+= j_index_end
- j_index_start
;
1087 /* Outer loop uses 20 flops */
1090 /* Increment number of outer iterations */
1093 /* Update outer/inner flops */
1095 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*387);
1098 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1099 * Electrostatics interaction: ReactionField
1100 * VdW interaction: CubicSplineTable
1101 * Geometry: Water3-Water3
1102 * Calculate force/pot: Force
1105 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1106 (t_nblist
* gmx_restrict nlist
,
1107 rvec
* gmx_restrict xx
,
1108 rvec
* gmx_restrict ff
,
1109 t_forcerec
* gmx_restrict fr
,
1110 t_mdatoms
* gmx_restrict mdatoms
,
1111 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1112 t_nrnb
* gmx_restrict nrnb
)
1114 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1115 * just 0 for non-waters.
1116 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1117 * jnr indices corresponding to data put in the two positions in the SIMD register.
1119 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1120 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1122 int j_coord_offsetA
,j_coord_offsetB
;
1123 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1124 real rcutoff_scalar
;
1125 real
*shiftvec
,*fshift
,*x
,*f
;
1126 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1128 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1130 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1132 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1133 int vdwjidx0A
,vdwjidx0B
;
1134 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1135 int vdwjidx1A
,vdwjidx1B
;
1136 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1137 int vdwjidx2A
,vdwjidx2B
;
1138 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1139 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1140 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1141 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1142 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1143 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1144 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1145 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1146 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1147 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1148 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1151 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1154 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1155 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1157 __m128i ifour
= _mm_set1_epi32(4);
1158 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
,twovfeps
;
1160 __m128d dummy_mask
,cutoff_mask
;
1161 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1162 __m128d one
= _mm_set1_pd(1.0);
1163 __m128d two
= _mm_set1_pd(2.0);
1169 jindex
= nlist
->jindex
;
1171 shiftidx
= nlist
->shift
;
1173 shiftvec
= fr
->shift_vec
[0];
1174 fshift
= fr
->fshift
[0];
1175 facel
= _mm_set1_pd(fr
->epsfac
);
1176 charge
= mdatoms
->chargeA
;
1177 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
1178 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
1179 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
1180 nvdwtype
= fr
->ntype
;
1181 vdwparam
= fr
->nbfp
;
1182 vdwtype
= mdatoms
->typeA
;
1184 vftab
= kernel_data
->table_vdw
->data
;
1185 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
1187 /* Setup water-specific parameters */
1188 inr
= nlist
->iinr
[0];
1189 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1190 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1191 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1192 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1194 jq0
= _mm_set1_pd(charge
[inr
+0]);
1195 jq1
= _mm_set1_pd(charge
[inr
+1]);
1196 jq2
= _mm_set1_pd(charge
[inr
+2]);
1197 vdwjidx0A
= 2*vdwtype
[inr
+0];
1198 qq00
= _mm_mul_pd(iq0
,jq0
);
1199 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1200 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1201 qq01
= _mm_mul_pd(iq0
,jq1
);
1202 qq02
= _mm_mul_pd(iq0
,jq2
);
1203 qq10
= _mm_mul_pd(iq1
,jq0
);
1204 qq11
= _mm_mul_pd(iq1
,jq1
);
1205 qq12
= _mm_mul_pd(iq1
,jq2
);
1206 qq20
= _mm_mul_pd(iq2
,jq0
);
1207 qq21
= _mm_mul_pd(iq2
,jq1
);
1208 qq22
= _mm_mul_pd(iq2
,jq2
);
1210 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1211 rcutoff_scalar
= fr
->rcoulomb
;
1212 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
1213 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
1215 /* Avoid stupid compiler warnings */
1217 j_coord_offsetA
= 0;
1218 j_coord_offsetB
= 0;
1223 /* Start outer loop over neighborlists */
1224 for(iidx
=0; iidx
<nri
; iidx
++)
1226 /* Load shift vector for this list */
1227 i_shift_offset
= DIM
*shiftidx
[iidx
];
1229 /* Load limits for loop over neighbors */
1230 j_index_start
= jindex
[iidx
];
1231 j_index_end
= jindex
[iidx
+1];
1233 /* Get outer coordinate index */
1235 i_coord_offset
= DIM
*inr
;
1237 /* Load i particle coords and add shift vector */
1238 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1239 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1241 fix0
= _mm_setzero_pd();
1242 fiy0
= _mm_setzero_pd();
1243 fiz0
= _mm_setzero_pd();
1244 fix1
= _mm_setzero_pd();
1245 fiy1
= _mm_setzero_pd();
1246 fiz1
= _mm_setzero_pd();
1247 fix2
= _mm_setzero_pd();
1248 fiy2
= _mm_setzero_pd();
1249 fiz2
= _mm_setzero_pd();
1251 /* Start inner kernel loop */
1252 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1255 /* Get j neighbor index, and coordinate index */
1257 jnrB
= jjnr
[jidx
+1];
1258 j_coord_offsetA
= DIM
*jnrA
;
1259 j_coord_offsetB
= DIM
*jnrB
;
1261 /* load j atom coordinates */
1262 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1263 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1265 /* Calculate displacement vector */
1266 dx00
= _mm_sub_pd(ix0
,jx0
);
1267 dy00
= _mm_sub_pd(iy0
,jy0
);
1268 dz00
= _mm_sub_pd(iz0
,jz0
);
1269 dx01
= _mm_sub_pd(ix0
,jx1
);
1270 dy01
= _mm_sub_pd(iy0
,jy1
);
1271 dz01
= _mm_sub_pd(iz0
,jz1
);
1272 dx02
= _mm_sub_pd(ix0
,jx2
);
1273 dy02
= _mm_sub_pd(iy0
,jy2
);
1274 dz02
= _mm_sub_pd(iz0
,jz2
);
1275 dx10
= _mm_sub_pd(ix1
,jx0
);
1276 dy10
= _mm_sub_pd(iy1
,jy0
);
1277 dz10
= _mm_sub_pd(iz1
,jz0
);
1278 dx11
= _mm_sub_pd(ix1
,jx1
);
1279 dy11
= _mm_sub_pd(iy1
,jy1
);
1280 dz11
= _mm_sub_pd(iz1
,jz1
);
1281 dx12
= _mm_sub_pd(ix1
,jx2
);
1282 dy12
= _mm_sub_pd(iy1
,jy2
);
1283 dz12
= _mm_sub_pd(iz1
,jz2
);
1284 dx20
= _mm_sub_pd(ix2
,jx0
);
1285 dy20
= _mm_sub_pd(iy2
,jy0
);
1286 dz20
= _mm_sub_pd(iz2
,jz0
);
1287 dx21
= _mm_sub_pd(ix2
,jx1
);
1288 dy21
= _mm_sub_pd(iy2
,jy1
);
1289 dz21
= _mm_sub_pd(iz2
,jz1
);
1290 dx22
= _mm_sub_pd(ix2
,jx2
);
1291 dy22
= _mm_sub_pd(iy2
,jy2
);
1292 dz22
= _mm_sub_pd(iz2
,jz2
);
1294 /* Calculate squared distance and things based on it */
1295 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1296 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1297 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1298 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1299 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1300 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1301 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1302 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1303 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1305 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1306 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1307 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1308 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1309 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1310 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1311 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1312 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1313 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1315 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1316 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1317 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1318 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1319 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1320 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1321 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1322 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1323 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1325 fjx0
= _mm_setzero_pd();
1326 fjy0
= _mm_setzero_pd();
1327 fjz0
= _mm_setzero_pd();
1328 fjx1
= _mm_setzero_pd();
1329 fjy1
= _mm_setzero_pd();
1330 fjz1
= _mm_setzero_pd();
1331 fjx2
= _mm_setzero_pd();
1332 fjy2
= _mm_setzero_pd();
1333 fjz2
= _mm_setzero_pd();
1335 /**************************
1336 * CALCULATE INTERACTIONS *
1337 **************************/
1339 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1342 r00
= _mm_mul_pd(rsq00
,rinv00
);
1344 /* Calculate table index by multiplying r with table scale and truncate to integer */
1345 rt
= _mm_mul_pd(r00
,vftabscale
);
1346 vfitab
= _mm_cvttpd_epi32(rt
);
1348 vfeps
= _mm_frcz_pd(rt
);
1350 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1352 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
1353 vfitab
= _mm_slli_epi32(vfitab
,3);
1355 /* REACTION-FIELD ELECTROSTATICS */
1356 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
1358 /* CUBIC SPLINE TABLE DISPERSION */
1359 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1360 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
1361 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1362 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1363 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
1364 GMX_MM_TRANSPOSE2_PD(G
,H
);
1365 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1366 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1367 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1369 /* CUBIC SPLINE TABLE REPULSION */
1370 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1371 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1372 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
1373 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1374 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1375 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
1376 GMX_MM_TRANSPOSE2_PD(G
,H
);
1377 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1378 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1379 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1380 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1382 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1384 fscal
= _mm_add_pd(felec
,fvdw
);
1386 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1388 /* Update vectorial force */
1389 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
1390 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
1391 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
1393 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
1394 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
1395 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
1399 /**************************
1400 * CALCULATE INTERACTIONS *
1401 **************************/
1403 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1406 /* REACTION-FIELD ELECTROSTATICS */
1407 felec
= _mm_mul_pd(qq01
,_mm_msub_pd(rinv01
,rinvsq01
,krf2
));
1409 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
1413 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1415 /* Update vectorial force */
1416 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
1417 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
1418 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
1420 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
1421 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
1422 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
1426 /**************************
1427 * CALCULATE INTERACTIONS *
1428 **************************/
1430 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1433 /* REACTION-FIELD ELECTROSTATICS */
1434 felec
= _mm_mul_pd(qq02
,_mm_msub_pd(rinv02
,rinvsq02
,krf2
));
1436 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1440 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1442 /* Update vectorial force */
1443 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
1444 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
1445 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
1447 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
1448 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
1449 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
1453 /**************************
1454 * CALCULATE INTERACTIONS *
1455 **************************/
1457 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1460 /* REACTION-FIELD ELECTROSTATICS */
1461 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
1463 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1467 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1469 /* Update vectorial force */
1470 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
1471 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
1472 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
1474 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
1475 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
1476 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
1480 /**************************
1481 * CALCULATE INTERACTIONS *
1482 **************************/
1484 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1487 /* REACTION-FIELD ELECTROSTATICS */
1488 felec
= _mm_mul_pd(qq11
,_mm_msub_pd(rinv11
,rinvsq11
,krf2
));
1490 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1494 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1496 /* Update vectorial force */
1497 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
1498 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
1499 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
1501 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
1502 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
1503 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
1507 /**************************
1508 * CALCULATE INTERACTIONS *
1509 **************************/
1511 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1514 /* REACTION-FIELD ELECTROSTATICS */
1515 felec
= _mm_mul_pd(qq12
,_mm_msub_pd(rinv12
,rinvsq12
,krf2
));
1517 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1521 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1523 /* Update vectorial force */
1524 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
1525 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
1526 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
1528 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
1529 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
1530 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
1534 /**************************
1535 * CALCULATE INTERACTIONS *
1536 **************************/
1538 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1541 /* REACTION-FIELD ELECTROSTATICS */
1542 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
1544 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1548 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1550 /* Update vectorial force */
1551 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
1552 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
1553 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
1555 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
1556 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
1557 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1568 /* REACTION-FIELD ELECTROSTATICS */
1569 felec
= _mm_mul_pd(qq21
,_mm_msub_pd(rinv21
,rinvsq21
,krf2
));
1571 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1575 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1577 /* Update vectorial force */
1578 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
1579 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
1580 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
1582 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
1583 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
1584 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
1588 /**************************
1589 * CALCULATE INTERACTIONS *
1590 **************************/
1592 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1595 /* REACTION-FIELD ELECTROSTATICS */
1596 felec
= _mm_mul_pd(qq22
,_mm_msub_pd(rinv22
,rinvsq22
,krf2
));
1598 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1602 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1604 /* Update vectorial force */
1605 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
1606 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
1607 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
1609 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
1610 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
1611 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
1615 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1617 /* Inner loop uses 324 flops */
1620 if(jidx
<j_index_end
)
1624 j_coord_offsetA
= DIM
*jnrA
;
1626 /* load j atom coordinates */
1627 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1628 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1630 /* Calculate displacement vector */
1631 dx00
= _mm_sub_pd(ix0
,jx0
);
1632 dy00
= _mm_sub_pd(iy0
,jy0
);
1633 dz00
= _mm_sub_pd(iz0
,jz0
);
1634 dx01
= _mm_sub_pd(ix0
,jx1
);
1635 dy01
= _mm_sub_pd(iy0
,jy1
);
1636 dz01
= _mm_sub_pd(iz0
,jz1
);
1637 dx02
= _mm_sub_pd(ix0
,jx2
);
1638 dy02
= _mm_sub_pd(iy0
,jy2
);
1639 dz02
= _mm_sub_pd(iz0
,jz2
);
1640 dx10
= _mm_sub_pd(ix1
,jx0
);
1641 dy10
= _mm_sub_pd(iy1
,jy0
);
1642 dz10
= _mm_sub_pd(iz1
,jz0
);
1643 dx11
= _mm_sub_pd(ix1
,jx1
);
1644 dy11
= _mm_sub_pd(iy1
,jy1
);
1645 dz11
= _mm_sub_pd(iz1
,jz1
);
1646 dx12
= _mm_sub_pd(ix1
,jx2
);
1647 dy12
= _mm_sub_pd(iy1
,jy2
);
1648 dz12
= _mm_sub_pd(iz1
,jz2
);
1649 dx20
= _mm_sub_pd(ix2
,jx0
);
1650 dy20
= _mm_sub_pd(iy2
,jy0
);
1651 dz20
= _mm_sub_pd(iz2
,jz0
);
1652 dx21
= _mm_sub_pd(ix2
,jx1
);
1653 dy21
= _mm_sub_pd(iy2
,jy1
);
1654 dz21
= _mm_sub_pd(iz2
,jz1
);
1655 dx22
= _mm_sub_pd(ix2
,jx2
);
1656 dy22
= _mm_sub_pd(iy2
,jy2
);
1657 dz22
= _mm_sub_pd(iz2
,jz2
);
1659 /* Calculate squared distance and things based on it */
1660 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1661 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1662 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1663 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1664 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1665 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1666 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1667 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1668 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1670 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1671 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1672 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1673 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1674 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1675 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1676 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1677 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1678 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1680 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1681 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1682 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1683 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1684 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1685 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1686 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1687 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1688 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1690 fjx0
= _mm_setzero_pd();
1691 fjy0
= _mm_setzero_pd();
1692 fjz0
= _mm_setzero_pd();
1693 fjx1
= _mm_setzero_pd();
1694 fjy1
= _mm_setzero_pd();
1695 fjz1
= _mm_setzero_pd();
1696 fjx2
= _mm_setzero_pd();
1697 fjy2
= _mm_setzero_pd();
1698 fjz2
= _mm_setzero_pd();
1700 /**************************
1701 * CALCULATE INTERACTIONS *
1702 **************************/
1704 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
1707 r00
= _mm_mul_pd(rsq00
,rinv00
);
1709 /* Calculate table index by multiplying r with table scale and truncate to integer */
1710 rt
= _mm_mul_pd(r00
,vftabscale
);
1711 vfitab
= _mm_cvttpd_epi32(rt
);
1713 vfeps
= _mm_frcz_pd(rt
);
1715 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1717 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
1718 vfitab
= _mm_slli_epi32(vfitab
,3);
1720 /* REACTION-FIELD ELECTROSTATICS */
1721 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
1723 /* CUBIC SPLINE TABLE DISPERSION */
1724 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1725 F
= _mm_setzero_pd();
1726 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1727 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1728 H
= _mm_setzero_pd();
1729 GMX_MM_TRANSPOSE2_PD(G
,H
);
1730 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1731 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1732 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1734 /* CUBIC SPLINE TABLE REPULSION */
1735 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1736 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1737 F
= _mm_setzero_pd();
1738 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1739 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1740 H
= _mm_setzero_pd();
1741 GMX_MM_TRANSPOSE2_PD(G
,H
);
1742 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1743 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1744 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1745 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1747 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1749 fscal
= _mm_add_pd(felec
,fvdw
);
1751 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1753 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1755 /* Update vectorial force */
1756 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
1757 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
1758 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
1760 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
1761 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
1762 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 if (gmx_mm_any_lt(rsq01
,rcutoff2
))
1773 /* REACTION-FIELD ELECTROSTATICS */
1774 felec
= _mm_mul_pd(qq01
,_mm_msub_pd(rinv01
,rinvsq01
,krf2
));
1776 cutoff_mask
= _mm_cmplt_pd(rsq01
,rcutoff2
);
1780 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1782 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1784 /* Update vectorial force */
1785 fix0
= _mm_macc_pd(dx01
,fscal
,fix0
);
1786 fiy0
= _mm_macc_pd(dy01
,fscal
,fiy0
);
1787 fiz0
= _mm_macc_pd(dz01
,fscal
,fiz0
);
1789 fjx1
= _mm_macc_pd(dx01
,fscal
,fjx1
);
1790 fjy1
= _mm_macc_pd(dy01
,fscal
,fjy1
);
1791 fjz1
= _mm_macc_pd(dz01
,fscal
,fjz1
);
1795 /**************************
1796 * CALCULATE INTERACTIONS *
1797 **************************/
1799 if (gmx_mm_any_lt(rsq02
,rcutoff2
))
1802 /* REACTION-FIELD ELECTROSTATICS */
1803 felec
= _mm_mul_pd(qq02
,_mm_msub_pd(rinv02
,rinvsq02
,krf2
));
1805 cutoff_mask
= _mm_cmplt_pd(rsq02
,rcutoff2
);
1809 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1811 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1813 /* Update vectorial force */
1814 fix0
= _mm_macc_pd(dx02
,fscal
,fix0
);
1815 fiy0
= _mm_macc_pd(dy02
,fscal
,fiy0
);
1816 fiz0
= _mm_macc_pd(dz02
,fscal
,fiz0
);
1818 fjx2
= _mm_macc_pd(dx02
,fscal
,fjx2
);
1819 fjy2
= _mm_macc_pd(dy02
,fscal
,fjy2
);
1820 fjz2
= _mm_macc_pd(dz02
,fscal
,fjz2
);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1831 /* REACTION-FIELD ELECTROSTATICS */
1832 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
1834 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1838 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1840 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1842 /* Update vectorial force */
1843 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
1844 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
1845 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
1847 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
1848 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
1849 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 if (gmx_mm_any_lt(rsq11
,rcutoff2
))
1860 /* REACTION-FIELD ELECTROSTATICS */
1861 felec
= _mm_mul_pd(qq11
,_mm_msub_pd(rinv11
,rinvsq11
,krf2
));
1863 cutoff_mask
= _mm_cmplt_pd(rsq11
,rcutoff2
);
1867 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1869 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1871 /* Update vectorial force */
1872 fix1
= _mm_macc_pd(dx11
,fscal
,fix1
);
1873 fiy1
= _mm_macc_pd(dy11
,fscal
,fiy1
);
1874 fiz1
= _mm_macc_pd(dz11
,fscal
,fiz1
);
1876 fjx1
= _mm_macc_pd(dx11
,fscal
,fjx1
);
1877 fjy1
= _mm_macc_pd(dy11
,fscal
,fjy1
);
1878 fjz1
= _mm_macc_pd(dz11
,fscal
,fjz1
);
1882 /**************************
1883 * CALCULATE INTERACTIONS *
1884 **************************/
1886 if (gmx_mm_any_lt(rsq12
,rcutoff2
))
1889 /* REACTION-FIELD ELECTROSTATICS */
1890 felec
= _mm_mul_pd(qq12
,_mm_msub_pd(rinv12
,rinvsq12
,krf2
));
1892 cutoff_mask
= _mm_cmplt_pd(rsq12
,rcutoff2
);
1896 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1898 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1900 /* Update vectorial force */
1901 fix1
= _mm_macc_pd(dx12
,fscal
,fix1
);
1902 fiy1
= _mm_macc_pd(dy12
,fscal
,fiy1
);
1903 fiz1
= _mm_macc_pd(dz12
,fscal
,fiz1
);
1905 fjx2
= _mm_macc_pd(dx12
,fscal
,fjx2
);
1906 fjy2
= _mm_macc_pd(dy12
,fscal
,fjy2
);
1907 fjz2
= _mm_macc_pd(dz12
,fscal
,fjz2
);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1918 /* REACTION-FIELD ELECTROSTATICS */
1919 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
1921 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1925 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1927 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1929 /* Update vectorial force */
1930 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
1931 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
1932 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
1934 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
1935 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
1936 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 if (gmx_mm_any_lt(rsq21
,rcutoff2
))
1947 /* REACTION-FIELD ELECTROSTATICS */
1948 felec
= _mm_mul_pd(qq21
,_mm_msub_pd(rinv21
,rinvsq21
,krf2
));
1950 cutoff_mask
= _mm_cmplt_pd(rsq21
,rcutoff2
);
1954 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1956 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1958 /* Update vectorial force */
1959 fix2
= _mm_macc_pd(dx21
,fscal
,fix2
);
1960 fiy2
= _mm_macc_pd(dy21
,fscal
,fiy2
);
1961 fiz2
= _mm_macc_pd(dz21
,fscal
,fiz2
);
1963 fjx1
= _mm_macc_pd(dx21
,fscal
,fjx1
);
1964 fjy1
= _mm_macc_pd(dy21
,fscal
,fjy1
);
1965 fjz1
= _mm_macc_pd(dz21
,fscal
,fjz1
);
1969 /**************************
1970 * CALCULATE INTERACTIONS *
1971 **************************/
1973 if (gmx_mm_any_lt(rsq22
,rcutoff2
))
1976 /* REACTION-FIELD ELECTROSTATICS */
1977 felec
= _mm_mul_pd(qq22
,_mm_msub_pd(rinv22
,rinvsq22
,krf2
));
1979 cutoff_mask
= _mm_cmplt_pd(rsq22
,rcutoff2
);
1983 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1985 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1987 /* Update vectorial force */
1988 fix2
= _mm_macc_pd(dx22
,fscal
,fix2
);
1989 fiy2
= _mm_macc_pd(dy22
,fscal
,fiy2
);
1990 fiz2
= _mm_macc_pd(dz22
,fscal
,fiz2
);
1992 fjx2
= _mm_macc_pd(dx22
,fscal
,fjx2
);
1993 fjy2
= _mm_macc_pd(dy22
,fscal
,fjy2
);
1994 fjz2
= _mm_macc_pd(dz22
,fscal
,fjz2
);
1998 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2000 /* Inner loop uses 324 flops */
2003 /* End of innermost loop */
2005 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2006 f
+i_coord_offset
,fshift
+i_shift_offset
);
2008 /* Increment number of inner iterations */
2009 inneriter
+= j_index_end
- j_index_start
;
2011 /* Outer loop uses 18 flops */
2014 /* Increment number of outer iterations */
2017 /* Update outer/inner flops */
2019 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*324);