2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
49 #include "kernelutil_x86_avx_128_fma_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_double
53 * Electrostatics interaction: ReactionField
54 * VdW interaction: CubicSplineTable
55 * Geometry: Water3-Particle
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_VF_avx_128_fma_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
71 * jnr indices whose data is placed in the two positions of the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
91 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
101 __m128i ifour
= _mm_set1_epi32(4);
102 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
,twovfeps
;
104 __m128d dummy_mask
,cutoff_mask
;
105 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
106 __m128d one
= _mm_set1_pd(1.0);
107 __m128d two
= _mm_set1_pd(2.0);
113 jindex
= nlist
->jindex
;
115 shiftidx
= nlist
->shift
;
117 shiftvec
= fr
->shift_vec
[0];
118 fshift
= fr
->fshift
[0];
119 facel
= _mm_set1_pd(fr
->epsfac
);
120 charge
= mdatoms
->chargeA
;
121 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
122 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
123 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
124 nvdwtype
= fr
->ntype
;
126 vdwtype
= mdatoms
->typeA
;
128 vftab
= kernel_data
->table_vdw
->data
;
129 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
131 /* Setup water-specific parameters */
132 inr
= nlist
->iinr
[0];
133 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
134 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
135 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
136 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
138 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
139 rcutoff_scalar
= fr
->rcoulomb
;
140 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
141 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
143 /* Avoid stupid compiler warnings */
151 /* Start outer loop over neighborlists */
152 for(iidx
=0; iidx
<nri
; iidx
++)
154 /* Load shift vector for this list */
155 i_shift_offset
= DIM
*shiftidx
[iidx
];
157 /* Load limits for loop over neighbors */
158 j_index_start
= jindex
[iidx
];
159 j_index_end
= jindex
[iidx
+1];
161 /* Get outer coordinate index */
163 i_coord_offset
= DIM
*inr
;
165 /* Load i particle coords and add shift vector */
166 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
167 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
169 fix0
= _mm_setzero_pd();
170 fiy0
= _mm_setzero_pd();
171 fiz0
= _mm_setzero_pd();
172 fix1
= _mm_setzero_pd();
173 fiy1
= _mm_setzero_pd();
174 fiz1
= _mm_setzero_pd();
175 fix2
= _mm_setzero_pd();
176 fiy2
= _mm_setzero_pd();
177 fiz2
= _mm_setzero_pd();
179 /* Reset potential sums */
180 velecsum
= _mm_setzero_pd();
181 vvdwsum
= _mm_setzero_pd();
183 /* Start inner kernel loop */
184 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
187 /* Get j neighbor index, and coordinate index */
190 j_coord_offsetA
= DIM
*jnrA
;
191 j_coord_offsetB
= DIM
*jnrB
;
193 /* load j atom coordinates */
194 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
197 /* Calculate displacement vector */
198 dx00
= _mm_sub_pd(ix0
,jx0
);
199 dy00
= _mm_sub_pd(iy0
,jy0
);
200 dz00
= _mm_sub_pd(iz0
,jz0
);
201 dx10
= _mm_sub_pd(ix1
,jx0
);
202 dy10
= _mm_sub_pd(iy1
,jy0
);
203 dz10
= _mm_sub_pd(iz1
,jz0
);
204 dx20
= _mm_sub_pd(ix2
,jx0
);
205 dy20
= _mm_sub_pd(iy2
,jy0
);
206 dz20
= _mm_sub_pd(iz2
,jz0
);
208 /* Calculate squared distance and things based on it */
209 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
210 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
211 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
213 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
214 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
215 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
217 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
218 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
219 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
221 /* Load parameters for j particles */
222 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
223 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
224 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
226 fjx0
= _mm_setzero_pd();
227 fjy0
= _mm_setzero_pd();
228 fjz0
= _mm_setzero_pd();
230 /**************************
231 * CALCULATE INTERACTIONS *
232 **************************/
234 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
237 r00
= _mm_mul_pd(rsq00
,rinv00
);
239 /* Compute parameters for interactions between i and j atoms */
240 qq00
= _mm_mul_pd(iq0
,jq0
);
241 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
242 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
244 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
245 rt
= _mm_mul_pd(r00
,vftabscale
);
246 vfitab
= _mm_cvttpd_epi32(rt
);
248 vfeps
= _mm_frcz_pd(rt
);
250 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
252 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
253 vfitab
= _mm_slli_epi32(vfitab
,3);
255 /* REACTION-FIELD ELECTROSTATICS */
256 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_macc_pd(krf
,rsq00
,rinv00
),crf
));
257 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
259 /* CUBIC SPLINE TABLE DISPERSION */
260 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
261 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
262 GMX_MM_TRANSPOSE2_PD(Y
,F
);
263 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
264 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
265 GMX_MM_TRANSPOSE2_PD(G
,H
);
266 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
267 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
268 vvdw6
= _mm_mul_pd(c6_00
,VV
);
269 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
270 fvdw6
= _mm_mul_pd(c6_00
,FF
);
272 /* CUBIC SPLINE TABLE REPULSION */
273 vfitab
= _mm_add_epi32(vfitab
,ifour
);
274 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
275 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
276 GMX_MM_TRANSPOSE2_PD(Y
,F
);
277 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
278 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
279 GMX_MM_TRANSPOSE2_PD(G
,H
);
280 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
281 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
282 vvdw12
= _mm_mul_pd(c12_00
,VV
);
283 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
284 fvdw12
= _mm_mul_pd(c12_00
,FF
);
285 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
286 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
288 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
290 /* Update potential sum for this i atom from the interaction with this j atom. */
291 velec
= _mm_and_pd(velec
,cutoff_mask
);
292 velecsum
= _mm_add_pd(velecsum
,velec
);
293 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
294 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
296 fscal
= _mm_add_pd(felec
,fvdw
);
298 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
300 /* Update vectorial force */
301 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
302 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
303 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
305 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
306 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
307 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
318 /* Compute parameters for interactions between i and j atoms */
319 qq10
= _mm_mul_pd(iq1
,jq0
);
321 /* REACTION-FIELD ELECTROSTATICS */
322 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_macc_pd(krf
,rsq10
,rinv10
),crf
));
323 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
325 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
327 /* Update potential sum for this i atom from the interaction with this j atom. */
328 velec
= _mm_and_pd(velec
,cutoff_mask
);
329 velecsum
= _mm_add_pd(velecsum
,velec
);
333 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
335 /* Update vectorial force */
336 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
337 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
338 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
340 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
341 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
342 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
346 /**************************
347 * CALCULATE INTERACTIONS *
348 **************************/
350 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
353 /* Compute parameters for interactions between i and j atoms */
354 qq20
= _mm_mul_pd(iq2
,jq0
);
356 /* REACTION-FIELD ELECTROSTATICS */
357 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_macc_pd(krf
,rsq20
,rinv20
),crf
));
358 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
360 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velec
= _mm_and_pd(velec
,cutoff_mask
);
364 velecsum
= _mm_add_pd(velecsum
,velec
);
368 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
370 /* Update vectorial force */
371 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
372 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
373 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
375 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
376 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
377 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
381 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
383 /* Inner loop uses 156 flops */
390 j_coord_offsetA
= DIM
*jnrA
;
392 /* load j atom coordinates */
393 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
396 /* Calculate displacement vector */
397 dx00
= _mm_sub_pd(ix0
,jx0
);
398 dy00
= _mm_sub_pd(iy0
,jy0
);
399 dz00
= _mm_sub_pd(iz0
,jz0
);
400 dx10
= _mm_sub_pd(ix1
,jx0
);
401 dy10
= _mm_sub_pd(iy1
,jy0
);
402 dz10
= _mm_sub_pd(iz1
,jz0
);
403 dx20
= _mm_sub_pd(ix2
,jx0
);
404 dy20
= _mm_sub_pd(iy2
,jy0
);
405 dz20
= _mm_sub_pd(iz2
,jz0
);
407 /* Calculate squared distance and things based on it */
408 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
409 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
410 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
412 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
413 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
414 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
416 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
417 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
418 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
420 /* Load parameters for j particles */
421 jq0
= _mm_load_sd(charge
+jnrA
+0);
422 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
424 fjx0
= _mm_setzero_pd();
425 fjy0
= _mm_setzero_pd();
426 fjz0
= _mm_setzero_pd();
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
432 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
435 r00
= _mm_mul_pd(rsq00
,rinv00
);
437 /* Compute parameters for interactions between i and j atoms */
438 qq00
= _mm_mul_pd(iq0
,jq0
);
439 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
441 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
442 rt
= _mm_mul_pd(r00
,vftabscale
);
443 vfitab
= _mm_cvttpd_epi32(rt
);
445 vfeps
= _mm_frcz_pd(rt
);
447 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
449 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
450 vfitab
= _mm_slli_epi32(vfitab
,3);
452 /* REACTION-FIELD ELECTROSTATICS */
453 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_macc_pd(krf
,rsq00
,rinv00
),crf
));
454 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
456 /* CUBIC SPLINE TABLE DISPERSION */
457 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
458 F
= _mm_setzero_pd();
459 GMX_MM_TRANSPOSE2_PD(Y
,F
);
460 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
461 H
= _mm_setzero_pd();
462 GMX_MM_TRANSPOSE2_PD(G
,H
);
463 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
464 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
465 vvdw6
= _mm_mul_pd(c6_00
,VV
);
466 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
467 fvdw6
= _mm_mul_pd(c6_00
,FF
);
469 /* CUBIC SPLINE TABLE REPULSION */
470 vfitab
= _mm_add_epi32(vfitab
,ifour
);
471 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
472 F
= _mm_setzero_pd();
473 GMX_MM_TRANSPOSE2_PD(Y
,F
);
474 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
475 H
= _mm_setzero_pd();
476 GMX_MM_TRANSPOSE2_PD(G
,H
);
477 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
478 VV
= _mm_macc_pd(vfeps
,Fp
,Y
);
479 vvdw12
= _mm_mul_pd(c12_00
,VV
);
480 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
481 fvdw12
= _mm_mul_pd(c12_00
,FF
);
482 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
483 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
485 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velec
= _mm_and_pd(velec
,cutoff_mask
);
489 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
490 velecsum
= _mm_add_pd(velecsum
,velec
);
491 vvdw
= _mm_and_pd(vvdw
,cutoff_mask
);
492 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
493 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
495 fscal
= _mm_add_pd(felec
,fvdw
);
497 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
499 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
501 /* Update vectorial force */
502 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
503 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
504 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
506 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
507 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
508 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
512 /**************************
513 * CALCULATE INTERACTIONS *
514 **************************/
516 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
519 /* Compute parameters for interactions between i and j atoms */
520 qq10
= _mm_mul_pd(iq1
,jq0
);
522 /* REACTION-FIELD ELECTROSTATICS */
523 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_macc_pd(krf
,rsq10
,rinv10
),crf
));
524 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
526 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
528 /* Update potential sum for this i atom from the interaction with this j atom. */
529 velec
= _mm_and_pd(velec
,cutoff_mask
);
530 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
531 velecsum
= _mm_add_pd(velecsum
,velec
);
535 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
537 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
539 /* Update vectorial force */
540 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
541 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
542 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
544 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
545 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
546 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
557 /* Compute parameters for interactions between i and j atoms */
558 qq20
= _mm_mul_pd(iq2
,jq0
);
560 /* REACTION-FIELD ELECTROSTATICS */
561 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_macc_pd(krf
,rsq20
,rinv20
),crf
));
562 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
564 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
566 /* Update potential sum for this i atom from the interaction with this j atom. */
567 velec
= _mm_and_pd(velec
,cutoff_mask
);
568 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
569 velecsum
= _mm_add_pd(velecsum
,velec
);
573 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
575 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
577 /* Update vectorial force */
578 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
579 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
580 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
582 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
583 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
584 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
588 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
590 /* Inner loop uses 156 flops */
593 /* End of innermost loop */
595 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
596 f
+i_coord_offset
,fshift
+i_shift_offset
);
599 /* Update potential energies */
600 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
601 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
603 /* Increment number of inner iterations */
604 inneriter
+= j_index_end
- j_index_start
;
606 /* Outer loop uses 20 flops */
609 /* Increment number of outer iterations */
612 /* Update outer/inner flops */
614 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*156);
617 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_double
618 * Electrostatics interaction: ReactionField
619 * VdW interaction: CubicSplineTable
620 * Geometry: Water3-Particle
621 * Calculate force/pot: Force
624 nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_F_avx_128_fma_double
625 (t_nblist
* gmx_restrict nlist
,
626 rvec
* gmx_restrict xx
,
627 rvec
* gmx_restrict ff
,
628 t_forcerec
* gmx_restrict fr
,
629 t_mdatoms
* gmx_restrict mdatoms
,
630 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
631 t_nrnb
* gmx_restrict nrnb
)
633 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
634 * just 0 for non-waters.
635 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
636 * jnr indices whose data is placed in the two positions of the SIMD register.
638 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
639 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
641 int j_coord_offsetA
,j_coord_offsetB
;
642 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
644 real
*shiftvec
,*fshift
,*x
,*f
;
645 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
647 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
649 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
651 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
652 int vdwjidx0A
,vdwjidx0B
;
653 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
654 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
655 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
656 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
657 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
660 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
663 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
664 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
666 __m128i ifour
= _mm_set1_epi32(4);
667 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
,twovfeps
;
669 __m128d dummy_mask
,cutoff_mask
;
670 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
671 __m128d one
= _mm_set1_pd(1.0);
672 __m128d two
= _mm_set1_pd(2.0);
678 jindex
= nlist
->jindex
;
680 shiftidx
= nlist
->shift
;
682 shiftvec
= fr
->shift_vec
[0];
683 fshift
= fr
->fshift
[0];
684 facel
= _mm_set1_pd(fr
->epsfac
);
685 charge
= mdatoms
->chargeA
;
686 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
687 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
688 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
689 nvdwtype
= fr
->ntype
;
691 vdwtype
= mdatoms
->typeA
;
693 vftab
= kernel_data
->table_vdw
->data
;
694 vftabscale
= _mm_set1_pd(kernel_data
->table_vdw
->scale
);
696 /* Setup water-specific parameters */
697 inr
= nlist
->iinr
[0];
698 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
699 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
700 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
701 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
703 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
704 rcutoff_scalar
= fr
->rcoulomb
;
705 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
706 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
708 /* Avoid stupid compiler warnings */
716 /* Start outer loop over neighborlists */
717 for(iidx
=0; iidx
<nri
; iidx
++)
719 /* Load shift vector for this list */
720 i_shift_offset
= DIM
*shiftidx
[iidx
];
722 /* Load limits for loop over neighbors */
723 j_index_start
= jindex
[iidx
];
724 j_index_end
= jindex
[iidx
+1];
726 /* Get outer coordinate index */
728 i_coord_offset
= DIM
*inr
;
730 /* Load i particle coords and add shift vector */
731 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
732 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
734 fix0
= _mm_setzero_pd();
735 fiy0
= _mm_setzero_pd();
736 fiz0
= _mm_setzero_pd();
737 fix1
= _mm_setzero_pd();
738 fiy1
= _mm_setzero_pd();
739 fiz1
= _mm_setzero_pd();
740 fix2
= _mm_setzero_pd();
741 fiy2
= _mm_setzero_pd();
742 fiz2
= _mm_setzero_pd();
744 /* Start inner kernel loop */
745 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
748 /* Get j neighbor index, and coordinate index */
751 j_coord_offsetA
= DIM
*jnrA
;
752 j_coord_offsetB
= DIM
*jnrB
;
754 /* load j atom coordinates */
755 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
758 /* Calculate displacement vector */
759 dx00
= _mm_sub_pd(ix0
,jx0
);
760 dy00
= _mm_sub_pd(iy0
,jy0
);
761 dz00
= _mm_sub_pd(iz0
,jz0
);
762 dx10
= _mm_sub_pd(ix1
,jx0
);
763 dy10
= _mm_sub_pd(iy1
,jy0
);
764 dz10
= _mm_sub_pd(iz1
,jz0
);
765 dx20
= _mm_sub_pd(ix2
,jx0
);
766 dy20
= _mm_sub_pd(iy2
,jy0
);
767 dz20
= _mm_sub_pd(iz2
,jz0
);
769 /* Calculate squared distance and things based on it */
770 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
771 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
772 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
774 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
775 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
776 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
778 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
779 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
780 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
782 /* Load parameters for j particles */
783 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
784 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
785 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
787 fjx0
= _mm_setzero_pd();
788 fjy0
= _mm_setzero_pd();
789 fjz0
= _mm_setzero_pd();
791 /**************************
792 * CALCULATE INTERACTIONS *
793 **************************/
795 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
798 r00
= _mm_mul_pd(rsq00
,rinv00
);
800 /* Compute parameters for interactions between i and j atoms */
801 qq00
= _mm_mul_pd(iq0
,jq0
);
802 gmx_mm_load_2pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,
803 vdwparam
+vdwioffset0
+vdwjidx0B
,&c6_00
,&c12_00
);
805 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
806 rt
= _mm_mul_pd(r00
,vftabscale
);
807 vfitab
= _mm_cvttpd_epi32(rt
);
809 vfeps
= _mm_frcz_pd(rt
);
811 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
813 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
814 vfitab
= _mm_slli_epi32(vfitab
,3);
816 /* REACTION-FIELD ELECTROSTATICS */
817 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
819 /* CUBIC SPLINE TABLE DISPERSION */
820 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
821 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
822 GMX_MM_TRANSPOSE2_PD(Y
,F
);
823 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
824 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
825 GMX_MM_TRANSPOSE2_PD(G
,H
);
826 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
827 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
828 fvdw6
= _mm_mul_pd(c6_00
,FF
);
830 /* CUBIC SPLINE TABLE REPULSION */
831 vfitab
= _mm_add_epi32(vfitab
,ifour
);
832 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
833 F
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
834 GMX_MM_TRANSPOSE2_PD(Y
,F
);
835 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
836 H
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) +2);
837 GMX_MM_TRANSPOSE2_PD(G
,H
);
838 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
839 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
840 fvdw12
= _mm_mul_pd(c12_00
,FF
);
841 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
843 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
845 fscal
= _mm_add_pd(felec
,fvdw
);
847 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
849 /* Update vectorial force */
850 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
851 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
852 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
854 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
855 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
856 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
860 /**************************
861 * CALCULATE INTERACTIONS *
862 **************************/
864 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
867 /* Compute parameters for interactions between i and j atoms */
868 qq10
= _mm_mul_pd(iq1
,jq0
);
870 /* REACTION-FIELD ELECTROSTATICS */
871 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
873 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
877 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
879 /* Update vectorial force */
880 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
881 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
882 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
884 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
885 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
886 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
890 /**************************
891 * CALCULATE INTERACTIONS *
892 **************************/
894 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
897 /* Compute parameters for interactions between i and j atoms */
898 qq20
= _mm_mul_pd(iq2
,jq0
);
900 /* REACTION-FIELD ELECTROSTATICS */
901 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
903 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
907 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
909 /* Update vectorial force */
910 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
911 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
912 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
914 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
915 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
916 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
920 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
922 /* Inner loop uses 129 flops */
929 j_coord_offsetA
= DIM
*jnrA
;
931 /* load j atom coordinates */
932 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
935 /* Calculate displacement vector */
936 dx00
= _mm_sub_pd(ix0
,jx0
);
937 dy00
= _mm_sub_pd(iy0
,jy0
);
938 dz00
= _mm_sub_pd(iz0
,jz0
);
939 dx10
= _mm_sub_pd(ix1
,jx0
);
940 dy10
= _mm_sub_pd(iy1
,jy0
);
941 dz10
= _mm_sub_pd(iz1
,jz0
);
942 dx20
= _mm_sub_pd(ix2
,jx0
);
943 dy20
= _mm_sub_pd(iy2
,jy0
);
944 dz20
= _mm_sub_pd(iz2
,jz0
);
946 /* Calculate squared distance and things based on it */
947 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
948 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
949 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
951 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
952 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
953 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
955 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
956 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
957 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
959 /* Load parameters for j particles */
960 jq0
= _mm_load_sd(charge
+jnrA
+0);
961 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
963 fjx0
= _mm_setzero_pd();
964 fjy0
= _mm_setzero_pd();
965 fjz0
= _mm_setzero_pd();
967 /**************************
968 * CALCULATE INTERACTIONS *
969 **************************/
971 if (gmx_mm_any_lt(rsq00
,rcutoff2
))
974 r00
= _mm_mul_pd(rsq00
,rinv00
);
976 /* Compute parameters for interactions between i and j atoms */
977 qq00
= _mm_mul_pd(iq0
,jq0
);
978 gmx_mm_load_1pair_swizzle_pd(vdwparam
+vdwioffset0
+vdwjidx0A
,&c6_00
,&c12_00
);
980 /* Calculate the table index by multiplying r by the table scale and truncating to an integer */
981 rt
= _mm_mul_pd(r00
,vftabscale
);
982 vfitab
= _mm_cvttpd_epi32(rt
);
984 vfeps
= _mm_frcz_pd(rt
);
986 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
988 twovfeps
= _mm_add_pd(vfeps
,vfeps
);
989 vfitab
= _mm_slli_epi32(vfitab
,3);
991 /* REACTION-FIELD ELECTROSTATICS */
992 felec
= _mm_mul_pd(qq00
,_mm_msub_pd(rinv00
,rinvsq00
,krf2
));
994 /* CUBIC SPLINE TABLE DISPERSION */
995 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
996 F
= _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(Y
,F
);
998 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
999 H
= _mm_setzero_pd();
1000 GMX_MM_TRANSPOSE2_PD(G
,H
);
1001 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1002 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1003 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1005 /* CUBIC SPLINE TABLE REPULSION */
1006 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1007 Y
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1008 F
= _mm_setzero_pd();
1009 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1010 G
= _mm_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) +2);
1011 H
= _mm_setzero_pd();
1012 GMX_MM_TRANSPOSE2_PD(G
,H
);
1013 Fp
= _mm_macc_pd(vfeps
,_mm_macc_pd(H
,vfeps
,G
),F
);
1014 FF
= _mm_macc_pd(vfeps
,_mm_macc_pd(twovfeps
,H
,G
),Fp
);
1015 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1016 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1018 cutoff_mask
= _mm_cmplt_pd(rsq00
,rcutoff2
);
1020 fscal
= _mm_add_pd(felec
,fvdw
);
1022 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1024 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1026 /* Update vectorial force */
1027 fix0
= _mm_macc_pd(dx00
,fscal
,fix0
);
1028 fiy0
= _mm_macc_pd(dy00
,fscal
,fiy0
);
1029 fiz0
= _mm_macc_pd(dz00
,fscal
,fiz0
);
1031 fjx0
= _mm_macc_pd(dx00
,fscal
,fjx0
);
1032 fjy0
= _mm_macc_pd(dy00
,fscal
,fjy0
);
1033 fjz0
= _mm_macc_pd(dz00
,fscal
,fjz0
);
1037 /**************************
1038 * CALCULATE INTERACTIONS *
1039 **************************/
1041 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
1044 /* Compute parameters for interactions between i and j atoms */
1045 qq10
= _mm_mul_pd(iq1
,jq0
);
1047 /* REACTION-FIELD ELECTROSTATICS */
1048 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
1050 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
1054 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1056 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1058 /* Update vectorial force */
1059 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
1060 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
1061 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
1063 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
1064 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
1065 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
1069 /**************************
1070 * CALCULATE INTERACTIONS *
1071 **************************/
1073 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
1076 /* Compute parameters for interactions between i and j atoms */
1077 qq20
= _mm_mul_pd(iq2
,jq0
);
1079 /* REACTION-FIELD ELECTROSTATICS */
1080 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
1082 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
1086 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
1088 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1090 /* Update vectorial force */
1091 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
1092 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
1093 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
1095 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
1096 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
1097 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
1101 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
1103 /* Inner loop uses 129 flops */
1106 /* End of innermost loop */
1108 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1109 f
+i_coord_offset
,fshift
+i_shift_offset
);
1111 /* Increment number of inner iterations */
1112 inneriter
+= j_index_end
- j_index_start
;
1114 /* Outer loop uses 18 flops */
1117 /* Increment number of outer iterations */
1120 /* Update outer/inner flops */
1122 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*129);