2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRF_VdwLJ_GeomW3P1_VF_avx_128_fma_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
89 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
91 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
92 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
93 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
96 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
99 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
100 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
101 __m128 dummy_mask
,cutoff_mask
;
102 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103 __m128 one
= _mm_set1_ps(1.0);
104 __m128 two
= _mm_set1_ps(2.0);
110 jindex
= nlist
->jindex
;
112 shiftidx
= nlist
->shift
;
114 shiftvec
= fr
->shift_vec
[0];
115 fshift
= fr
->fshift
[0];
116 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
117 charge
= mdatoms
->chargeA
;
118 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
119 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
120 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
121 nvdwtype
= fr
->ntype
;
123 vdwtype
= mdatoms
->typeA
;
125 /* Setup water-specific parameters */
126 inr
= nlist
->iinr
[0];
127 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
128 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
129 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
130 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
132 /* Avoid stupid compiler warnings */
133 jnrA
= jnrB
= jnrC
= jnrD
= 0;
142 for(iidx
=0;iidx
<4*DIM
;iidx
++)
147 /* Start outer loop over neighborlists */
148 for(iidx
=0; iidx
<nri
; iidx
++)
150 /* Load shift vector for this list */
151 i_shift_offset
= DIM
*shiftidx
[iidx
];
153 /* Load limits for loop over neighbors */
154 j_index_start
= jindex
[iidx
];
155 j_index_end
= jindex
[iidx
+1];
157 /* Get outer coordinate index */
159 i_coord_offset
= DIM
*inr
;
161 /* Load i particle coords and add shift vector */
162 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
163 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
165 fix0
= _mm_setzero_ps();
166 fiy0
= _mm_setzero_ps();
167 fiz0
= _mm_setzero_ps();
168 fix1
= _mm_setzero_ps();
169 fiy1
= _mm_setzero_ps();
170 fiz1
= _mm_setzero_ps();
171 fix2
= _mm_setzero_ps();
172 fiy2
= _mm_setzero_ps();
173 fiz2
= _mm_setzero_ps();
175 /* Reset potential sums */
176 velecsum
= _mm_setzero_ps();
177 vvdwsum
= _mm_setzero_ps();
179 /* Start inner kernel loop */
180 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
183 /* Get j neighbor index, and coordinate index */
188 j_coord_offsetA
= DIM
*jnrA
;
189 j_coord_offsetB
= DIM
*jnrB
;
190 j_coord_offsetC
= DIM
*jnrC
;
191 j_coord_offsetD
= DIM
*jnrD
;
193 /* load j atom coordinates */
194 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
195 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
198 /* Calculate displacement vector */
199 dx00
= _mm_sub_ps(ix0
,jx0
);
200 dy00
= _mm_sub_ps(iy0
,jy0
);
201 dz00
= _mm_sub_ps(iz0
,jz0
);
202 dx10
= _mm_sub_ps(ix1
,jx0
);
203 dy10
= _mm_sub_ps(iy1
,jy0
);
204 dz10
= _mm_sub_ps(iz1
,jz0
);
205 dx20
= _mm_sub_ps(ix2
,jx0
);
206 dy20
= _mm_sub_ps(iy2
,jy0
);
207 dz20
= _mm_sub_ps(iz2
,jz0
);
209 /* Calculate squared distance and things based on it */
210 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
211 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
212 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
214 rinv00
= avx128fma_invsqrt_f(rsq00
);
215 rinv10
= avx128fma_invsqrt_f(rsq10
);
216 rinv20
= avx128fma_invsqrt_f(rsq20
);
218 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
219 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
220 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
222 /* Load parameters for j particles */
223 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
224 charge
+jnrC
+0,charge
+jnrD
+0);
225 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
226 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
227 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
228 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
230 fjx0
= _mm_setzero_ps();
231 fjy0
= _mm_setzero_ps();
232 fjz0
= _mm_setzero_ps();
234 /**************************
235 * CALCULATE INTERACTIONS *
236 **************************/
238 /* Compute parameters for interactions between i and j atoms */
239 qq00
= _mm_mul_ps(iq0
,jq0
);
240 gmx_mm_load_4pair_swizzle_ps(vdwparam
+vdwioffset0
+vdwjidx0A
,
241 vdwparam
+vdwioffset0
+vdwjidx0B
,
242 vdwparam
+vdwioffset0
+vdwjidx0C
,
243 vdwparam
+vdwioffset0
+vdwjidx0D
,
246 /* REACTION-FIELD ELECTROSTATICS */
247 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_macc_ps(krf
,rsq00
,rinv00
),crf
));
248 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
250 /* LENNARD-JONES DISPERSION/REPULSION */
252 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
253 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
254 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
255 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
256 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
258 /* Update potential sum for this i atom from the interaction with this j atom. */
259 velecsum
= _mm_add_ps(velecsum
,velec
);
260 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
262 fscal
= _mm_add_ps(felec
,fvdw
);
264 /* Update vectorial force */
265 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
266 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
267 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
269 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
270 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
271 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
273 /**************************
274 * CALCULATE INTERACTIONS *
275 **************************/
277 /* Compute parameters for interactions between i and j atoms */
278 qq10
= _mm_mul_ps(iq1
,jq0
);
280 /* REACTION-FIELD ELECTROSTATICS */
281 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_macc_ps(krf
,rsq10
,rinv10
),crf
));
282 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
284 /* Update potential sum for this i atom from the interaction with this j atom. */
285 velecsum
= _mm_add_ps(velecsum
,velec
);
289 /* Update vectorial force */
290 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
291 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
292 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
294 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
295 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
296 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 /* Compute parameters for interactions between i and j atoms */
303 qq20
= _mm_mul_ps(iq2
,jq0
);
305 /* REACTION-FIELD ELECTROSTATICS */
306 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_macc_ps(krf
,rsq20
,rinv20
),crf
));
307 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
309 /* Update potential sum for this i atom from the interaction with this j atom. */
310 velecsum
= _mm_add_ps(velecsum
,velec
);
314 /* Update vectorial force */
315 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
316 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
317 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
319 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
320 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
321 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
323 fjptrA
= f
+j_coord_offsetA
;
324 fjptrB
= f
+j_coord_offsetB
;
325 fjptrC
= f
+j_coord_offsetC
;
326 fjptrD
= f
+j_coord_offsetD
;
328 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
330 /* Inner loop uses 117 flops */
336 /* Get j neighbor index, and coordinate index */
337 jnrlistA
= jjnr
[jidx
];
338 jnrlistB
= jjnr
[jidx
+1];
339 jnrlistC
= jjnr
[jidx
+2];
340 jnrlistD
= jjnr
[jidx
+3];
341 /* Sign of each element will be negative for non-real atoms.
342 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
343 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
345 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
346 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
347 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
348 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
349 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
350 j_coord_offsetA
= DIM
*jnrA
;
351 j_coord_offsetB
= DIM
*jnrB
;
352 j_coord_offsetC
= DIM
*jnrC
;
353 j_coord_offsetD
= DIM
*jnrD
;
355 /* load j atom coordinates */
356 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
357 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
360 /* Calculate displacement vector */
361 dx00
= _mm_sub_ps(ix0
,jx0
);
362 dy00
= _mm_sub_ps(iy0
,jy0
);
363 dz00
= _mm_sub_ps(iz0
,jz0
);
364 dx10
= _mm_sub_ps(ix1
,jx0
);
365 dy10
= _mm_sub_ps(iy1
,jy0
);
366 dz10
= _mm_sub_ps(iz1
,jz0
);
367 dx20
= _mm_sub_ps(ix2
,jx0
);
368 dy20
= _mm_sub_ps(iy2
,jy0
);
369 dz20
= _mm_sub_ps(iz2
,jz0
);
371 /* Calculate squared distance and things based on it */
372 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
373 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
374 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
376 rinv00
= avx128fma_invsqrt_f(rsq00
);
377 rinv10
= avx128fma_invsqrt_f(rsq10
);
378 rinv20
= avx128fma_invsqrt_f(rsq20
);
380 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
381 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
382 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
384 /* Load parameters for j particles */
385 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
386 charge
+jnrC
+0,charge
+jnrD
+0);
387 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
388 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
389 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
390 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
392 fjx0
= _mm_setzero_ps();
393 fjy0
= _mm_setzero_ps();
394 fjz0
= _mm_setzero_ps();
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 /* Compute parameters for interactions between i and j atoms */
401 qq00
= _mm_mul_ps(iq0
,jq0
);
402 gmx_mm_load_4pair_swizzle_ps(vdwparam
+vdwioffset0
+vdwjidx0A
,
403 vdwparam
+vdwioffset0
+vdwjidx0B
,
404 vdwparam
+vdwioffset0
+vdwjidx0C
,
405 vdwparam
+vdwioffset0
+vdwjidx0D
,
408 /* REACTION-FIELD ELECTROSTATICS */
409 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_macc_ps(krf
,rsq00
,rinv00
),crf
));
410 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
412 /* LENNARD-JONES DISPERSION/REPULSION */
414 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
415 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
416 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
417 vvdw
= _mm_msub_ps(vvdw12
,one_twelfth
,_mm_mul_ps(vvdw6
,one_sixth
));
418 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velec
= _mm_andnot_ps(dummy_mask
,velec
);
422 velecsum
= _mm_add_ps(velecsum
,velec
);
423 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
424 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
426 fscal
= _mm_add_ps(felec
,fvdw
);
428 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
430 /* Update vectorial force */
431 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
432 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
433 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
435 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
436 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
437 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
439 /**************************
440 * CALCULATE INTERACTIONS *
441 **************************/
443 /* Compute parameters for interactions between i and j atoms */
444 qq10
= _mm_mul_ps(iq1
,jq0
);
446 /* REACTION-FIELD ELECTROSTATICS */
447 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_macc_ps(krf
,rsq10
,rinv10
),crf
));
448 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec
= _mm_andnot_ps(dummy_mask
,velec
);
452 velecsum
= _mm_add_ps(velecsum
,velec
);
456 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
458 /* Update vectorial force */
459 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
460 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
461 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
463 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
464 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
465 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
467 /**************************
468 * CALCULATE INTERACTIONS *
469 **************************/
471 /* Compute parameters for interactions between i and j atoms */
472 qq20
= _mm_mul_ps(iq2
,jq0
);
474 /* REACTION-FIELD ELECTROSTATICS */
475 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_macc_ps(krf
,rsq20
,rinv20
),crf
));
476 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velec
= _mm_andnot_ps(dummy_mask
,velec
);
480 velecsum
= _mm_add_ps(velecsum
,velec
);
484 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
486 /* Update vectorial force */
487 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
488 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
489 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
491 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
492 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
493 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
495 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
496 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
497 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
498 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
500 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
502 /* Inner loop uses 117 flops */
505 /* End of innermost loop */
507 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
508 f
+i_coord_offset
,fshift
+i_shift_offset
);
511 /* Update potential energies */
512 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
513 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
515 /* Increment number of inner iterations */
516 inneriter
+= j_index_end
- j_index_start
;
518 /* Outer loop uses 20 flops */
521 /* Increment number of outer iterations */
524 /* Update outer/inner flops */
526 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*117);
529 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
530 * Electrostatics interaction: ReactionField
531 * VdW interaction: LennardJones
532 * Geometry: Water3-Particle
533 * Calculate force/pot: Force
536 nb_kernel_ElecRF_VdwLJ_GeomW3P1_F_avx_128_fma_single
537 (t_nblist
* gmx_restrict nlist
,
538 rvec
* gmx_restrict xx
,
539 rvec
* gmx_restrict ff
,
540 struct t_forcerec
* gmx_restrict fr
,
541 t_mdatoms
* gmx_restrict mdatoms
,
542 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
543 t_nrnb
* gmx_restrict nrnb
)
545 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
546 * just 0 for non-waters.
547 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
548 * jnr indices corresponding to data put in the four positions in the SIMD register.
550 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
551 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
552 int jnrA
,jnrB
,jnrC
,jnrD
;
553 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
554 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
555 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
557 real
*shiftvec
,*fshift
,*x
,*f
;
558 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
560 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
562 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
564 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
566 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
567 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
568 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
569 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
570 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
571 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
572 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
575 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
578 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
579 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
580 __m128 dummy_mask
,cutoff_mask
;
581 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
582 __m128 one
= _mm_set1_ps(1.0);
583 __m128 two
= _mm_set1_ps(2.0);
589 jindex
= nlist
->jindex
;
591 shiftidx
= nlist
->shift
;
593 shiftvec
= fr
->shift_vec
[0];
594 fshift
= fr
->fshift
[0];
595 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
596 charge
= mdatoms
->chargeA
;
597 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
598 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
599 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
600 nvdwtype
= fr
->ntype
;
602 vdwtype
= mdatoms
->typeA
;
604 /* Setup water-specific parameters */
605 inr
= nlist
->iinr
[0];
606 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
607 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
608 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
609 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
611 /* Avoid stupid compiler warnings */
612 jnrA
= jnrB
= jnrC
= jnrD
= 0;
621 for(iidx
=0;iidx
<4*DIM
;iidx
++)
626 /* Start outer loop over neighborlists */
627 for(iidx
=0; iidx
<nri
; iidx
++)
629 /* Load shift vector for this list */
630 i_shift_offset
= DIM
*shiftidx
[iidx
];
632 /* Load limits for loop over neighbors */
633 j_index_start
= jindex
[iidx
];
634 j_index_end
= jindex
[iidx
+1];
636 /* Get outer coordinate index */
638 i_coord_offset
= DIM
*inr
;
640 /* Load i particle coords and add shift vector */
641 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
642 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
644 fix0
= _mm_setzero_ps();
645 fiy0
= _mm_setzero_ps();
646 fiz0
= _mm_setzero_ps();
647 fix1
= _mm_setzero_ps();
648 fiy1
= _mm_setzero_ps();
649 fiz1
= _mm_setzero_ps();
650 fix2
= _mm_setzero_ps();
651 fiy2
= _mm_setzero_ps();
652 fiz2
= _mm_setzero_ps();
654 /* Start inner kernel loop */
655 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
658 /* Get j neighbor index, and coordinate index */
663 j_coord_offsetA
= DIM
*jnrA
;
664 j_coord_offsetB
= DIM
*jnrB
;
665 j_coord_offsetC
= DIM
*jnrC
;
666 j_coord_offsetD
= DIM
*jnrD
;
668 /* load j atom coordinates */
669 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
670 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
673 /* Calculate displacement vector */
674 dx00
= _mm_sub_ps(ix0
,jx0
);
675 dy00
= _mm_sub_ps(iy0
,jy0
);
676 dz00
= _mm_sub_ps(iz0
,jz0
);
677 dx10
= _mm_sub_ps(ix1
,jx0
);
678 dy10
= _mm_sub_ps(iy1
,jy0
);
679 dz10
= _mm_sub_ps(iz1
,jz0
);
680 dx20
= _mm_sub_ps(ix2
,jx0
);
681 dy20
= _mm_sub_ps(iy2
,jy0
);
682 dz20
= _mm_sub_ps(iz2
,jz0
);
684 /* Calculate squared distance and things based on it */
685 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
686 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
687 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
689 rinv00
= avx128fma_invsqrt_f(rsq00
);
690 rinv10
= avx128fma_invsqrt_f(rsq10
);
691 rinv20
= avx128fma_invsqrt_f(rsq20
);
693 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
694 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
695 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
697 /* Load parameters for j particles */
698 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
699 charge
+jnrC
+0,charge
+jnrD
+0);
700 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
701 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
702 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
703 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
705 fjx0
= _mm_setzero_ps();
706 fjy0
= _mm_setzero_ps();
707 fjz0
= _mm_setzero_ps();
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 /* Compute parameters for interactions between i and j atoms */
714 qq00
= _mm_mul_ps(iq0
,jq0
);
715 gmx_mm_load_4pair_swizzle_ps(vdwparam
+vdwioffset0
+vdwjidx0A
,
716 vdwparam
+vdwioffset0
+vdwjidx0B
,
717 vdwparam
+vdwioffset0
+vdwjidx0C
,
718 vdwparam
+vdwioffset0
+vdwjidx0D
,
721 /* REACTION-FIELD ELECTROSTATICS */
722 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
724 /* LENNARD-JONES DISPERSION/REPULSION */
726 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
727 fvdw
= _mm_mul_ps(_mm_msub_ps(c12_00
,rinvsix
,c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
729 fscal
= _mm_add_ps(felec
,fvdw
);
731 /* Update vectorial force */
732 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
733 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
734 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
736 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
737 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
738 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
740 /**************************
741 * CALCULATE INTERACTIONS *
742 **************************/
744 /* Compute parameters for interactions between i and j atoms */
745 qq10
= _mm_mul_ps(iq1
,jq0
);
747 /* REACTION-FIELD ELECTROSTATICS */
748 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
752 /* Update vectorial force */
753 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
754 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
755 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
757 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
758 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
759 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
761 /**************************
762 * CALCULATE INTERACTIONS *
763 **************************/
765 /* Compute parameters for interactions between i and j atoms */
766 qq20
= _mm_mul_ps(iq2
,jq0
);
768 /* REACTION-FIELD ELECTROSTATICS */
769 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
773 /* Update vectorial force */
774 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
775 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
776 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
778 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
779 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
780 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
782 fjptrA
= f
+j_coord_offsetA
;
783 fjptrB
= f
+j_coord_offsetB
;
784 fjptrC
= f
+j_coord_offsetC
;
785 fjptrD
= f
+j_coord_offsetD
;
787 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
789 /* Inner loop uses 97 flops */
795 /* Get j neighbor index, and coordinate index */
796 jnrlistA
= jjnr
[jidx
];
797 jnrlistB
= jjnr
[jidx
+1];
798 jnrlistC
= jjnr
[jidx
+2];
799 jnrlistD
= jjnr
[jidx
+3];
800 /* Sign of each element will be negative for non-real atoms.
801 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
802 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
804 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
805 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
806 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
807 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
808 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
809 j_coord_offsetA
= DIM
*jnrA
;
810 j_coord_offsetB
= DIM
*jnrB
;
811 j_coord_offsetC
= DIM
*jnrC
;
812 j_coord_offsetD
= DIM
*jnrD
;
814 /* load j atom coordinates */
815 gmx_mm_load_1rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
816 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
819 /* Calculate displacement vector */
820 dx00
= _mm_sub_ps(ix0
,jx0
);
821 dy00
= _mm_sub_ps(iy0
,jy0
);
822 dz00
= _mm_sub_ps(iz0
,jz0
);
823 dx10
= _mm_sub_ps(ix1
,jx0
);
824 dy10
= _mm_sub_ps(iy1
,jy0
);
825 dz10
= _mm_sub_ps(iz1
,jz0
);
826 dx20
= _mm_sub_ps(ix2
,jx0
);
827 dy20
= _mm_sub_ps(iy2
,jy0
);
828 dz20
= _mm_sub_ps(iz2
,jz0
);
830 /* Calculate squared distance and things based on it */
831 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
832 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
833 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
835 rinv00
= avx128fma_invsqrt_f(rsq00
);
836 rinv10
= avx128fma_invsqrt_f(rsq10
);
837 rinv20
= avx128fma_invsqrt_f(rsq20
);
839 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
840 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
841 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
843 /* Load parameters for j particles */
844 jq0
= gmx_mm_load_4real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
845 charge
+jnrC
+0,charge
+jnrD
+0);
846 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
847 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
848 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
849 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
851 fjx0
= _mm_setzero_ps();
852 fjy0
= _mm_setzero_ps();
853 fjz0
= _mm_setzero_ps();
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 /* Compute parameters for interactions between i and j atoms */
860 qq00
= _mm_mul_ps(iq0
,jq0
);
861 gmx_mm_load_4pair_swizzle_ps(vdwparam
+vdwioffset0
+vdwjidx0A
,
862 vdwparam
+vdwioffset0
+vdwjidx0B
,
863 vdwparam
+vdwioffset0
+vdwjidx0C
,
864 vdwparam
+vdwioffset0
+vdwjidx0D
,
867 /* REACTION-FIELD ELECTROSTATICS */
868 felec
= _mm_mul_ps(qq00
,_mm_msub_ps(rinv00
,rinvsq00
,krf2
));
870 /* LENNARD-JONES DISPERSION/REPULSION */
872 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
873 fvdw
= _mm_mul_ps(_mm_msub_ps(c12_00
,rinvsix
,c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
875 fscal
= _mm_add_ps(felec
,fvdw
);
877 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
879 /* Update vectorial force */
880 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
881 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
882 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
884 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
885 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
886 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 /* Compute parameters for interactions between i and j atoms */
893 qq10
= _mm_mul_ps(iq1
,jq0
);
895 /* REACTION-FIELD ELECTROSTATICS */
896 felec
= _mm_mul_ps(qq10
,_mm_msub_ps(rinv10
,rinvsq10
,krf2
));
900 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
902 /* Update vectorial force */
903 fix1
= _mm_macc_ps(dx10
,fscal
,fix1
);
904 fiy1
= _mm_macc_ps(dy10
,fscal
,fiy1
);
905 fiz1
= _mm_macc_ps(dz10
,fscal
,fiz1
);
907 fjx0
= _mm_macc_ps(dx10
,fscal
,fjx0
);
908 fjy0
= _mm_macc_ps(dy10
,fscal
,fjy0
);
909 fjz0
= _mm_macc_ps(dz10
,fscal
,fjz0
);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 /* Compute parameters for interactions between i and j atoms */
916 qq20
= _mm_mul_ps(iq2
,jq0
);
918 /* REACTION-FIELD ELECTROSTATICS */
919 felec
= _mm_mul_ps(qq20
,_mm_msub_ps(rinv20
,rinvsq20
,krf2
));
923 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
925 /* Update vectorial force */
926 fix2
= _mm_macc_ps(dx20
,fscal
,fix2
);
927 fiy2
= _mm_macc_ps(dy20
,fscal
,fiy2
);
928 fiz2
= _mm_macc_ps(dz20
,fscal
,fiz2
);
930 fjx0
= _mm_macc_ps(dx20
,fscal
,fjx0
);
931 fjy0
= _mm_macc_ps(dy20
,fscal
,fjy0
);
932 fjz0
= _mm_macc_ps(dz20
,fscal
,fjz0
);
934 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
935 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
936 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
937 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
939 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjx0
,fjy0
,fjz0
);
941 /* Inner loop uses 97 flops */
944 /* End of innermost loop */
946 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
947 f
+i_coord_offset
,fshift
+i_shift_offset
);
949 /* Increment number of inner iterations */
950 inneriter
+= j_index_end
- j_index_start
;
952 /* Outer loop uses 18 flops */
955 /* Increment number of outer iterations */
958 /* Update outer/inner flops */
960 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*97);