2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_double
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: None
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwNone_GeomW4P1_VF_avx_128_fma_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the two positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
82 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
84 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
85 int vdwjidx0A
,vdwjidx0B
;
86 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
87 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
88 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
89 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
90 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
92 __m128d dummy_mask
,cutoff_mask
;
93 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
94 __m128d one
= _mm_set1_pd(1.0);
95 __m128d two
= _mm_set1_pd(2.0);
101 jindex
= nlist
->jindex
;
103 shiftidx
= nlist
->shift
;
105 shiftvec
= fr
->shift_vec
[0];
106 fshift
= fr
->fshift
[0];
107 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
108 charge
= mdatoms
->chargeA
;
109 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
110 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
111 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
113 /* Setup water-specific parameters */
114 inr
= nlist
->iinr
[0];
115 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
116 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
117 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
119 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
120 rcutoff_scalar
= fr
->ic
->rcoulomb
;
121 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
122 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
124 /* Avoid stupid compiler warnings */
132 /* Start outer loop over neighborlists */
133 for(iidx
=0; iidx
<nri
; iidx
++)
135 /* Load shift vector for this list */
136 i_shift_offset
= DIM
*shiftidx
[iidx
];
138 /* Load limits for loop over neighbors */
139 j_index_start
= jindex
[iidx
];
140 j_index_end
= jindex
[iidx
+1];
142 /* Get outer coordinate index */
144 i_coord_offset
= DIM
*inr
;
146 /* Load i particle coords and add shift vector */
147 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
148 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
150 fix1
= _mm_setzero_pd();
151 fiy1
= _mm_setzero_pd();
152 fiz1
= _mm_setzero_pd();
153 fix2
= _mm_setzero_pd();
154 fiy2
= _mm_setzero_pd();
155 fiz2
= _mm_setzero_pd();
156 fix3
= _mm_setzero_pd();
157 fiy3
= _mm_setzero_pd();
158 fiz3
= _mm_setzero_pd();
160 /* Reset potential sums */
161 velecsum
= _mm_setzero_pd();
163 /* Start inner kernel loop */
164 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
167 /* Get j neighbor index, and coordinate index */
170 j_coord_offsetA
= DIM
*jnrA
;
171 j_coord_offsetB
= DIM
*jnrB
;
173 /* load j atom coordinates */
174 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
177 /* Calculate displacement vector */
178 dx10
= _mm_sub_pd(ix1
,jx0
);
179 dy10
= _mm_sub_pd(iy1
,jy0
);
180 dz10
= _mm_sub_pd(iz1
,jz0
);
181 dx20
= _mm_sub_pd(ix2
,jx0
);
182 dy20
= _mm_sub_pd(iy2
,jy0
);
183 dz20
= _mm_sub_pd(iz2
,jz0
);
184 dx30
= _mm_sub_pd(ix3
,jx0
);
185 dy30
= _mm_sub_pd(iy3
,jy0
);
186 dz30
= _mm_sub_pd(iz3
,jz0
);
188 /* Calculate squared distance and things based on it */
189 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
190 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
191 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
193 rinv10
= avx128fma_invsqrt_d(rsq10
);
194 rinv20
= avx128fma_invsqrt_d(rsq20
);
195 rinv30
= avx128fma_invsqrt_d(rsq30
);
197 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
198 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
199 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
201 /* Load parameters for j particles */
202 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
204 fjx0
= _mm_setzero_pd();
205 fjy0
= _mm_setzero_pd();
206 fjz0
= _mm_setzero_pd();
208 /**************************
209 * CALCULATE INTERACTIONS *
210 **************************/
212 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
215 /* Compute parameters for interactions between i and j atoms */
216 qq10
= _mm_mul_pd(iq1
,jq0
);
218 /* REACTION-FIELD ELECTROSTATICS */
219 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_macc_pd(krf
,rsq10
,rinv10
),crf
));
220 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
222 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
224 /* Update potential sum for this i atom from the interaction with this j atom. */
225 velec
= _mm_and_pd(velec
,cutoff_mask
);
226 velecsum
= _mm_add_pd(velecsum
,velec
);
230 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
232 /* Update vectorial force */
233 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
234 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
235 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
237 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
238 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
239 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
243 /**************************
244 * CALCULATE INTERACTIONS *
245 **************************/
247 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
250 /* Compute parameters for interactions between i and j atoms */
251 qq20
= _mm_mul_pd(iq2
,jq0
);
253 /* REACTION-FIELD ELECTROSTATICS */
254 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_macc_pd(krf
,rsq20
,rinv20
),crf
));
255 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
257 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
259 /* Update potential sum for this i atom from the interaction with this j atom. */
260 velec
= _mm_and_pd(velec
,cutoff_mask
);
261 velecsum
= _mm_add_pd(velecsum
,velec
);
265 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
267 /* Update vectorial force */
268 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
269 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
270 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
272 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
273 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
274 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
278 /**************************
279 * CALCULATE INTERACTIONS *
280 **************************/
282 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
285 /* Compute parameters for interactions between i and j atoms */
286 qq30
= _mm_mul_pd(iq3
,jq0
);
288 /* REACTION-FIELD ELECTROSTATICS */
289 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(_mm_macc_pd(krf
,rsq30
,rinv30
),crf
));
290 felec
= _mm_mul_pd(qq30
,_mm_msub_pd(rinv30
,rinvsq30
,krf2
));
292 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
294 /* Update potential sum for this i atom from the interaction with this j atom. */
295 velec
= _mm_and_pd(velec
,cutoff_mask
);
296 velecsum
= _mm_add_pd(velecsum
,velec
);
300 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
302 /* Update vectorial force */
303 fix3
= _mm_macc_pd(dx30
,fscal
,fix3
);
304 fiy3
= _mm_macc_pd(dy30
,fscal
,fiy3
);
305 fiz3
= _mm_macc_pd(dz30
,fscal
,fiz3
);
307 fjx0
= _mm_macc_pd(dx30
,fscal
,fjx0
);
308 fjy0
= _mm_macc_pd(dy30
,fscal
,fjy0
);
309 fjz0
= _mm_macc_pd(dz30
,fscal
,fjz0
);
313 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
315 /* Inner loop uses 120 flops */
322 j_coord_offsetA
= DIM
*jnrA
;
324 /* load j atom coordinates */
325 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
328 /* Calculate displacement vector */
329 dx10
= _mm_sub_pd(ix1
,jx0
);
330 dy10
= _mm_sub_pd(iy1
,jy0
);
331 dz10
= _mm_sub_pd(iz1
,jz0
);
332 dx20
= _mm_sub_pd(ix2
,jx0
);
333 dy20
= _mm_sub_pd(iy2
,jy0
);
334 dz20
= _mm_sub_pd(iz2
,jz0
);
335 dx30
= _mm_sub_pd(ix3
,jx0
);
336 dy30
= _mm_sub_pd(iy3
,jy0
);
337 dz30
= _mm_sub_pd(iz3
,jz0
);
339 /* Calculate squared distance and things based on it */
340 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
341 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
342 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
344 rinv10
= avx128fma_invsqrt_d(rsq10
);
345 rinv20
= avx128fma_invsqrt_d(rsq20
);
346 rinv30
= avx128fma_invsqrt_d(rsq30
);
348 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
349 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
350 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
352 /* Load parameters for j particles */
353 jq0
= _mm_load_sd(charge
+jnrA
+0);
355 fjx0
= _mm_setzero_pd();
356 fjy0
= _mm_setzero_pd();
357 fjz0
= _mm_setzero_pd();
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
366 /* Compute parameters for interactions between i and j atoms */
367 qq10
= _mm_mul_pd(iq1
,jq0
);
369 /* REACTION-FIELD ELECTROSTATICS */
370 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_macc_pd(krf
,rsq10
,rinv10
),crf
));
371 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
373 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velec
= _mm_and_pd(velec
,cutoff_mask
);
377 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
378 velecsum
= _mm_add_pd(velecsum
,velec
);
382 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
384 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
386 /* Update vectorial force */
387 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
388 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
389 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
391 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
392 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
393 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
404 /* Compute parameters for interactions between i and j atoms */
405 qq20
= _mm_mul_pd(iq2
,jq0
);
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_macc_pd(krf
,rsq20
,rinv20
),crf
));
409 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
411 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec
= _mm_and_pd(velec
,cutoff_mask
);
415 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
416 velecsum
= _mm_add_pd(velecsum
,velec
);
420 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
422 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
424 /* Update vectorial force */
425 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
426 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
427 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
429 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
430 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
431 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
442 /* Compute parameters for interactions between i and j atoms */
443 qq30
= _mm_mul_pd(iq3
,jq0
);
445 /* REACTION-FIELD ELECTROSTATICS */
446 velec
= _mm_mul_pd(qq30
,_mm_sub_pd(_mm_macc_pd(krf
,rsq30
,rinv30
),crf
));
447 felec
= _mm_mul_pd(qq30
,_mm_msub_pd(rinv30
,rinvsq30
,krf2
));
449 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
451 /* Update potential sum for this i atom from the interaction with this j atom. */
452 velec
= _mm_and_pd(velec
,cutoff_mask
);
453 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
454 velecsum
= _mm_add_pd(velecsum
,velec
);
458 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
460 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
462 /* Update vectorial force */
463 fix3
= _mm_macc_pd(dx30
,fscal
,fix3
);
464 fiy3
= _mm_macc_pd(dy30
,fscal
,fiy3
);
465 fiz3
= _mm_macc_pd(dz30
,fscal
,fiz3
);
467 fjx0
= _mm_macc_pd(dx30
,fscal
,fjx0
);
468 fjy0
= _mm_macc_pd(dy30
,fscal
,fjy0
);
469 fjz0
= _mm_macc_pd(dz30
,fscal
,fjz0
);
473 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
475 /* Inner loop uses 120 flops */
478 /* End of innermost loop */
480 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
481 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
484 /* Update potential energies */
485 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
487 /* Increment number of inner iterations */
488 inneriter
+= j_index_end
- j_index_start
;
490 /* Outer loop uses 19 flops */
493 /* Increment number of outer iterations */
496 /* Update outer/inner flops */
498 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_VF
,outeriter
*19 + inneriter
*120);
501 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_double
502 * Electrostatics interaction: ReactionField
503 * VdW interaction: None
504 * Geometry: Water4-Particle
505 * Calculate force/pot: Force
508 nb_kernel_ElecRFCut_VdwNone_GeomW4P1_F_avx_128_fma_double
509 (t_nblist
* gmx_restrict nlist
,
510 rvec
* gmx_restrict xx
,
511 rvec
* gmx_restrict ff
,
512 struct t_forcerec
* gmx_restrict fr
,
513 t_mdatoms
* gmx_restrict mdatoms
,
514 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
515 t_nrnb
* gmx_restrict nrnb
)
517 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
518 * just 0 for non-waters.
519 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
520 * jnr indices corresponding to data put in the two positions in the SIMD register.
522 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
523 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
525 int j_coord_offsetA
,j_coord_offsetB
;
526 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
528 real
*shiftvec
,*fshift
,*x
,*f
;
529 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
531 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
533 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
535 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
536 int vdwjidx0A
,vdwjidx0B
;
537 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
538 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
539 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
540 __m128d dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
541 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
543 __m128d dummy_mask
,cutoff_mask
;
544 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
545 __m128d one
= _mm_set1_pd(1.0);
546 __m128d two
= _mm_set1_pd(2.0);
552 jindex
= nlist
->jindex
;
554 shiftidx
= nlist
->shift
;
556 shiftvec
= fr
->shift_vec
[0];
557 fshift
= fr
->fshift
[0];
558 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
559 charge
= mdatoms
->chargeA
;
560 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
561 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
562 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
564 /* Setup water-specific parameters */
565 inr
= nlist
->iinr
[0];
566 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
567 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
568 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
570 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
571 rcutoff_scalar
= fr
->ic
->rcoulomb
;
572 rcutoff
= _mm_set1_pd(rcutoff_scalar
);
573 rcutoff2
= _mm_mul_pd(rcutoff
,rcutoff
);
575 /* Avoid stupid compiler warnings */
583 /* Start outer loop over neighborlists */
584 for(iidx
=0; iidx
<nri
; iidx
++)
586 /* Load shift vector for this list */
587 i_shift_offset
= DIM
*shiftidx
[iidx
];
589 /* Load limits for loop over neighbors */
590 j_index_start
= jindex
[iidx
];
591 j_index_end
= jindex
[iidx
+1];
593 /* Get outer coordinate index */
595 i_coord_offset
= DIM
*inr
;
597 /* Load i particle coords and add shift vector */
598 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
+DIM
,
599 &ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
601 fix1
= _mm_setzero_pd();
602 fiy1
= _mm_setzero_pd();
603 fiz1
= _mm_setzero_pd();
604 fix2
= _mm_setzero_pd();
605 fiy2
= _mm_setzero_pd();
606 fiz2
= _mm_setzero_pd();
607 fix3
= _mm_setzero_pd();
608 fiy3
= _mm_setzero_pd();
609 fiz3
= _mm_setzero_pd();
611 /* Start inner kernel loop */
612 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
615 /* Get j neighbor index, and coordinate index */
618 j_coord_offsetA
= DIM
*jnrA
;
619 j_coord_offsetB
= DIM
*jnrB
;
621 /* load j atom coordinates */
622 gmx_mm_load_1rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
625 /* Calculate displacement vector */
626 dx10
= _mm_sub_pd(ix1
,jx0
);
627 dy10
= _mm_sub_pd(iy1
,jy0
);
628 dz10
= _mm_sub_pd(iz1
,jz0
);
629 dx20
= _mm_sub_pd(ix2
,jx0
);
630 dy20
= _mm_sub_pd(iy2
,jy0
);
631 dz20
= _mm_sub_pd(iz2
,jz0
);
632 dx30
= _mm_sub_pd(ix3
,jx0
);
633 dy30
= _mm_sub_pd(iy3
,jy0
);
634 dz30
= _mm_sub_pd(iz3
,jz0
);
636 /* Calculate squared distance and things based on it */
637 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
638 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
639 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
641 rinv10
= avx128fma_invsqrt_d(rsq10
);
642 rinv20
= avx128fma_invsqrt_d(rsq20
);
643 rinv30
= avx128fma_invsqrt_d(rsq30
);
645 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
646 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
647 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
649 /* Load parameters for j particles */
650 jq0
= gmx_mm_load_2real_swizzle_pd(charge
+jnrA
+0,charge
+jnrB
+0);
652 fjx0
= _mm_setzero_pd();
653 fjy0
= _mm_setzero_pd();
654 fjz0
= _mm_setzero_pd();
656 /**************************
657 * CALCULATE INTERACTIONS *
658 **************************/
660 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
663 /* Compute parameters for interactions between i and j atoms */
664 qq10
= _mm_mul_pd(iq1
,jq0
);
666 /* REACTION-FIELD ELECTROSTATICS */
667 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
669 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
673 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
675 /* Update vectorial force */
676 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
677 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
678 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
680 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
681 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
682 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
693 /* Compute parameters for interactions between i and j atoms */
694 qq20
= _mm_mul_pd(iq2
,jq0
);
696 /* REACTION-FIELD ELECTROSTATICS */
697 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
699 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
703 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
705 /* Update vectorial force */
706 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
707 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
708 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
710 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
711 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
712 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
716 /**************************
717 * CALCULATE INTERACTIONS *
718 **************************/
720 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
723 /* Compute parameters for interactions between i and j atoms */
724 qq30
= _mm_mul_pd(iq3
,jq0
);
726 /* REACTION-FIELD ELECTROSTATICS */
727 felec
= _mm_mul_pd(qq30
,_mm_msub_pd(rinv30
,rinvsq30
,krf2
));
729 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
733 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
735 /* Update vectorial force */
736 fix3
= _mm_macc_pd(dx30
,fscal
,fix3
);
737 fiy3
= _mm_macc_pd(dy30
,fscal
,fiy3
);
738 fiz3
= _mm_macc_pd(dz30
,fscal
,fiz3
);
740 fjx0
= _mm_macc_pd(dx30
,fscal
,fjx0
);
741 fjy0
= _mm_macc_pd(dy30
,fscal
,fjy0
);
742 fjz0
= _mm_macc_pd(dz30
,fscal
,fjz0
);
746 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
);
748 /* Inner loop uses 102 flops */
755 j_coord_offsetA
= DIM
*jnrA
;
757 /* load j atom coordinates */
758 gmx_mm_load_1rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
761 /* Calculate displacement vector */
762 dx10
= _mm_sub_pd(ix1
,jx0
);
763 dy10
= _mm_sub_pd(iy1
,jy0
);
764 dz10
= _mm_sub_pd(iz1
,jz0
);
765 dx20
= _mm_sub_pd(ix2
,jx0
);
766 dy20
= _mm_sub_pd(iy2
,jy0
);
767 dz20
= _mm_sub_pd(iz2
,jz0
);
768 dx30
= _mm_sub_pd(ix3
,jx0
);
769 dy30
= _mm_sub_pd(iy3
,jy0
);
770 dz30
= _mm_sub_pd(iz3
,jz0
);
772 /* Calculate squared distance and things based on it */
773 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
774 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
775 rsq30
= gmx_mm_calc_rsq_pd(dx30
,dy30
,dz30
);
777 rinv10
= avx128fma_invsqrt_d(rsq10
);
778 rinv20
= avx128fma_invsqrt_d(rsq20
);
779 rinv30
= avx128fma_invsqrt_d(rsq30
);
781 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
782 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
783 rinvsq30
= _mm_mul_pd(rinv30
,rinv30
);
785 /* Load parameters for j particles */
786 jq0
= _mm_load_sd(charge
+jnrA
+0);
788 fjx0
= _mm_setzero_pd();
789 fjy0
= _mm_setzero_pd();
790 fjz0
= _mm_setzero_pd();
792 /**************************
793 * CALCULATE INTERACTIONS *
794 **************************/
796 if (gmx_mm_any_lt(rsq10
,rcutoff2
))
799 /* Compute parameters for interactions between i and j atoms */
800 qq10
= _mm_mul_pd(iq1
,jq0
);
802 /* REACTION-FIELD ELECTROSTATICS */
803 felec
= _mm_mul_pd(qq10
,_mm_msub_pd(rinv10
,rinvsq10
,krf2
));
805 cutoff_mask
= _mm_cmplt_pd(rsq10
,rcutoff2
);
809 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
811 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
813 /* Update vectorial force */
814 fix1
= _mm_macc_pd(dx10
,fscal
,fix1
);
815 fiy1
= _mm_macc_pd(dy10
,fscal
,fiy1
);
816 fiz1
= _mm_macc_pd(dz10
,fscal
,fiz1
);
818 fjx0
= _mm_macc_pd(dx10
,fscal
,fjx0
);
819 fjy0
= _mm_macc_pd(dy10
,fscal
,fjy0
);
820 fjz0
= _mm_macc_pd(dz10
,fscal
,fjz0
);
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm_any_lt(rsq20
,rcutoff2
))
831 /* Compute parameters for interactions between i and j atoms */
832 qq20
= _mm_mul_pd(iq2
,jq0
);
834 /* REACTION-FIELD ELECTROSTATICS */
835 felec
= _mm_mul_pd(qq20
,_mm_msub_pd(rinv20
,rinvsq20
,krf2
));
837 cutoff_mask
= _mm_cmplt_pd(rsq20
,rcutoff2
);
841 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
843 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
845 /* Update vectorial force */
846 fix2
= _mm_macc_pd(dx20
,fscal
,fix2
);
847 fiy2
= _mm_macc_pd(dy20
,fscal
,fiy2
);
848 fiz2
= _mm_macc_pd(dz20
,fscal
,fiz2
);
850 fjx0
= _mm_macc_pd(dx20
,fscal
,fjx0
);
851 fjy0
= _mm_macc_pd(dy20
,fscal
,fjy0
);
852 fjz0
= _mm_macc_pd(dz20
,fscal
,fjz0
);
856 /**************************
857 * CALCULATE INTERACTIONS *
858 **************************/
860 if (gmx_mm_any_lt(rsq30
,rcutoff2
))
863 /* Compute parameters for interactions between i and j atoms */
864 qq30
= _mm_mul_pd(iq3
,jq0
);
866 /* REACTION-FIELD ELECTROSTATICS */
867 felec
= _mm_mul_pd(qq30
,_mm_msub_pd(rinv30
,rinvsq30
,krf2
));
869 cutoff_mask
= _mm_cmplt_pd(rsq30
,rcutoff2
);
873 fscal
= _mm_and_pd(fscal
,cutoff_mask
);
875 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
877 /* Update vectorial force */
878 fix3
= _mm_macc_pd(dx30
,fscal
,fix3
);
879 fiy3
= _mm_macc_pd(dy30
,fscal
,fiy3
);
880 fiz3
= _mm_macc_pd(dz30
,fscal
,fiz3
);
882 fjx0
= _mm_macc_pd(dx30
,fscal
,fjx0
);
883 fjy0
= _mm_macc_pd(dy30
,fscal
,fjy0
);
884 fjz0
= _mm_macc_pd(dz30
,fscal
,fjz0
);
888 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
);
890 /* Inner loop uses 102 flops */
893 /* End of innermost loop */
895 gmx_mm_update_iforce_3atom_swizzle_pd(fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
896 f
+i_coord_offset
+DIM
,fshift
+i_shift_offset
);
898 /* Increment number of inner iterations */
899 inneriter
+= j_index_end
- j_index_start
;
901 /* Outer loop uses 18 flops */
904 /* Increment number of outer iterations */
907 /* Update outer/inner flops */
909 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W4_F
,outeriter
*18 + inneriter
*102);