2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double
51 * Electrostatics interaction: None
52 * VdW interaction: LJEwald
53 * Geometry: Particle-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_VF_avx_256_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 real
* vdwioffsetptr0
;
84 real
* vdwgridioffsetptr0
;
85 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
86 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
87 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
88 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
90 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
93 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
94 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
97 __m256d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
98 __m256d one_half
= _mm256_set1_pd(0.5);
99 __m256d minus_one
= _mm256_set1_pd(-1.0);
100 __m256d dummy_mask
,cutoff_mask
;
101 __m128 tmpmask0
,tmpmask1
;
102 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
103 __m256d one
= _mm256_set1_pd(1.0);
104 __m256d two
= _mm256_set1_pd(2.0);
110 jindex
= nlist
->jindex
;
112 shiftidx
= nlist
->shift
;
114 shiftvec
= fr
->shift_vec
[0];
115 fshift
= fr
->fshift
[0];
116 nvdwtype
= fr
->ntype
;
118 vdwtype
= mdatoms
->typeA
;
119 vdwgridparam
= fr
->ljpme_c6grid
;
120 sh_lj_ewald
= _mm256_set1_pd(fr
->ic
->sh_lj_ewald
);
121 ewclj
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_lj
);
122 ewclj2
= _mm256_mul_pd(minus_one
,_mm256_mul_pd(ewclj
,ewclj
));
124 rcutoff_scalar
= fr
->ic
->rvdw
;
125 rcutoff
= _mm256_set1_pd(rcutoff_scalar
);
126 rcutoff2
= _mm256_mul_pd(rcutoff
,rcutoff
);
128 sh_vdw_invrcut6
= _mm256_set1_pd(fr
->ic
->sh_invrc6
);
129 rvdw
= _mm256_set1_pd(fr
->ic
->rvdw
);
131 /* Avoid stupid compiler warnings */
132 jnrA
= jnrB
= jnrC
= jnrD
= 0;
141 for(iidx
=0;iidx
<4*DIM
;iidx
++)
146 /* Start outer loop over neighborlists */
147 for(iidx
=0; iidx
<nri
; iidx
++)
149 /* Load shift vector for this list */
150 i_shift_offset
= DIM
*shiftidx
[iidx
];
152 /* Load limits for loop over neighbors */
153 j_index_start
= jindex
[iidx
];
154 j_index_end
= jindex
[iidx
+1];
156 /* Get outer coordinate index */
158 i_coord_offset
= DIM
*inr
;
160 /* Load i particle coords and add shift vector */
161 gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
163 fix0
= _mm256_setzero_pd();
164 fiy0
= _mm256_setzero_pd();
165 fiz0
= _mm256_setzero_pd();
167 /* Load parameters for i particles */
168 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
169 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
171 /* Reset potential sums */
172 vvdwsum
= _mm256_setzero_pd();
174 /* Start inner kernel loop */
175 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
178 /* Get j neighbor index, and coordinate index */
183 j_coord_offsetA
= DIM
*jnrA
;
184 j_coord_offsetB
= DIM
*jnrB
;
185 j_coord_offsetC
= DIM
*jnrC
;
186 j_coord_offsetD
= DIM
*jnrD
;
188 /* load j atom coordinates */
189 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
190 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
193 /* Calculate displacement vector */
194 dx00
= _mm256_sub_pd(ix0
,jx0
);
195 dy00
= _mm256_sub_pd(iy0
,jy0
);
196 dz00
= _mm256_sub_pd(iz0
,jz0
);
198 /* Calculate squared distance and things based on it */
199 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
201 rinv00
= avx256_invsqrt_d(rsq00
);
203 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
205 /* Load parameters for j particles */
206 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
207 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
208 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
209 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
211 /**************************
212 * CALCULATE INTERACTIONS *
213 **************************/
215 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
218 r00
= _mm256_mul_pd(rsq00
,rinv00
);
220 /* Compute parameters for interactions between i and j atoms */
221 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0
+vdwjidx0A
,
222 vdwioffsetptr0
+vdwjidx0B
,
223 vdwioffsetptr0
+vdwjidx0C
,
224 vdwioffsetptr0
+vdwjidx0D
,
227 c6grid_00
= gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0
+vdwjidx0A
,
228 vdwgridioffsetptr0
+vdwjidx0B
,
229 vdwgridioffsetptr0
+vdwjidx0C
,
230 vdwgridioffsetptr0
+vdwjidx0D
);
232 /* Analytical LJ-PME */
233 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
234 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
235 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
236 exponent
= avx256_exp_d(ewcljrsq
);
237 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
238 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
239 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
240 vvdw6
= _mm256_mul_pd(_mm256_sub_pd(c6_00
,_mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
))),rinvsix
);
241 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
242 vvdw
= _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12
, _mm256_mul_pd(c12_00
,_mm256_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
243 _mm256_mul_pd( _mm256_sub_pd(vvdw6
,_mm256_add_pd(_mm256_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm256_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
244 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
245 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,_mm256_sub_pd(vvdw6
,_mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
247 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
249 /* Update potential sum for this i atom from the interaction with this j atom. */
250 vvdw
= _mm256_and_pd(vvdw
,cutoff_mask
);
251 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
255 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
257 /* Calculate temporary vectorial force */
258 tx
= _mm256_mul_pd(fscal
,dx00
);
259 ty
= _mm256_mul_pd(fscal
,dy00
);
260 tz
= _mm256_mul_pd(fscal
,dz00
);
262 /* Update vectorial force */
263 fix0
= _mm256_add_pd(fix0
,tx
);
264 fiy0
= _mm256_add_pd(fiy0
,ty
);
265 fiz0
= _mm256_add_pd(fiz0
,tz
);
267 fjptrA
= f
+j_coord_offsetA
;
268 fjptrB
= f
+j_coord_offsetB
;
269 fjptrC
= f
+j_coord_offsetC
;
270 fjptrD
= f
+j_coord_offsetD
;
271 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,tx
,ty
,tz
);
275 /* Inner loop uses 62 flops */
281 /* Get j neighbor index, and coordinate index */
282 jnrlistA
= jjnr
[jidx
];
283 jnrlistB
= jjnr
[jidx
+1];
284 jnrlistC
= jjnr
[jidx
+2];
285 jnrlistD
= jjnr
[jidx
+3];
286 /* Sign of each element will be negative for non-real atoms.
287 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
288 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
290 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
292 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
293 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
294 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
296 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
297 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
298 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
299 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
300 j_coord_offsetA
= DIM
*jnrA
;
301 j_coord_offsetB
= DIM
*jnrB
;
302 j_coord_offsetC
= DIM
*jnrC
;
303 j_coord_offsetD
= DIM
*jnrD
;
305 /* load j atom coordinates */
306 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
307 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
310 /* Calculate displacement vector */
311 dx00
= _mm256_sub_pd(ix0
,jx0
);
312 dy00
= _mm256_sub_pd(iy0
,jy0
);
313 dz00
= _mm256_sub_pd(iz0
,jz0
);
315 /* Calculate squared distance and things based on it */
316 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
318 rinv00
= avx256_invsqrt_d(rsq00
);
320 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
322 /* Load parameters for j particles */
323 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
324 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
325 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
326 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
328 /**************************
329 * CALCULATE INTERACTIONS *
330 **************************/
332 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
335 r00
= _mm256_mul_pd(rsq00
,rinv00
);
336 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
338 /* Compute parameters for interactions between i and j atoms */
339 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0
+vdwjidx0A
,
340 vdwioffsetptr0
+vdwjidx0B
,
341 vdwioffsetptr0
+vdwjidx0C
,
342 vdwioffsetptr0
+vdwjidx0D
,
345 c6grid_00
= gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0
+vdwjidx0A
,
346 vdwgridioffsetptr0
+vdwjidx0B
,
347 vdwgridioffsetptr0
+vdwjidx0C
,
348 vdwgridioffsetptr0
+vdwjidx0D
);
350 /* Analytical LJ-PME */
351 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
352 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
353 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
354 exponent
= avx256_exp_d(ewcljrsq
);
355 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
356 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
357 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
358 vvdw6
= _mm256_mul_pd(_mm256_sub_pd(c6_00
,_mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
))),rinvsix
);
359 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
360 vvdw
= _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12
, _mm256_mul_pd(c12_00
,_mm256_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
361 _mm256_mul_pd( _mm256_sub_pd(vvdw6
,_mm256_add_pd(_mm256_mul_pd(c6_00
,sh_vdw_invrcut6
),_mm256_mul_pd(c6grid_00
,sh_lj_ewald
))),one_sixth
));
362 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
363 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,_mm256_sub_pd(vvdw6
,_mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
365 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 vvdw
= _mm256_and_pd(vvdw
,cutoff_mask
);
369 vvdw
= _mm256_andnot_pd(dummy_mask
,vvdw
);
370 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
374 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
376 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
378 /* Calculate temporary vectorial force */
379 tx
= _mm256_mul_pd(fscal
,dx00
);
380 ty
= _mm256_mul_pd(fscal
,dy00
);
381 tz
= _mm256_mul_pd(fscal
,dz00
);
383 /* Update vectorial force */
384 fix0
= _mm256_add_pd(fix0
,tx
);
385 fiy0
= _mm256_add_pd(fiy0
,ty
);
386 fiz0
= _mm256_add_pd(fiz0
,tz
);
388 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
389 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
390 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
391 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
392 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,tx
,ty
,tz
);
396 /* Inner loop uses 63 flops */
399 /* End of innermost loop */
401 gmx_mm256_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
402 f
+i_coord_offset
,fshift
+i_shift_offset
);
405 /* Update potential energies */
406 gmx_mm256_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
408 /* Increment number of inner iterations */
409 inneriter
+= j_index_end
- j_index_start
;
411 /* Outer loop uses 7 flops */
414 /* Increment number of outer iterations */
417 /* Update outer/inner flops */
419 inc_nrnb(nrnb
,eNR_NBKERNEL_VDW_VF
,outeriter
*7 + inneriter
*63);
422 * Gromacs nonbonded kernel: nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double
423 * Electrostatics interaction: None
424 * VdW interaction: LJEwald
425 * Geometry: Particle-Particle
426 * Calculate force/pot: Force
429 nb_kernel_ElecNone_VdwLJEwSh_GeomP1P1_F_avx_256_double
430 (t_nblist
* gmx_restrict nlist
,
431 rvec
* gmx_restrict xx
,
432 rvec
* gmx_restrict ff
,
433 struct t_forcerec
* gmx_restrict fr
,
434 t_mdatoms
* gmx_restrict mdatoms
,
435 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
436 t_nrnb
* gmx_restrict nrnb
)
438 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
439 * just 0 for non-waters.
440 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
441 * jnr indices corresponding to data put in the four positions in the SIMD register.
443 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
444 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
445 int jnrA
,jnrB
,jnrC
,jnrD
;
446 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
447 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
448 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
449 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
451 real
*shiftvec
,*fshift
,*x
,*f
;
452 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
454 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
455 real
* vdwioffsetptr0
;
456 real
* vdwgridioffsetptr0
;
457 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
458 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
459 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
460 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
462 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
465 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
466 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
469 __m256d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
470 __m256d one_half
= _mm256_set1_pd(0.5);
471 __m256d minus_one
= _mm256_set1_pd(-1.0);
472 __m256d dummy_mask
,cutoff_mask
;
473 __m128 tmpmask0
,tmpmask1
;
474 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
475 __m256d one
= _mm256_set1_pd(1.0);
476 __m256d two
= _mm256_set1_pd(2.0);
482 jindex
= nlist
->jindex
;
484 shiftidx
= nlist
->shift
;
486 shiftvec
= fr
->shift_vec
[0];
487 fshift
= fr
->fshift
[0];
488 nvdwtype
= fr
->ntype
;
490 vdwtype
= mdatoms
->typeA
;
491 vdwgridparam
= fr
->ljpme_c6grid
;
492 sh_lj_ewald
= _mm256_set1_pd(fr
->ic
->sh_lj_ewald
);
493 ewclj
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_lj
);
494 ewclj2
= _mm256_mul_pd(minus_one
,_mm256_mul_pd(ewclj
,ewclj
));
496 rcutoff_scalar
= fr
->ic
->rvdw
;
497 rcutoff
= _mm256_set1_pd(rcutoff_scalar
);
498 rcutoff2
= _mm256_mul_pd(rcutoff
,rcutoff
);
500 sh_vdw_invrcut6
= _mm256_set1_pd(fr
->ic
->sh_invrc6
);
501 rvdw
= _mm256_set1_pd(fr
->ic
->rvdw
);
503 /* Avoid stupid compiler warnings */
504 jnrA
= jnrB
= jnrC
= jnrD
= 0;
513 for(iidx
=0;iidx
<4*DIM
;iidx
++)
518 /* Start outer loop over neighborlists */
519 for(iidx
=0; iidx
<nri
; iidx
++)
521 /* Load shift vector for this list */
522 i_shift_offset
= DIM
*shiftidx
[iidx
];
524 /* Load limits for loop over neighbors */
525 j_index_start
= jindex
[iidx
];
526 j_index_end
= jindex
[iidx
+1];
528 /* Get outer coordinate index */
530 i_coord_offset
= DIM
*inr
;
532 /* Load i particle coords and add shift vector */
533 gmx_mm256_load_shift_and_1rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,&ix0
,&iy0
,&iz0
);
535 fix0
= _mm256_setzero_pd();
536 fiy0
= _mm256_setzero_pd();
537 fiz0
= _mm256_setzero_pd();
539 /* Load parameters for i particles */
540 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
541 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
543 /* Start inner kernel loop */
544 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
547 /* Get j neighbor index, and coordinate index */
552 j_coord_offsetA
= DIM
*jnrA
;
553 j_coord_offsetB
= DIM
*jnrB
;
554 j_coord_offsetC
= DIM
*jnrC
;
555 j_coord_offsetD
= DIM
*jnrD
;
557 /* load j atom coordinates */
558 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
559 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
562 /* Calculate displacement vector */
563 dx00
= _mm256_sub_pd(ix0
,jx0
);
564 dy00
= _mm256_sub_pd(iy0
,jy0
);
565 dz00
= _mm256_sub_pd(iz0
,jz0
);
567 /* Calculate squared distance and things based on it */
568 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
570 rinv00
= avx256_invsqrt_d(rsq00
);
572 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
574 /* Load parameters for j particles */
575 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
576 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
577 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
578 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
580 /**************************
581 * CALCULATE INTERACTIONS *
582 **************************/
584 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
587 r00
= _mm256_mul_pd(rsq00
,rinv00
);
589 /* Compute parameters for interactions between i and j atoms */
590 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0
+vdwjidx0A
,
591 vdwioffsetptr0
+vdwjidx0B
,
592 vdwioffsetptr0
+vdwjidx0C
,
593 vdwioffsetptr0
+vdwjidx0D
,
596 c6grid_00
= gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0
+vdwjidx0A
,
597 vdwgridioffsetptr0
+vdwjidx0B
,
598 vdwgridioffsetptr0
+vdwjidx0C
,
599 vdwgridioffsetptr0
+vdwjidx0D
);
601 /* Analytical LJ-PME */
602 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
603 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
604 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
605 exponent
= avx256_exp_d(ewcljrsq
);
606 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
607 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
608 /* f6A = 6 * C6grid * (1 - poly) */
609 f6A
= _mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
));
610 /* f6B = C6grid * exponent * beta^6 */
611 f6B
= _mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
));
612 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
613 fvdw
= _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),_mm256_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
615 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
619 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
621 /* Calculate temporary vectorial force */
622 tx
= _mm256_mul_pd(fscal
,dx00
);
623 ty
= _mm256_mul_pd(fscal
,dy00
);
624 tz
= _mm256_mul_pd(fscal
,dz00
);
626 /* Update vectorial force */
627 fix0
= _mm256_add_pd(fix0
,tx
);
628 fiy0
= _mm256_add_pd(fiy0
,ty
);
629 fiz0
= _mm256_add_pd(fiz0
,tz
);
631 fjptrA
= f
+j_coord_offsetA
;
632 fjptrB
= f
+j_coord_offsetB
;
633 fjptrC
= f
+j_coord_offsetC
;
634 fjptrD
= f
+j_coord_offsetD
;
635 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,tx
,ty
,tz
);
639 /* Inner loop uses 49 flops */
645 /* Get j neighbor index, and coordinate index */
646 jnrlistA
= jjnr
[jidx
];
647 jnrlistB
= jjnr
[jidx
+1];
648 jnrlistC
= jjnr
[jidx
+2];
649 jnrlistD
= jjnr
[jidx
+3];
650 /* Sign of each element will be negative for non-real atoms.
651 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
652 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
654 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
656 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
657 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
658 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
660 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
661 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
662 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
663 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
664 j_coord_offsetA
= DIM
*jnrA
;
665 j_coord_offsetB
= DIM
*jnrB
;
666 j_coord_offsetC
= DIM
*jnrC
;
667 j_coord_offsetD
= DIM
*jnrD
;
669 /* load j atom coordinates */
670 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
671 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
674 /* Calculate displacement vector */
675 dx00
= _mm256_sub_pd(ix0
,jx0
);
676 dy00
= _mm256_sub_pd(iy0
,jy0
);
677 dz00
= _mm256_sub_pd(iz0
,jz0
);
679 /* Calculate squared distance and things based on it */
680 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
682 rinv00
= avx256_invsqrt_d(rsq00
);
684 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
686 /* Load parameters for j particles */
687 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
688 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
689 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
690 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
692 /**************************
693 * CALCULATE INTERACTIONS *
694 **************************/
696 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
699 r00
= _mm256_mul_pd(rsq00
,rinv00
);
700 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
702 /* Compute parameters for interactions between i and j atoms */
703 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0
+vdwjidx0A
,
704 vdwioffsetptr0
+vdwjidx0B
,
705 vdwioffsetptr0
+vdwjidx0C
,
706 vdwioffsetptr0
+vdwjidx0D
,
709 c6grid_00
= gmx_mm256_load_4real_swizzle_pd(vdwgridioffsetptr0
+vdwjidx0A
,
710 vdwgridioffsetptr0
+vdwjidx0B
,
711 vdwgridioffsetptr0
+vdwjidx0C
,
712 vdwgridioffsetptr0
+vdwjidx0D
);
714 /* Analytical LJ-PME */
715 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
716 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
717 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
718 exponent
= avx256_exp_d(ewcljrsq
);
719 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
720 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
721 /* f6A = 6 * C6grid * (1 - poly) */
722 f6A
= _mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
));
723 /* f6B = C6grid * exponent * beta^6 */
724 f6B
= _mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
));
725 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
726 fvdw
= _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),_mm256_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
728 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
732 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
734 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
736 /* Calculate temporary vectorial force */
737 tx
= _mm256_mul_pd(fscal
,dx00
);
738 ty
= _mm256_mul_pd(fscal
,dy00
);
739 tz
= _mm256_mul_pd(fscal
,dz00
);
741 /* Update vectorial force */
742 fix0
= _mm256_add_pd(fix0
,tx
);
743 fiy0
= _mm256_add_pd(fiy0
,ty
);
744 fiz0
= _mm256_add_pd(fiz0
,tz
);
746 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
747 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
748 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
749 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
750 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,tx
,ty
,tz
);
754 /* Inner loop uses 50 flops */
757 /* End of innermost loop */
759 gmx_mm256_update_iforce_1atom_swizzle_pd(fix0
,fiy0
,fiz0
,
760 f
+i_coord_offset
,fshift
+i_shift_offset
);
762 /* Increment number of inner iterations */
763 inneriter
+= j_index_end
- j_index_start
;
765 /* Outer loop uses 6 flops */
768 /* Increment number of outer iterations */
771 /* Update outer/inner flops */
773 inc_nrnb(nrnb
,eNR_NBKERNEL_VDW_F
,outeriter
*6 + inneriter
*50);