2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
// Kernel entry point: reaction-field electrostatics plus Lennard-Jones for a
// four-site water (i) against single particles (j), computing both potential
// and force, as stated in the descriptive header directly above.
// NOTE(review): this listing looks like a lossy extraction - the return type,
// the braces and a number of statements are not visible, and each statement
// is prefixed with what appears to be its line number in the original file.
// How the parameters are used in the visible body:
//   nlist       - neighbor list; its jindex, shift and iinr members are read.
//   xx, ff      - coordinate and force arrays; not referenced directly in the
//                 visible lines (presumably aliased by the locals x and f -
//                 confirm against the original file).
//   fr          - force record; shift_vec, fshift, ntype and ic (epsfac, k_rf,
//                 c_rf) are read.
//   mdatoms     - per-atom data; chargeA and typeA are read.
//   kernel_data - energygrp_elec and energygrp_vdw receive the summed energies
//                 (so it is used here despite the gmx_unused marker).
//   nrnb        - flop counters, updated through inc_nrnb() at the end.
57 nb_kernel_ElecRF_VdwLJ_GeomW4P1_VF_avx_256_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
// Local declarations of the kernel.
// Offsets into the shift and coordinate arrays for the current i entry, plus
// the iteration counters that feed the flop accounting at the end.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
// Neighbor-list range of the current i entry and assorted loop indices.
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
// j-atom indices, one per unrolled lane A-H.
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrE
,jnrF
,jnrG
,jnrH
;
// Raw list entries per lane; read from jjnr in the remainder section, where
// negative entries are replaced by 0 before being used as atom indices.
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
// DIM times the j-atom index, per lane; used as offset into x and f.
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
// Neighbor-list index arrays (jindex and shiftidx are taken from nlist below;
// the assignments of iinr, jjnr and gid are not visible in this listing).
79 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
// Shift vectors, shift forces, and the coordinate / force arrays addressed
// through the DIM-scaled offsets above.
81 real
*shiftvec
,*fshift
,*x
,*f
;
// Per-lane destination pointers for the j-atom force update.
82 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
// tx/ty/tz: force vector of the current interaction; fscal: its scalar factor.
// rcutoff, rcutoff2 and jidxall are not referenced in the visible lines.
84 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
// Per i site (0-3): position, force accumulator and charge registers, plus a
// VdW parameter pointer. In the visible code only vdwioffsetptr0 and the
// charges iq1-iq3 are assigned.
85 real
* vdwioffsetptr0
;
86 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 real
* vdwioffsetptr1
;
88 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 real
* vdwioffsetptr2
;
90 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
91 real
* vdwioffsetptr3
;
92 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
// Per-lane offset of the j type inside the VdW parameter row of i site 0.
93 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
// j particle: position, force accumulator and charge, eight lanes each.
94 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
// Per i-site / j-particle pair: displacement, squared distance, inverse
// distance terms, charge product and LJ parameters.
95 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
96 __m256 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
97 __m256 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
98 __m256 dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
// Electrostatics: potential, force scalar, running potential sum, prefactor
// and the reaction-field constants.
99 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
// Lennard-Jones temporaries and the running VdW potential sum.
102 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
// Constants replicated into every lane; one_sixth and one_twelfth scale the
// two terms of the LJ potential.
105 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
106 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
// dummy_mask marks padding lanes in the remainder section; cutoff_mask is not
// referenced in the visible lines.
107 __m256 dummy_mask
,cutoff_mask
;
// signbit, one and two are initialised here but not referenced in the visible
// lines of this kernel.
108 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
109 __m256 one
= _mm256_set1_ps(1.0);
110 __m256 two
= _mm256_set1_ps(2.0);
// Cache neighbor-list and force-record members in locals.
// NOTE(review): nri, iinr, jjnr, gid, x, f, vdwparam and scratch are used
// further down but their assignments are not visible in this listing, and no
// declaration is visible for charge, nvdwtype and vdwtype, which are assigned
// below - confirm against the original file.
116 jindex
= nlist
->jindex
;
118 shiftidx
= nlist
->shift
;
120 shiftvec
= fr
->shift_vec
[0];
121 fshift
= fr
->fshift
[0];
// Electrostatics prefactor (epsfac), replicated into every SIMD lane.
122 facel
= _mm256_set1_ps(fr
->ic
->epsfac
);
123 charge
= mdatoms
->chargeA
;
// Reaction-field constants k_rf, 2*k_rf and c_rf, replicated into every lane.
124 krf
= _mm256_set1_ps(fr
->ic
->k_rf
);
125 krf2
= _mm256_set1_ps(fr
->ic
->k_rf
*2.0);
126 crf
= _mm256_set1_ps(fr
->ic
->c_rf
);
// Number of VdW types and the per-atom type array, used to index vdwparam.
127 nvdwtype
= fr
->ntype
;
129 vdwtype
= mdatoms
->typeA
;
131 /* Setup water-specific parameters */
// inr comes from the first i entry only: the per-site parameters below are
// computed once, before the outer loop.
132 inr
= nlist
->iinr
[0];
// Charges of water sites 1-3, pre-multiplied by the electrostatics prefactor.
133 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
134 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
135 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
// Start of the VdW parameter row for the type of water site 0; the row stride
// is 2*nvdwtype reals.
136 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
138 /* Avoid stupid compiler warnings */
139 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
// Loop over 4*DIM entries; its body is not visible in this listing
// (presumably it zeroes the scratch buffer used by the remainder section -
// TODO confirm).
152 for(iidx
=0;iidx
<4*DIM
;iidx
++)
157 /* Start outer loop over neighborlists */
// One pass per i entry of the neighbor list (nri entries).
158 for(iidx
=0; iidx
<nri
; iidx
++)
160 /* Load shift vector for this list */
161 i_shift_offset
= DIM
*shiftidx
[iidx
];
163 /* Load limits for loop over neighbors */
// The j entries of this i entry are jjnr[j_index_start .. j_index_end-1].
164 j_index_start
= jindex
[iidx
];
165 j_index_end
= jindex
[iidx
+1];
167 /* Get outer coordinate index */
// NOTE(review): no per-iteration reload of inr is visible here; as shown, inr
// still holds nlist->iinr[0] - confirm against the original file.
169 i_coord_offset
= DIM
*inr
;
171 /* Load i particle coords and add shift vector */
// Fills the twelve position registers of the four water sites.
172 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
173 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
// Clear the force accumulators of the four i sites.
175 fix0
= _mm256_setzero_ps();
176 fiy0
= _mm256_setzero_ps();
177 fiz0
= _mm256_setzero_ps();
178 fix1
= _mm256_setzero_ps();
179 fiy1
= _mm256_setzero_ps();
180 fiz1
= _mm256_setzero_ps();
181 fix2
= _mm256_setzero_ps();
182 fiy2
= _mm256_setzero_ps();
183 fiz2
= _mm256_setzero_ps();
184 fix3
= _mm256_setzero_ps();
185 fiy3
= _mm256_setzero_ps();
186 fiz3
= _mm256_setzero_ps();
188 /* Reset potential sums */
189 velecsum
= _mm256_setzero_ps();
190 vvdwsum
= _mm256_setzero_ps();
192 /* Start inner kernel loop */
// Main inner loop: consumes eight j entries per pass and only continues while
// the eighth entry of the group (jjnr[jidx+7]) is non-negative, so no lane
// needs masking here.
193 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
196 /* Get j neighbor index, and coordinate index */
// NOTE(review): the loads of jnrA..jnrH from jjnr are not visible in this
// listing - confirm against the original file.
205 j_coord_offsetA
= DIM
*jnrA
;
206 j_coord_offsetB
= DIM
*jnrB
;
207 j_coord_offsetC
= DIM
*jnrC
;
208 j_coord_offsetD
= DIM
*jnrD
;
209 j_coord_offsetE
= DIM
*jnrE
;
210 j_coord_offsetF
= DIM
*jnrF
;
211 j_coord_offsetG
= DIM
*jnrG
;
212 j_coord_offsetH
= DIM
*jnrH
;
214 /* load j atom coordinates */
// NOTE(review): the call below ends on a trailing comma - its output
// arguments (presumably &jx0, &jy0, &jz0) are not visible in this listing.
215 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
216 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
217 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
218 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
221 /* Calculate displacement vector */
// i-site position minus j position, for each of the four water sites.
222 dx00
= _mm256_sub_ps(ix0
,jx0
);
223 dy00
= _mm256_sub_ps(iy0
,jy0
);
224 dz00
= _mm256_sub_ps(iz0
,jz0
);
225 dx10
= _mm256_sub_ps(ix1
,jx0
);
226 dy10
= _mm256_sub_ps(iy1
,jy0
);
227 dz10
= _mm256_sub_ps(iz1
,jz0
);
228 dx20
= _mm256_sub_ps(ix2
,jx0
);
229 dy20
= _mm256_sub_ps(iy2
,jy0
);
230 dz20
= _mm256_sub_ps(iz2
,jz0
);
231 dx30
= _mm256_sub_ps(ix3
,jx0
);
232 dy30
= _mm256_sub_ps(iy3
,jy0
);
233 dz30
= _mm256_sub_ps(iz3
,jz0
);
235 /* Calculate squared distance and things based on it */
236 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
237 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
238 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
239 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
// The inverse distance is computed for sites 1-3 only; it feeds the
// reaction-field terms further down.
241 rinv10
= avx256_invsqrt_f(rsq10
);
242 rinv20
= avx256_invsqrt_f(rsq20
);
243 rinv30
= avx256_invsqrt_f(rsq30
);
// Site 0 gets its inverse squared distance straight from rsq00 (it is used by
// the LJ term only); sites 1-3 square their inverse distance.
245 rinvsq00
= avx256_inv_f(rsq00
);
246 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
247 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
248 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
250 /* Load parameters for j particles */
// Gather the eight j charges into one register.
251 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
252 charge
+jnrC
+0,charge
+jnrD
+0,
253 charge
+jnrE
+0,charge
+jnrF
+0,
254 charge
+jnrG
+0,charge
+jnrH
+0);
// Offset of each j type inside the VdW parameter row of i site 0: two reals
// per type (presumably c6 and c12 - TODO confirm).
255 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
256 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
257 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
258 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
259 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
260 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
261 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
262 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
// Clear the j force accumulator for this group of eight.
264 fjx0
= _mm256_setzero_ps();
265 fjy0
= _mm256_setzero_ps();
266 fjz0
= _mm256_setzero_ps();
268 /**************************
269 * CALCULATE INTERACTIONS *
270 **************************/
// Water site 0 against the eight j particles: Lennard-Jones only.
272 /* Compute parameters for interactions between i and j atoms */
// NOTE(review): the call below ends on a trailing comma - its output
// arguments (presumably &c6_00, &c12_00) are not visible in this listing.
273 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
274 vdwioffsetptr0
+vdwjidx0B
,
275 vdwioffsetptr0
+vdwjidx0C
,
276 vdwioffsetptr0
+vdwjidx0D
,
277 vdwioffsetptr0
+vdwjidx0E
,
278 vdwioffsetptr0
+vdwjidx0F
,
279 vdwioffsetptr0
+vdwjidx0G
,
280 vdwioffsetptr0
+vdwjidx0H
,
283 /* LENNARD-JONES DISPERSION/REPULSION */
// rinvsix = rinvsq00^3, vvdw6 = c6*rinvsix, vvdw12 = c12*rinvsix^2.
// vvdw = vvdw12/12 - vvdw6/6 and fvdw = (vvdw12 - vvdw6)*rinvsq00, which
// suggests c6 and c12 are stored pre-multiplied by 6 and 12 - TODO confirm.
285 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
286 vvdw6
= _mm256_mul_ps(c6_00
,rinvsix
);
287 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
288 vvdw
= _mm256_sub_ps( _mm256_mul_ps(vvdw12
,one_twelfth
) , _mm256_mul_ps(vvdw6
,one_sixth
) );
289 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
291 /* Update potential sum for this i atom from the interaction with this j atom. */
292 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
// NOTE(review): fscal is consumed below but no assignment to it is visible in
// this section (presumably fscal = fvdw) - confirm against the original file.
296 /* Calculate temporary vectorial force */
297 tx
= _mm256_mul_ps(fscal
,dx00
);
298 ty
= _mm256_mul_ps(fscal
,dy00
);
299 tz
= _mm256_mul_ps(fscal
,dz00
);
301 /* Update vectorial force */
// The same vector is added to the i-site accumulator and to fjx0/fjy0/fjz0;
// the latter is passed to a "decrement" helper after the last interaction.
302 fix0
= _mm256_add_ps(fix0
,tx
);
303 fiy0
= _mm256_add_ps(fiy0
,ty
);
304 fiz0
= _mm256_add_ps(fiz0
,tz
);
306 fjx0
= _mm256_add_ps(fjx0
,tx
);
307 fjy0
= _mm256_add_ps(fjy0
,ty
);
308 fjz0
= _mm256_add_ps(fjz0
,tz
);
// Water sites 1, 2 and 3 against the eight j particles: reaction-field
// electrostatics only. The three sections below are identical apart from the
// site index.
// NOTE(review): in each section fscal is consumed but no assignment to it is
// visible (presumably fscal = felec) - confirm against the original file.
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 /* Compute parameters for interactions between i and j atoms */
// Charge product; iq1 already carries the electrostatics prefactor.
315 qq10
= _mm256_mul_ps(iq1
,jq0
);
317 /* REACTION-FIELD ELECTROSTATICS */
// velec = qq*(rinv + krf*rsq - crf); felec = qq*(rinv*rinvsq - krf2).
318 velec
= _mm256_mul_ps(qq10
,_mm256_sub_ps(_mm256_add_ps(rinv10
,_mm256_mul_ps(krf
,rsq10
)),crf
));
319 felec
= _mm256_mul_ps(qq10
,_mm256_sub_ps(_mm256_mul_ps(rinv10
,rinvsq10
),krf2
));
321 /* Update potential sum for this i atom from the interaction with this j atom. */
322 velecsum
= _mm256_add_ps(velecsum
,velec
);
326 /* Calculate temporary vectorial force */
327 tx
= _mm256_mul_ps(fscal
,dx10
);
328 ty
= _mm256_mul_ps(fscal
,dy10
);
329 tz
= _mm256_mul_ps(fscal
,dz10
);
331 /* Update vectorial force */
332 fix1
= _mm256_add_ps(fix1
,tx
);
333 fiy1
= _mm256_add_ps(fiy1
,ty
);
334 fiz1
= _mm256_add_ps(fiz1
,tz
);
336 fjx0
= _mm256_add_ps(fjx0
,tx
);
337 fjy0
= _mm256_add_ps(fjy0
,ty
);
338 fjz0
= _mm256_add_ps(fjz0
,tz
);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 /* Compute parameters for interactions between i and j atoms */
345 qq20
= _mm256_mul_ps(iq2
,jq0
);
347 /* REACTION-FIELD ELECTROSTATICS */
// Same expressions as for site 1, with the site-2 distances.
348 velec
= _mm256_mul_ps(qq20
,_mm256_sub_ps(_mm256_add_ps(rinv20
,_mm256_mul_ps(krf
,rsq20
)),crf
));
349 felec
= _mm256_mul_ps(qq20
,_mm256_sub_ps(_mm256_mul_ps(rinv20
,rinvsq20
),krf2
));
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum
= _mm256_add_ps(velecsum
,velec
);
356 /* Calculate temporary vectorial force */
357 tx
= _mm256_mul_ps(fscal
,dx20
);
358 ty
= _mm256_mul_ps(fscal
,dy20
);
359 tz
= _mm256_mul_ps(fscal
,dz20
);
361 /* Update vectorial force */
362 fix2
= _mm256_add_ps(fix2
,tx
);
363 fiy2
= _mm256_add_ps(fiy2
,ty
);
364 fiz2
= _mm256_add_ps(fiz2
,tz
);
366 fjx0
= _mm256_add_ps(fjx0
,tx
);
367 fjy0
= _mm256_add_ps(fjy0
,ty
);
368 fjz0
= _mm256_add_ps(fjz0
,tz
);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 /* Compute parameters for interactions between i and j atoms */
375 qq30
= _mm256_mul_ps(iq3
,jq0
);
377 /* REACTION-FIELD ELECTROSTATICS */
// Same expressions as for site 1, with the site-3 distances.
378 velec
= _mm256_mul_ps(qq30
,_mm256_sub_ps(_mm256_add_ps(rinv30
,_mm256_mul_ps(krf
,rsq30
)),crf
));
379 felec
= _mm256_mul_ps(qq30
,_mm256_sub_ps(_mm256_mul_ps(rinv30
,rinvsq30
),krf2
));
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velecsum
= _mm256_add_ps(velecsum
,velec
);
386 /* Calculate temporary vectorial force */
387 tx
= _mm256_mul_ps(fscal
,dx30
);
388 ty
= _mm256_mul_ps(fscal
,dy30
);
389 tz
= _mm256_mul_ps(fscal
,dz30
);
391 /* Update vectorial force */
392 fix3
= _mm256_add_ps(fix3
,tx
);
393 fiy3
= _mm256_add_ps(fiy3
,ty
);
394 fiz3
= _mm256_add_ps(fiz3
,tz
);
396 fjx0
= _mm256_add_ps(fjx0
,tx
);
397 fjy0
= _mm256_add_ps(fjy0
,ty
);
398 fjz0
= _mm256_add_ps(fjz0
,tz
);
// Per-lane destinations in the force array for the eight j atoms. Every lane
// is a real atom in this loop, so all pointers go into f.
400 fjptrA
= f
+j_coord_offsetA
;
401 fjptrB
= f
+j_coord_offsetB
;
402 fjptrC
= f
+j_coord_offsetC
;
403 fjptrD
= f
+j_coord_offsetD
;
404 fjptrE
= f
+j_coord_offsetE
;
405 fjptrF
= f
+j_coord_offsetF
;
406 fjptrG
= f
+j_coord_offsetG
;
407 fjptrH
= f
+j_coord_offsetH
;
// Hand the accumulated fjx0/fjy0/fjz0 to the helper for all eight lanes at
// once (a subtraction from the stored forces, going by the helper's name).
409 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
411 /* Inner loop uses 131 flops */
// Remainder handling: the group of eight list entries processed from here on
// may contain negative (padding) entries, so results are masked per lane.
// NOTE(review): the guard and the braces around this section are not visible
// in this listing - confirm against the original file.
417 /* Get j neighbor index, and coordinate index */
418 jnrlistA
= jjnr
[jidx
];
419 jnrlistB
= jjnr
[jidx
+1];
420 jnrlistC
= jjnr
[jidx
+2];
421 jnrlistD
= jjnr
[jidx
+3];
422 jnrlistE
= jjnr
[jidx
+4];
423 jnrlistF
= jjnr
[jidx
+5];
424 jnrlistG
= jjnr
[jidx
+6];
425 jnrlistH
= jjnr
[jidx
+7];
// NOTE(review): the comment below mentions _mm_andnot_ps; the code in this
// kernel applies the mask with the 256-bit _mm256_andnot_ps.
426 /* Sign of each element will be negative for non-real atoms.
427 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
428 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
// Lane-wise "entry < 0" test on the eight list entries, done as two 128-bit
// integer compares (entries 4-7 and entries 0-3) that are merged into one
// 256-bit mask.
430 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
431 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
// Replace padding entries by atom index 0 so that the gathers below use a
// non-negative index; their contributions are cleared through dummy_mask.
433 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
434 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
435 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
436 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
437 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
438 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
439 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
440 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
441 j_coord_offsetA
= DIM
*jnrA
;
442 j_coord_offsetB
= DIM
*jnrB
;
443 j_coord_offsetC
= DIM
*jnrC
;
444 j_coord_offsetD
= DIM
*jnrD
;
445 j_coord_offsetE
= DIM
*jnrE
;
446 j_coord_offsetF
= DIM
*jnrF
;
447 j_coord_offsetG
= DIM
*jnrG
;
448 j_coord_offsetH
= DIM
*jnrH
;
450 /* load j atom coordinates */
// Remainder section: same geometry and parameter setup as in the main inner
// loop, using the clamped j indices.
// NOTE(review): the call below ends on a trailing comma - its output
// arguments (presumably &jx0, &jy0, &jz0) are not visible in this listing.
451 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
452 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
453 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
454 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
457 /* Calculate displacement vector */
458 dx00
= _mm256_sub_ps(ix0
,jx0
);
459 dy00
= _mm256_sub_ps(iy0
,jy0
);
460 dz00
= _mm256_sub_ps(iz0
,jz0
);
461 dx10
= _mm256_sub_ps(ix1
,jx0
);
462 dy10
= _mm256_sub_ps(iy1
,jy0
);
463 dz10
= _mm256_sub_ps(iz1
,jz0
);
464 dx20
= _mm256_sub_ps(ix2
,jx0
);
465 dy20
= _mm256_sub_ps(iy2
,jy0
);
466 dz20
= _mm256_sub_ps(iz2
,jz0
);
467 dx30
= _mm256_sub_ps(ix3
,jx0
);
468 dy30
= _mm256_sub_ps(iy3
,jy0
);
469 dz30
= _mm256_sub_ps(iz3
,jz0
);
471 /* Calculate squared distance and things based on it */
472 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
473 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
474 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
475 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
// Inverse distance for sites 1-3 (reaction field) ...
477 rinv10
= avx256_invsqrt_f(rsq10
);
478 rinv20
= avx256_invsqrt_f(rsq20
);
479 rinv30
= avx256_invsqrt_f(rsq30
);
// ... and inverse squared distance for all four sites.
481 rinvsq00
= avx256_inv_f(rsq00
);
482 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
483 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
484 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
486 /* Load parameters for j particles */
487 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
488 charge
+jnrC
+0,charge
+jnrD
+0,
489 charge
+jnrE
+0,charge
+jnrF
+0,
490 charge
+jnrG
+0,charge
+jnrH
+0);
// Offset of each j type inside the VdW parameter row of i site 0.
491 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
492 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
493 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
494 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
495 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
496 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
497 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
498 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
// Clear the j force accumulator for this group.
500 fjx0
= _mm256_setzero_ps();
501 fjy0
= _mm256_setzero_ps();
502 fjz0
= _mm256_setzero_ps();
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
// Remainder section, water site 0: Lennard-Jones with padding lanes masked.
508 /* Compute parameters for interactions between i and j atoms */
// NOTE(review): the call below ends on a trailing comma - its output
// arguments (presumably &c6_00, &c12_00) are not visible in this listing.
509 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
510 vdwioffsetptr0
+vdwjidx0B
,
511 vdwioffsetptr0
+vdwjidx0C
,
512 vdwioffsetptr0
+vdwjidx0D
,
513 vdwioffsetptr0
+vdwjidx0E
,
514 vdwioffsetptr0
+vdwjidx0F
,
515 vdwioffsetptr0
+vdwjidx0G
,
516 vdwioffsetptr0
+vdwjidx0H
,
519 /* LENNARD-JONES DISPERSION/REPULSION */
// rinvsix = rinvsq00^3, vvdw6 = c6*rinvsix, vvdw12 = c12*rinvsix^2,
// vvdw = vvdw12/12 - vvdw6/6, fvdw = (vvdw12 - vvdw6)*rinvsq00.
521 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
522 vvdw6
= _mm256_mul_ps(c6_00
,rinvsix
);
523 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
524 vvdw
= _mm256_sub_ps( _mm256_mul_ps(vvdw12
,one_twelfth
) , _mm256_mul_ps(vvdw6
,one_sixth
) );
525 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
527 /* Update potential sum for this i atom from the interaction with this j atom. */
// Zero the potential in padding lanes before accumulating it.
528 vvdw
= _mm256_andnot_ps(dummy_mask
,vvdw
);
529 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
// Zero the force scalar in padding lanes.
// NOTE(review): no assignment to fscal is visible before this line
// (presumably fscal = fvdw) - confirm against the original file.
533 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
535 /* Calculate temporary vectorial force */
536 tx
= _mm256_mul_ps(fscal
,dx00
);
537 ty
= _mm256_mul_ps(fscal
,dy00
);
538 tz
= _mm256_mul_ps(fscal
,dz00
);
540 /* Update vectorial force */
541 fix0
= _mm256_add_ps(fix0
,tx
);
542 fiy0
= _mm256_add_ps(fiy0
,ty
);
543 fiz0
= _mm256_add_ps(fiz0
,tz
);
545 fjx0
= _mm256_add_ps(fjx0
,tx
);
546 fjy0
= _mm256_add_ps(fjy0
,ty
);
547 fjz0
= _mm256_add_ps(fjz0
,tz
);
// Remainder section, water sites 1, 2 and 3: reaction-field electrostatics
// with padding lanes masked. The three sections below are identical apart
// from the site index.
// NOTE(review): in each section fscal is masked and consumed but no
// assignment to it is visible (presumably fscal = felec) - confirm against
// the original file.
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
553 /* Compute parameters for interactions between i and j atoms */
554 qq10
= _mm256_mul_ps(iq1
,jq0
);
556 /* REACTION-FIELD ELECTROSTATICS */
// velec = qq*(rinv + krf*rsq - crf); felec = qq*(rinv*rinvsq - krf2).
557 velec
= _mm256_mul_ps(qq10
,_mm256_sub_ps(_mm256_add_ps(rinv10
,_mm256_mul_ps(krf
,rsq10
)),crf
));
558 felec
= _mm256_mul_ps(qq10
,_mm256_sub_ps(_mm256_mul_ps(rinv10
,rinvsq10
),krf2
));
560 /* Update potential sum for this i atom from the interaction with this j atom. */
// Zero the potential in padding lanes before accumulating it.
561 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
562 velecsum
= _mm256_add_ps(velecsum
,velec
);
// Zero the force scalar in padding lanes.
566 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
568 /* Calculate temporary vectorial force */
569 tx
= _mm256_mul_ps(fscal
,dx10
);
570 ty
= _mm256_mul_ps(fscal
,dy10
);
571 tz
= _mm256_mul_ps(fscal
,dz10
);
573 /* Update vectorial force */
574 fix1
= _mm256_add_ps(fix1
,tx
);
575 fiy1
= _mm256_add_ps(fiy1
,ty
);
576 fiz1
= _mm256_add_ps(fiz1
,tz
);
578 fjx0
= _mm256_add_ps(fjx0
,tx
);
579 fjy0
= _mm256_add_ps(fjy0
,ty
);
580 fjz0
= _mm256_add_ps(fjz0
,tz
);
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
586 /* Compute parameters for interactions between i and j atoms */
587 qq20
= _mm256_mul_ps(iq2
,jq0
);
589 /* REACTION-FIELD ELECTROSTATICS */
590 velec
= _mm256_mul_ps(qq20
,_mm256_sub_ps(_mm256_add_ps(rinv20
,_mm256_mul_ps(krf
,rsq20
)),crf
));
591 felec
= _mm256_mul_ps(qq20
,_mm256_sub_ps(_mm256_mul_ps(rinv20
,rinvsq20
),krf2
));
593 /* Update potential sum for this i atom from the interaction with this j atom. */
594 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
595 velecsum
= _mm256_add_ps(velecsum
,velec
);
599 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
601 /* Calculate temporary vectorial force */
602 tx
= _mm256_mul_ps(fscal
,dx20
);
603 ty
= _mm256_mul_ps(fscal
,dy20
);
604 tz
= _mm256_mul_ps(fscal
,dz20
);
606 /* Update vectorial force */
607 fix2
= _mm256_add_ps(fix2
,tx
);
608 fiy2
= _mm256_add_ps(fiy2
,ty
);
609 fiz2
= _mm256_add_ps(fiz2
,tz
);
611 fjx0
= _mm256_add_ps(fjx0
,tx
);
612 fjy0
= _mm256_add_ps(fjy0
,ty
);
613 fjz0
= _mm256_add_ps(fjz0
,tz
);
615 /**************************
616 * CALCULATE INTERACTIONS *
617 **************************/
619 /* Compute parameters for interactions between i and j atoms */
620 qq30
= _mm256_mul_ps(iq3
,jq0
);
622 /* REACTION-FIELD ELECTROSTATICS */
623 velec
= _mm256_mul_ps(qq30
,_mm256_sub_ps(_mm256_add_ps(rinv30
,_mm256_mul_ps(krf
,rsq30
)),crf
));
624 felec
= _mm256_mul_ps(qq30
,_mm256_sub_ps(_mm256_mul_ps(rinv30
,rinvsq30
),krf2
));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
628 velecsum
= _mm256_add_ps(velecsum
,velec
);
632 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
634 /* Calculate temporary vectorial force */
635 tx
= _mm256_mul_ps(fscal
,dx30
);
636 ty
= _mm256_mul_ps(fscal
,dy30
);
637 tz
= _mm256_mul_ps(fscal
,dz30
);
639 /* Update vectorial force */
640 fix3
= _mm256_add_ps(fix3
,tx
);
641 fiy3
= _mm256_add_ps(fiy3
,ty
);
642 fiz3
= _mm256_add_ps(fiz3
,tz
);
644 fjx0
= _mm256_add_ps(fjx0
,tx
);
645 fjy0
= _mm256_add_ps(fjy0
,ty
);
646 fjz0
= _mm256_add_ps(fjz0
,tz
);
// Remainder section: lanes holding a real j atom (list entry >= 0) write into
// the force array f; padding lanes are redirected to the scratch buffer so
// that f is not touched for them.
648 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
649 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
650 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
651 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
652 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
653 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
654 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
655 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
// Hand the accumulated fjx0/fjy0/fjz0 to the helper for all eight lanes at
// once (a subtraction from the stored forces, going by the helper's name).
657 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
659 /* Inner loop uses 131 flops */
662 /* End of innermost loop */
// End of one outer iteration: pass the twelve i-site force accumulators to
// the helper together with the i entry's slots in f and fshift.
664 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
665 f
+i_coord_offset
,fshift
+i_shift_offset
);
668 /* Update potential energies */
// ggid selects the slot in the energy-group arrays.
// NOTE(review): the assignment of ggid is not visible in this listing
// (presumably taken from gid for the current i entry) - confirm.
669 gmx_mm256_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
670 gmx_mm256_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
672 /* Increment number of inner iterations */
// Counts every list entry in the range, padding entries included.
673 inneriter
+= j_index_end
- j_index_start
;
675 /* Outer loop uses 26 flops */
// NOTE(review): neither the initialisation of the two counters nor the
// statement that updates outeriter is visible in this listing.
678 /* Increment number of outer iterations */
681 /* Update outer/inner flops */
// 26 and 131 are the per-iteration flop counts quoted in the comments of this
// kernel.
683 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_VF
,outeriter
*26 + inneriter
*131);
686 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single
687 * Electrostatics interaction: ReactionField
688 * VdW interaction: LennardJones
689 * Geometry: Water4-Particle
690 * Calculate force/pot: Force
693 nb_kernel_ElecRF_VdwLJ_GeomW4P1_F_avx_256_single
694 (t_nblist
* gmx_restrict nlist
,
695 rvec
* gmx_restrict xx
,
696 rvec
* gmx_restrict ff
,
697 struct t_forcerec
* gmx_restrict fr
,
698 t_mdatoms
* gmx_restrict mdatoms
,
699 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
700 t_nrnb
* gmx_restrict nrnb
)
702 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
703 * just 0 for non-waters.
704 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
705 * jnr indices corresponding to data put in the eight positions in the SIMD register.
707 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
708 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
709 int jnrA
,jnrB
,jnrC
,jnrD
;
710 int jnrE
,jnrF
,jnrG
,jnrH
;
711 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
712 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
713 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
714 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
715 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
717 real
*shiftvec
,*fshift
,*x
,*f
;
718 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
720 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
721 real
* vdwioffsetptr0
;
722 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
723 real
* vdwioffsetptr1
;
724 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
725 real
* vdwioffsetptr2
;
726 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
727 real
* vdwioffsetptr3
;
728 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
729 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
730 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
731 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
732 __m256 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
733 __m256 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
734 __m256 dx30
,dy30
,dz30
,rsq30
,rinv30
,rinvsq30
,r30
,qq30
,c6_30
,c12_30
;
735 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
738 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
741 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
742 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
743 __m256 dummy_mask
,cutoff_mask
;
744 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
745 __m256 one
= _mm256_set1_ps(1.0);
746 __m256 two
= _mm256_set1_ps(2.0);
752 jindex
= nlist
->jindex
;
754 shiftidx
= nlist
->shift
;
756 shiftvec
= fr
->shift_vec
[0];
757 fshift
= fr
->fshift
[0];
758 facel
= _mm256_set1_ps(fr
->ic
->epsfac
);
759 charge
= mdatoms
->chargeA
;
760 krf
= _mm256_set1_ps(fr
->ic
->k_rf
);
761 krf2
= _mm256_set1_ps(fr
->ic
->k_rf
*2.0);
762 crf
= _mm256_set1_ps(fr
->ic
->c_rf
);
763 nvdwtype
= fr
->ntype
;
765 vdwtype
= mdatoms
->typeA
;
767 /* Setup water-specific parameters */
768 inr
= nlist
->iinr
[0];
769 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
770 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
771 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
772 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
774 /* Avoid stupid compiler warnings */
775 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
788 for(iidx
=0;iidx
<4*DIM
;iidx
++)
793 /* Start outer loop over neighborlists */
794 for(iidx
=0; iidx
<nri
; iidx
++)
796 /* Load shift vector for this list */
797 i_shift_offset
= DIM
*shiftidx
[iidx
];
799 /* Load limits for loop over neighbors */
800 j_index_start
= jindex
[iidx
];
801 j_index_end
= jindex
[iidx
+1];
803 /* Get outer coordinate index */
805 i_coord_offset
= DIM
*inr
;
807 /* Load i particle coords and add shift vector */
808 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
809 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
811 fix0
= _mm256_setzero_ps();
812 fiy0
= _mm256_setzero_ps();
813 fiz0
= _mm256_setzero_ps();
814 fix1
= _mm256_setzero_ps();
815 fiy1
= _mm256_setzero_ps();
816 fiz1
= _mm256_setzero_ps();
817 fix2
= _mm256_setzero_ps();
818 fiy2
= _mm256_setzero_ps();
819 fiz2
= _mm256_setzero_ps();
820 fix3
= _mm256_setzero_ps();
821 fiy3
= _mm256_setzero_ps();
822 fiz3
= _mm256_setzero_ps();
824 /* Start inner kernel loop */
825 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
828 /* Get j neighbor index, and coordinate index */
837 j_coord_offsetA
= DIM
*jnrA
;
838 j_coord_offsetB
= DIM
*jnrB
;
839 j_coord_offsetC
= DIM
*jnrC
;
840 j_coord_offsetD
= DIM
*jnrD
;
841 j_coord_offsetE
= DIM
*jnrE
;
842 j_coord_offsetF
= DIM
*jnrF
;
843 j_coord_offsetG
= DIM
*jnrG
;
844 j_coord_offsetH
= DIM
*jnrH
;
846 /* load j atom coordinates */
847 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
848 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
849 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
850 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
853 /* Calculate displacement vector */
854 dx00
= _mm256_sub_ps(ix0
,jx0
);
855 dy00
= _mm256_sub_ps(iy0
,jy0
);
856 dz00
= _mm256_sub_ps(iz0
,jz0
);
857 dx10
= _mm256_sub_ps(ix1
,jx0
);
858 dy10
= _mm256_sub_ps(iy1
,jy0
);
859 dz10
= _mm256_sub_ps(iz1
,jz0
);
860 dx20
= _mm256_sub_ps(ix2
,jx0
);
861 dy20
= _mm256_sub_ps(iy2
,jy0
);
862 dz20
= _mm256_sub_ps(iz2
,jz0
);
863 dx30
= _mm256_sub_ps(ix3
,jx0
);
864 dy30
= _mm256_sub_ps(iy3
,jy0
);
865 dz30
= _mm256_sub_ps(iz3
,jz0
);
867 /* Calculate squared distance and things based on it */
868 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
869 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
870 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
871 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
873 rinv10
= avx256_invsqrt_f(rsq10
);
874 rinv20
= avx256_invsqrt_f(rsq20
);
875 rinv30
= avx256_invsqrt_f(rsq30
);
877 rinvsq00
= avx256_inv_f(rsq00
);
878 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
879 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
880 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
882 /* Load parameters for j particles */
883 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
884 charge
+jnrC
+0,charge
+jnrD
+0,
885 charge
+jnrE
+0,charge
+jnrF
+0,
886 charge
+jnrG
+0,charge
+jnrH
+0);
887 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
888 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
889 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
890 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
891 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
892 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
893 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
894 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
896 fjx0
= _mm256_setzero_ps();
897 fjy0
= _mm256_setzero_ps();
898 fjz0
= _mm256_setzero_ps();
900 /**************************
901 * CALCULATE INTERACTIONS *
902 **************************/
904 /* Compute parameters for interactions between i and j atoms */
905 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
906 vdwioffsetptr0
+vdwjidx0B
,
907 vdwioffsetptr0
+vdwjidx0C
,
908 vdwioffsetptr0
+vdwjidx0D
,
909 vdwioffsetptr0
+vdwjidx0E
,
910 vdwioffsetptr0
+vdwjidx0F
,
911 vdwioffsetptr0
+vdwjidx0G
,
912 vdwioffsetptr0
+vdwjidx0H
,
915 /* LENNARD-JONES DISPERSION/REPULSION */
917 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
918 fvdw
= _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),c6_00
),_mm256_mul_ps(rinvsix
,rinvsq00
));
922 /* Calculate temporary vectorial force */
923 tx
= _mm256_mul_ps(fscal
,dx00
);
924 ty
= _mm256_mul_ps(fscal
,dy00
);
925 tz
= _mm256_mul_ps(fscal
,dz00
);
927 /* Update vectorial force */
928 fix0
= _mm256_add_ps(fix0
,tx
);
929 fiy0
= _mm256_add_ps(fiy0
,ty
);
930 fiz0
= _mm256_add_ps(fiz0
,tz
);
932 fjx0
= _mm256_add_ps(fjx0
,tx
);
933 fjy0
= _mm256_add_ps(fjy0
,ty
);
934 fjz0
= _mm256_add_ps(fjz0
,tz
);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 /* Compute parameters for interactions between i and j atoms */
941 qq10
= _mm256_mul_ps(iq1
,jq0
);
943 /* REACTION-FIELD ELECTROSTATICS */
944 felec
= _mm256_mul_ps(qq10
,_mm256_sub_ps(_mm256_mul_ps(rinv10
,rinvsq10
),krf2
));
948 /* Calculate temporary vectorial force */
949 tx
= _mm256_mul_ps(fscal
,dx10
);
950 ty
= _mm256_mul_ps(fscal
,dy10
);
951 tz
= _mm256_mul_ps(fscal
,dz10
);
953 /* Update vectorial force */
954 fix1
= _mm256_add_ps(fix1
,tx
);
955 fiy1
= _mm256_add_ps(fiy1
,ty
);
956 fiz1
= _mm256_add_ps(fiz1
,tz
);
958 fjx0
= _mm256_add_ps(fjx0
,tx
);
959 fjy0
= _mm256_add_ps(fjy0
,ty
);
960 fjz0
= _mm256_add_ps(fjz0
,tz
);
962 /**************************
963 * CALCULATE INTERACTIONS *
964 **************************/
966 /* Compute parameters for interactions between i and j atoms */
967 qq20
= _mm256_mul_ps(iq2
,jq0
);
969 /* REACTION-FIELD ELECTROSTATICS */
970 felec
= _mm256_mul_ps(qq20
,_mm256_sub_ps(_mm256_mul_ps(rinv20
,rinvsq20
),krf2
));
974 /* Calculate temporary vectorial force */
975 tx
= _mm256_mul_ps(fscal
,dx20
);
976 ty
= _mm256_mul_ps(fscal
,dy20
);
977 tz
= _mm256_mul_ps(fscal
,dz20
);
979 /* Update vectorial force */
980 fix2
= _mm256_add_ps(fix2
,tx
);
981 fiy2
= _mm256_add_ps(fiy2
,ty
);
982 fiz2
= _mm256_add_ps(fiz2
,tz
);
984 fjx0
= _mm256_add_ps(fjx0
,tx
);
985 fjy0
= _mm256_add_ps(fjy0
,ty
);
986 fjz0
= _mm256_add_ps(fjz0
,tz
);
988 /**************************
989 * CALCULATE INTERACTIONS *
990 **************************/
992 /* Compute parameters for interactions between i and j atoms */
993 qq30
= _mm256_mul_ps(iq3
,jq0
);
995 /* REACTION-FIELD ELECTROSTATICS */
996 felec
= _mm256_mul_ps(qq30
,_mm256_sub_ps(_mm256_mul_ps(rinv30
,rinvsq30
),krf2
));
1000 /* Calculate temporary vectorial force */
1001 tx
= _mm256_mul_ps(fscal
,dx30
);
1002 ty
= _mm256_mul_ps(fscal
,dy30
);
1003 tz
= _mm256_mul_ps(fscal
,dz30
);
1005 /* Update vectorial force */
1006 fix3
= _mm256_add_ps(fix3
,tx
);
1007 fiy3
= _mm256_add_ps(fiy3
,ty
);
1008 fiz3
= _mm256_add_ps(fiz3
,tz
);
1010 fjx0
= _mm256_add_ps(fjx0
,tx
);
1011 fjy0
= _mm256_add_ps(fjy0
,ty
);
1012 fjz0
= _mm256_add_ps(fjz0
,tz
);
1014 fjptrA
= f
+j_coord_offsetA
;
1015 fjptrB
= f
+j_coord_offsetB
;
1016 fjptrC
= f
+j_coord_offsetC
;
1017 fjptrD
= f
+j_coord_offsetD
;
1018 fjptrE
= f
+j_coord_offsetE
;
1019 fjptrF
= f
+j_coord_offsetF
;
1020 fjptrG
= f
+j_coord_offsetG
;
1021 fjptrH
= f
+j_coord_offsetH
;
1023 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
1025 /* Inner loop uses 111 flops */
1028 if(jidx
<j_index_end
)
1031 /* Get j neighbor index, and coordinate index */
1032 jnrlistA
= jjnr
[jidx
];
1033 jnrlistB
= jjnr
[jidx
+1];
1034 jnrlistC
= jjnr
[jidx
+2];
1035 jnrlistD
= jjnr
[jidx
+3];
1036 jnrlistE
= jjnr
[jidx
+4];
1037 jnrlistF
= jjnr
[jidx
+5];
1038 jnrlistG
= jjnr
[jidx
+6];
1039 jnrlistH
= jjnr
[jidx
+7];
1040 /* Sign of each element will be negative for non-real atoms.
1041 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1042 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1044 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
1045 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
1047 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1048 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1049 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1050 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1051 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
1052 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
1053 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
1054 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
1055 j_coord_offsetA
= DIM
*jnrA
;
1056 j_coord_offsetB
= DIM
*jnrB
;
1057 j_coord_offsetC
= DIM
*jnrC
;
1058 j_coord_offsetD
= DIM
*jnrD
;
1059 j_coord_offsetE
= DIM
*jnrE
;
1060 j_coord_offsetF
= DIM
*jnrF
;
1061 j_coord_offsetG
= DIM
*jnrG
;
1062 j_coord_offsetH
= DIM
*jnrH
;
1064 /* load j atom coordinates */
1065 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1066 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1067 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1068 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1071 /* Calculate displacement vector */
1072 dx00
= _mm256_sub_ps(ix0
,jx0
);
1073 dy00
= _mm256_sub_ps(iy0
,jy0
);
1074 dz00
= _mm256_sub_ps(iz0
,jz0
);
1075 dx10
= _mm256_sub_ps(ix1
,jx0
);
1076 dy10
= _mm256_sub_ps(iy1
,jy0
);
1077 dz10
= _mm256_sub_ps(iz1
,jz0
);
1078 dx20
= _mm256_sub_ps(ix2
,jx0
);
1079 dy20
= _mm256_sub_ps(iy2
,jy0
);
1080 dz20
= _mm256_sub_ps(iz2
,jz0
);
1081 dx30
= _mm256_sub_ps(ix3
,jx0
);
1082 dy30
= _mm256_sub_ps(iy3
,jy0
);
1083 dz30
= _mm256_sub_ps(iz3
,jz0
);
1085 /* Calculate squared distance and things based on it */
1086 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1087 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
1088 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
1089 rsq30
= gmx_mm256_calc_rsq_ps(dx30
,dy30
,dz30
);
1091 rinv10
= avx256_invsqrt_f(rsq10
);
1092 rinv20
= avx256_invsqrt_f(rsq20
);
1093 rinv30
= avx256_invsqrt_f(rsq30
);
1095 rinvsq00
= avx256_inv_f(rsq00
);
1096 rinvsq10
= _mm256_mul_ps(rinv10
,rinv10
);
1097 rinvsq20
= _mm256_mul_ps(rinv20
,rinv20
);
1098 rinvsq30
= _mm256_mul_ps(rinv30
,rinv30
);
1100 /* Load parameters for j particles */
1101 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1102 charge
+jnrC
+0,charge
+jnrD
+0,
1103 charge
+jnrE
+0,charge
+jnrF
+0,
1104 charge
+jnrG
+0,charge
+jnrH
+0);
1105 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1106 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
1107 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
1108 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
1109 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
1110 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
1111 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
1112 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
1114 fjx0
= _mm256_setzero_ps();
1115 fjy0
= _mm256_setzero_ps();
1116 fjz0
= _mm256_setzero_ps();
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 /* Compute parameters for interactions between i and j atoms */
1123 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
1124 vdwioffsetptr0
+vdwjidx0B
,
1125 vdwioffsetptr0
+vdwjidx0C
,
1126 vdwioffsetptr0
+vdwjidx0D
,
1127 vdwioffsetptr0
+vdwjidx0E
,
1128 vdwioffsetptr0
+vdwjidx0F
,
1129 vdwioffsetptr0
+vdwjidx0G
,
1130 vdwioffsetptr0
+vdwjidx0H
,
1133 /* LENNARD-JONES DISPERSION/REPULSION */
1135 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1136 fvdw
= _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),c6_00
),_mm256_mul_ps(rinvsix
,rinvsq00
));
1140 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1142 /* Calculate temporary vectorial force */
1143 tx
= _mm256_mul_ps(fscal
,dx00
);
1144 ty
= _mm256_mul_ps(fscal
,dy00
);
1145 tz
= _mm256_mul_ps(fscal
,dz00
);
1147 /* Update vectorial force */
1148 fix0
= _mm256_add_ps(fix0
,tx
);
1149 fiy0
= _mm256_add_ps(fiy0
,ty
);
1150 fiz0
= _mm256_add_ps(fiz0
,tz
);
1152 fjx0
= _mm256_add_ps(fjx0
,tx
);
1153 fjy0
= _mm256_add_ps(fjy0
,ty
);
1154 fjz0
= _mm256_add_ps(fjz0
,tz
);
1156 /**************************
1157 * CALCULATE INTERACTIONS *
1158 **************************/
1160 /* Compute parameters for interactions between i and j atoms */
1161 qq10
= _mm256_mul_ps(iq1
,jq0
);
1163 /* REACTION-FIELD ELECTROSTATICS */
1164 felec
= _mm256_mul_ps(qq10
,_mm256_sub_ps(_mm256_mul_ps(rinv10
,rinvsq10
),krf2
));
1168 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1170 /* Calculate temporary vectorial force */
1171 tx
= _mm256_mul_ps(fscal
,dx10
);
1172 ty
= _mm256_mul_ps(fscal
,dy10
);
1173 tz
= _mm256_mul_ps(fscal
,dz10
);
1175 /* Update vectorial force */
1176 fix1
= _mm256_add_ps(fix1
,tx
);
1177 fiy1
= _mm256_add_ps(fiy1
,ty
);
1178 fiz1
= _mm256_add_ps(fiz1
,tz
);
1180 fjx0
= _mm256_add_ps(fjx0
,tx
);
1181 fjy0
= _mm256_add_ps(fjy0
,ty
);
1182 fjz0
= _mm256_add_ps(fjz0
,tz
);
1184 /**************************
1185 * CALCULATE INTERACTIONS *
1186 **************************/
1188 /* Compute parameters for interactions between i and j atoms */
1189 qq20
= _mm256_mul_ps(iq2
,jq0
);
1191 /* REACTION-FIELD ELECTROSTATICS */
1192 felec
= _mm256_mul_ps(qq20
,_mm256_sub_ps(_mm256_mul_ps(rinv20
,rinvsq20
),krf2
));
1196 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1198 /* Calculate temporary vectorial force */
1199 tx
= _mm256_mul_ps(fscal
,dx20
);
1200 ty
= _mm256_mul_ps(fscal
,dy20
);
1201 tz
= _mm256_mul_ps(fscal
,dz20
);
1203 /* Update vectorial force */
1204 fix2
= _mm256_add_ps(fix2
,tx
);
1205 fiy2
= _mm256_add_ps(fiy2
,ty
);
1206 fiz2
= _mm256_add_ps(fiz2
,tz
);
1208 fjx0
= _mm256_add_ps(fjx0
,tx
);
1209 fjy0
= _mm256_add_ps(fjy0
,ty
);
1210 fjz0
= _mm256_add_ps(fjz0
,tz
);
1212 /**************************
1213 * CALCULATE INTERACTIONS *
1214 **************************/
1216 /* Compute parameters for interactions between i and j atoms */
1217 qq30
= _mm256_mul_ps(iq3
,jq0
);
1219 /* REACTION-FIELD ELECTROSTATICS */
1220 felec
= _mm256_mul_ps(qq30
,_mm256_sub_ps(_mm256_mul_ps(rinv30
,rinvsq30
),krf2
));
1224 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1226 /* Calculate temporary vectorial force */
1227 tx
= _mm256_mul_ps(fscal
,dx30
);
1228 ty
= _mm256_mul_ps(fscal
,dy30
);
1229 tz
= _mm256_mul_ps(fscal
,dz30
);
1231 /* Update vectorial force */
1232 fix3
= _mm256_add_ps(fix3
,tx
);
1233 fiy3
= _mm256_add_ps(fiy3
,ty
);
1234 fiz3
= _mm256_add_ps(fiz3
,tz
);
1236 fjx0
= _mm256_add_ps(fjx0
,tx
);
1237 fjy0
= _mm256_add_ps(fjy0
,ty
);
1238 fjz0
= _mm256_add_ps(fjz0
,tz
);
1240 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1241 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1242 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1243 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1244 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
1245 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
1246 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
1247 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
1249 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
1251 /* Inner loop uses 111 flops */
1254 /* End of innermost loop */
1256 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1257 f
+i_coord_offset
,fshift
+i_shift_offset
);
1259 /* Increment number of inner iterations */
1260 inneriter
+= j_index_end
- j_index_start
;
1262 /* Outer loop uses 24 flops */
1265 /* Increment number of outer iterations */
1268 /* Update outer/inner flops */
1270 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4_F
,outeriter
*24 + inneriter
*111);