2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_VF_avx_256_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 real
* vdwioffsetptr0
;
84 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 real
* vdwioffsetptr1
;
86 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 real
* vdwioffsetptr2
;
88 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 real
* vdwioffsetptr3
;
90 __m256d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
91 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
92 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
93 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
94 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
95 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
96 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
97 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
98 __m256d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
99 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
100 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m256d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
103 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m256d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
106 __m256d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
107 __m256d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
108 __m256d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
109 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
112 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
115 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
116 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
117 __m256d dummy_mask
,cutoff_mask
;
118 __m128 tmpmask0
,tmpmask1
;
119 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one
= _mm256_set1_pd(1.0);
121 __m256d two
= _mm256_set1_pd(2.0);
127 jindex
= nlist
->jindex
;
129 shiftidx
= nlist
->shift
;
131 shiftvec
= fr
->shift_vec
[0];
132 fshift
= fr
->fshift
[0];
133 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
134 charge
= mdatoms
->chargeA
;
135 krf
= _mm256_set1_pd(fr
->ic
->k_rf
);
136 krf2
= _mm256_set1_pd(fr
->ic
->k_rf
*2.0);
137 crf
= _mm256_set1_pd(fr
->ic
->c_rf
);
138 nvdwtype
= fr
->ntype
;
140 vdwtype
= mdatoms
->typeA
;
142 /* Setup water-specific parameters */
143 inr
= nlist
->iinr
[0];
144 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
145 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
146 iq3
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+3]));
147 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
149 jq1
= _mm256_set1_pd(charge
[inr
+1]);
150 jq2
= _mm256_set1_pd(charge
[inr
+2]);
151 jq3
= _mm256_set1_pd(charge
[inr
+3]);
152 vdwjidx0A
= 2*vdwtype
[inr
+0];
153 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
154 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
155 qq11
= _mm256_mul_pd(iq1
,jq1
);
156 qq12
= _mm256_mul_pd(iq1
,jq2
);
157 qq13
= _mm256_mul_pd(iq1
,jq3
);
158 qq21
= _mm256_mul_pd(iq2
,jq1
);
159 qq22
= _mm256_mul_pd(iq2
,jq2
);
160 qq23
= _mm256_mul_pd(iq2
,jq3
);
161 qq31
= _mm256_mul_pd(iq3
,jq1
);
162 qq32
= _mm256_mul_pd(iq3
,jq2
);
163 qq33
= _mm256_mul_pd(iq3
,jq3
);
165 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166 rcutoff_scalar
= fr
->ic
->rcoulomb
;
167 rcutoff
= _mm256_set1_pd(rcutoff_scalar
);
168 rcutoff2
= _mm256_mul_pd(rcutoff
,rcutoff
);
170 sh_vdw_invrcut6
= _mm256_set1_pd(fr
->ic
->sh_invrc6
);
171 rvdw
= _mm256_set1_pd(fr
->ic
->rvdw
);
173 /* Zero-initialize the j atom indices up front to avoid compiler warnings */
174 jnrA
= jnrB
= jnrC
= jnrD
= 0;
183 for(iidx
=0;iidx
<4*DIM
;iidx
++)
188 /* Start outer loop over neighborlists */
189 for(iidx
=0; iidx
<nri
; iidx
++)
191 /* Load shift vector for this list */
192 i_shift_offset
= DIM
*shiftidx
[iidx
];
194 /* Load limits for loop over neighbors */
195 j_index_start
= jindex
[iidx
];
196 j_index_end
= jindex
[iidx
+1];
198 /* Get outer coordinate index */
200 i_coord_offset
= DIM
*inr
;
202 /* Load i particle coords and add shift vector */
203 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
204 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
206 fix0
= _mm256_setzero_pd();
207 fiy0
= _mm256_setzero_pd();
208 fiz0
= _mm256_setzero_pd();
209 fix1
= _mm256_setzero_pd();
210 fiy1
= _mm256_setzero_pd();
211 fiz1
= _mm256_setzero_pd();
212 fix2
= _mm256_setzero_pd();
213 fiy2
= _mm256_setzero_pd();
214 fiz2
= _mm256_setzero_pd();
215 fix3
= _mm256_setzero_pd();
216 fiy3
= _mm256_setzero_pd();
217 fiz3
= _mm256_setzero_pd();
219 /* Reset potential sums */
220 velecsum
= _mm256_setzero_pd();
221 vvdwsum
= _mm256_setzero_pd();
223 /* Start inner kernel loop */
224 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
227 /* Get j neighbor index, and coordinate index */
232 j_coord_offsetA
= DIM
*jnrA
;
233 j_coord_offsetB
= DIM
*jnrB
;
234 j_coord_offsetC
= DIM
*jnrC
;
235 j_coord_offsetD
= DIM
*jnrD
;
237 /* load j atom coordinates */
238 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
239 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
240 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
241 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
243 /* Calculate displacement vector */
244 dx00
= _mm256_sub_pd(ix0
,jx0
);
245 dy00
= _mm256_sub_pd(iy0
,jy0
);
246 dz00
= _mm256_sub_pd(iz0
,jz0
);
247 dx11
= _mm256_sub_pd(ix1
,jx1
);
248 dy11
= _mm256_sub_pd(iy1
,jy1
);
249 dz11
= _mm256_sub_pd(iz1
,jz1
);
250 dx12
= _mm256_sub_pd(ix1
,jx2
);
251 dy12
= _mm256_sub_pd(iy1
,jy2
);
252 dz12
= _mm256_sub_pd(iz1
,jz2
);
253 dx13
= _mm256_sub_pd(ix1
,jx3
);
254 dy13
= _mm256_sub_pd(iy1
,jy3
);
255 dz13
= _mm256_sub_pd(iz1
,jz3
);
256 dx21
= _mm256_sub_pd(ix2
,jx1
);
257 dy21
= _mm256_sub_pd(iy2
,jy1
);
258 dz21
= _mm256_sub_pd(iz2
,jz1
);
259 dx22
= _mm256_sub_pd(ix2
,jx2
);
260 dy22
= _mm256_sub_pd(iy2
,jy2
);
261 dz22
= _mm256_sub_pd(iz2
,jz2
);
262 dx23
= _mm256_sub_pd(ix2
,jx3
);
263 dy23
= _mm256_sub_pd(iy2
,jy3
);
264 dz23
= _mm256_sub_pd(iz2
,jz3
);
265 dx31
= _mm256_sub_pd(ix3
,jx1
);
266 dy31
= _mm256_sub_pd(iy3
,jy1
);
267 dz31
= _mm256_sub_pd(iz3
,jz1
);
268 dx32
= _mm256_sub_pd(ix3
,jx2
);
269 dy32
= _mm256_sub_pd(iy3
,jy2
);
270 dz32
= _mm256_sub_pd(iz3
,jz2
);
271 dx33
= _mm256_sub_pd(ix3
,jx3
);
272 dy33
= _mm256_sub_pd(iy3
,jy3
);
273 dz33
= _mm256_sub_pd(iz3
,jz3
);
275 /* Calculate squared distance and things based on it */
276 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
277 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
278 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
279 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
280 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
281 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
282 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
283 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
284 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
285 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
287 rinv11
= avx256_invsqrt_d(rsq11
);
288 rinv12
= avx256_invsqrt_d(rsq12
);
289 rinv13
= avx256_invsqrt_d(rsq13
);
290 rinv21
= avx256_invsqrt_d(rsq21
);
291 rinv22
= avx256_invsqrt_d(rsq22
);
292 rinv23
= avx256_invsqrt_d(rsq23
);
293 rinv31
= avx256_invsqrt_d(rsq31
);
294 rinv32
= avx256_invsqrt_d(rsq32
);
295 rinv33
= avx256_invsqrt_d(rsq33
);
297 rinvsq00
= avx256_inv_d(rsq00
);
298 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
299 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
300 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
301 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
302 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
303 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
304 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
305 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
306 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
308 fjx0
= _mm256_setzero_pd();
309 fjy0
= _mm256_setzero_pd();
310 fjz0
= _mm256_setzero_pd();
311 fjx1
= _mm256_setzero_pd();
312 fjy1
= _mm256_setzero_pd();
313 fjz1
= _mm256_setzero_pd();
314 fjx2
= _mm256_setzero_pd();
315 fjy2
= _mm256_setzero_pd();
316 fjz2
= _mm256_setzero_pd();
317 fjx3
= _mm256_setzero_pd();
318 fjy3
= _mm256_setzero_pd();
319 fjz3
= _mm256_setzero_pd();
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
328 /* LENNARD-JONES DISPERSION/REPULSION */
330 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
331 vvdw6
= _mm256_mul_pd(c6_00
,rinvsix
);
332 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
333 vvdw
= _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12
, _mm256_mul_pd(c12_00
,_mm256_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
334 _mm256_mul_pd( _mm256_sub_pd(vvdw6
,_mm256_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
335 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
337 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
339 /* Update potential sum for this i atom from the interaction with this j atom. */
340 vvdw
= _mm256_and_pd(vvdw
,cutoff_mask
);
341 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
345 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
347 /* Calculate temporary vectorial force */
348 tx
= _mm256_mul_pd(fscal
,dx00
);
349 ty
= _mm256_mul_pd(fscal
,dy00
);
350 tz
= _mm256_mul_pd(fscal
,dz00
);
352 /* Update vectorial force */
353 fix0
= _mm256_add_pd(fix0
,tx
);
354 fiy0
= _mm256_add_pd(fiy0
,ty
);
355 fiz0
= _mm256_add_pd(fiz0
,tz
);
357 fjx0
= _mm256_add_pd(fjx0
,tx
);
358 fjy0
= _mm256_add_pd(fjy0
,ty
);
359 fjz0
= _mm256_add_pd(fjz0
,tz
);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
370 /* REACTION-FIELD ELECTROSTATICS */
371 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_add_pd(rinv11
,_mm256_mul_pd(krf
,rsq11
)),crf
));
372 felec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_mul_pd(rinv11
,rinvsq11
),krf2
));
374 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velec
= _mm256_and_pd(velec
,cutoff_mask
);
378 velecsum
= _mm256_add_pd(velecsum
,velec
);
382 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
384 /* Calculate temporary vectorial force */
385 tx
= _mm256_mul_pd(fscal
,dx11
);
386 ty
= _mm256_mul_pd(fscal
,dy11
);
387 tz
= _mm256_mul_pd(fscal
,dz11
);
389 /* Update vectorial force */
390 fix1
= _mm256_add_pd(fix1
,tx
);
391 fiy1
= _mm256_add_pd(fiy1
,ty
);
392 fiz1
= _mm256_add_pd(fiz1
,tz
);
394 fjx1
= _mm256_add_pd(fjx1
,tx
);
395 fjy1
= _mm256_add_pd(fjy1
,ty
);
396 fjz1
= _mm256_add_pd(fjz1
,tz
);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_add_pd(rinv12
,_mm256_mul_pd(krf
,rsq12
)),crf
));
409 felec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_mul_pd(rinv12
,rinvsq12
),krf2
));
411 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec
= _mm256_and_pd(velec
,cutoff_mask
);
415 velecsum
= _mm256_add_pd(velecsum
,velec
);
419 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
421 /* Calculate temporary vectorial force */
422 tx
= _mm256_mul_pd(fscal
,dx12
);
423 ty
= _mm256_mul_pd(fscal
,dy12
);
424 tz
= _mm256_mul_pd(fscal
,dz12
);
426 /* Update vectorial force */
427 fix1
= _mm256_add_pd(fix1
,tx
);
428 fiy1
= _mm256_add_pd(fiy1
,ty
);
429 fiz1
= _mm256_add_pd(fiz1
,tz
);
431 fjx2
= _mm256_add_pd(fjx2
,tx
);
432 fjy2
= _mm256_add_pd(fjy2
,ty
);
433 fjz2
= _mm256_add_pd(fjz2
,tz
);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
444 /* REACTION-FIELD ELECTROSTATICS */
445 velec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(_mm256_add_pd(rinv13
,_mm256_mul_pd(krf
,rsq13
)),crf
));
446 felec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(_mm256_mul_pd(rinv13
,rinvsq13
),krf2
));
448 cutoff_mask
= _mm256_cmp_pd(rsq13
,rcutoff2
,_CMP_LT_OQ
);
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec
= _mm256_and_pd(velec
,cutoff_mask
);
452 velecsum
= _mm256_add_pd(velecsum
,velec
);
456 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
458 /* Calculate temporary vectorial force */
459 tx
= _mm256_mul_pd(fscal
,dx13
);
460 ty
= _mm256_mul_pd(fscal
,dy13
);
461 tz
= _mm256_mul_pd(fscal
,dz13
);
463 /* Update vectorial force */
464 fix1
= _mm256_add_pd(fix1
,tx
);
465 fiy1
= _mm256_add_pd(fiy1
,ty
);
466 fiz1
= _mm256_add_pd(fiz1
,tz
);
468 fjx3
= _mm256_add_pd(fjx3
,tx
);
469 fjy3
= _mm256_add_pd(fjy3
,ty
);
470 fjz3
= _mm256_add_pd(fjz3
,tz
);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
481 /* REACTION-FIELD ELECTROSTATICS */
482 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_add_pd(rinv21
,_mm256_mul_pd(krf
,rsq21
)),crf
));
483 felec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_mul_pd(rinv21
,rinvsq21
),krf2
));
485 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velec
= _mm256_and_pd(velec
,cutoff_mask
);
489 velecsum
= _mm256_add_pd(velecsum
,velec
);
493 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
495 /* Calculate temporary vectorial force */
496 tx
= _mm256_mul_pd(fscal
,dx21
);
497 ty
= _mm256_mul_pd(fscal
,dy21
);
498 tz
= _mm256_mul_pd(fscal
,dz21
);
500 /* Update vectorial force */
501 fix2
= _mm256_add_pd(fix2
,tx
);
502 fiy2
= _mm256_add_pd(fiy2
,ty
);
503 fiz2
= _mm256_add_pd(fiz2
,tz
);
505 fjx1
= _mm256_add_pd(fjx1
,tx
);
506 fjy1
= _mm256_add_pd(fjy1
,ty
);
507 fjz1
= _mm256_add_pd(fjz1
,tz
);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
518 /* REACTION-FIELD ELECTROSTATICS */
519 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_add_pd(rinv22
,_mm256_mul_pd(krf
,rsq22
)),crf
));
520 felec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_mul_pd(rinv22
,rinvsq22
),krf2
));
522 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velec
= _mm256_and_pd(velec
,cutoff_mask
);
526 velecsum
= _mm256_add_pd(velecsum
,velec
);
530 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
532 /* Calculate temporary vectorial force */
533 tx
= _mm256_mul_pd(fscal
,dx22
);
534 ty
= _mm256_mul_pd(fscal
,dy22
);
535 tz
= _mm256_mul_pd(fscal
,dz22
);
537 /* Update vectorial force */
538 fix2
= _mm256_add_pd(fix2
,tx
);
539 fiy2
= _mm256_add_pd(fiy2
,ty
);
540 fiz2
= _mm256_add_pd(fiz2
,tz
);
542 fjx2
= _mm256_add_pd(fjx2
,tx
);
543 fjy2
= _mm256_add_pd(fjy2
,ty
);
544 fjz2
= _mm256_add_pd(fjz2
,tz
);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
555 /* REACTION-FIELD ELECTROSTATICS */
556 velec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(_mm256_add_pd(rinv23
,_mm256_mul_pd(krf
,rsq23
)),crf
));
557 felec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(_mm256_mul_pd(rinv23
,rinvsq23
),krf2
));
559 cutoff_mask
= _mm256_cmp_pd(rsq23
,rcutoff2
,_CMP_LT_OQ
);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velec
= _mm256_and_pd(velec
,cutoff_mask
);
563 velecsum
= _mm256_add_pd(velecsum
,velec
);
567 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
569 /* Calculate temporary vectorial force */
570 tx
= _mm256_mul_pd(fscal
,dx23
);
571 ty
= _mm256_mul_pd(fscal
,dy23
);
572 tz
= _mm256_mul_pd(fscal
,dz23
);
574 /* Update vectorial force */
575 fix2
= _mm256_add_pd(fix2
,tx
);
576 fiy2
= _mm256_add_pd(fiy2
,ty
);
577 fiz2
= _mm256_add_pd(fiz2
,tz
);
579 fjx3
= _mm256_add_pd(fjx3
,tx
);
580 fjy3
= _mm256_add_pd(fjy3
,ty
);
581 fjz3
= _mm256_add_pd(fjz3
,tz
);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
592 /* REACTION-FIELD ELECTROSTATICS */
593 velec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(_mm256_add_pd(rinv31
,_mm256_mul_pd(krf
,rsq31
)),crf
));
594 felec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(_mm256_mul_pd(rinv31
,rinvsq31
),krf2
));
596 cutoff_mask
= _mm256_cmp_pd(rsq31
,rcutoff2
,_CMP_LT_OQ
);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec
= _mm256_and_pd(velec
,cutoff_mask
);
600 velecsum
= _mm256_add_pd(velecsum
,velec
);
604 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
606 /* Calculate temporary vectorial force */
607 tx
= _mm256_mul_pd(fscal
,dx31
);
608 ty
= _mm256_mul_pd(fscal
,dy31
);
609 tz
= _mm256_mul_pd(fscal
,dz31
);
611 /* Update vectorial force */
612 fix3
= _mm256_add_pd(fix3
,tx
);
613 fiy3
= _mm256_add_pd(fiy3
,ty
);
614 fiz3
= _mm256_add_pd(fiz3
,tz
);
616 fjx1
= _mm256_add_pd(fjx1
,tx
);
617 fjy1
= _mm256_add_pd(fjy1
,ty
);
618 fjz1
= _mm256_add_pd(fjz1
,tz
);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
629 /* REACTION-FIELD ELECTROSTATICS */
630 velec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(_mm256_add_pd(rinv32
,_mm256_mul_pd(krf
,rsq32
)),crf
));
631 felec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(_mm256_mul_pd(rinv32
,rinvsq32
),krf2
));
633 cutoff_mask
= _mm256_cmp_pd(rsq32
,rcutoff2
,_CMP_LT_OQ
);
635 /* Update potential sum for this i atom from the interaction with this j atom. */
636 velec
= _mm256_and_pd(velec
,cutoff_mask
);
637 velecsum
= _mm256_add_pd(velecsum
,velec
);
641 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
643 /* Calculate temporary vectorial force */
644 tx
= _mm256_mul_pd(fscal
,dx32
);
645 ty
= _mm256_mul_pd(fscal
,dy32
);
646 tz
= _mm256_mul_pd(fscal
,dz32
);
648 /* Update vectorial force */
649 fix3
= _mm256_add_pd(fix3
,tx
);
650 fiy3
= _mm256_add_pd(fiy3
,ty
);
651 fiz3
= _mm256_add_pd(fiz3
,tz
);
653 fjx2
= _mm256_add_pd(fjx2
,tx
);
654 fjy2
= _mm256_add_pd(fjy2
,ty
);
655 fjz2
= _mm256_add_pd(fjz2
,tz
);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
666 /* REACTION-FIELD ELECTROSTATICS */
667 velec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(_mm256_add_pd(rinv33
,_mm256_mul_pd(krf
,rsq33
)),crf
));
668 felec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(_mm256_mul_pd(rinv33
,rinvsq33
),krf2
));
670 cutoff_mask
= _mm256_cmp_pd(rsq33
,rcutoff2
,_CMP_LT_OQ
);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec
= _mm256_and_pd(velec
,cutoff_mask
);
674 velecsum
= _mm256_add_pd(velecsum
,velec
);
678 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
680 /* Calculate temporary vectorial force */
681 tx
= _mm256_mul_pd(fscal
,dx33
);
682 ty
= _mm256_mul_pd(fscal
,dy33
);
683 tz
= _mm256_mul_pd(fscal
,dz33
);
685 /* Update vectorial force */
686 fix3
= _mm256_add_pd(fix3
,tx
);
687 fiy3
= _mm256_add_pd(fiy3
,ty
);
688 fiz3
= _mm256_add_pd(fiz3
,tz
);
690 fjx3
= _mm256_add_pd(fjx3
,tx
);
691 fjy3
= _mm256_add_pd(fjy3
,ty
);
692 fjz3
= _mm256_add_pd(fjz3
,tz
);
696 fjptrA
= f
+j_coord_offsetA
;
697 fjptrB
= f
+j_coord_offsetB
;
698 fjptrC
= f
+j_coord_offsetC
;
699 fjptrD
= f
+j_coord_offsetD
;
701 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
702 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
703 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
705 /* Inner loop uses 368 flops */
711 /* Get j neighbor index, and coordinate index */
712 jnrlistA
= jjnr
[jidx
];
713 jnrlistB
= jjnr
[jidx
+1];
714 jnrlistC
= jjnr
[jidx
+2];
715 jnrlistD
= jjnr
[jidx
+3];
716 /* Sign of each element will be negative for non-real atoms.
717 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
718 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
720 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
722 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
723 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
724 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
726 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
727 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
728 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
729 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
730 j_coord_offsetA
= DIM
*jnrA
;
731 j_coord_offsetB
= DIM
*jnrB
;
732 j_coord_offsetC
= DIM
*jnrC
;
733 j_coord_offsetD
= DIM
*jnrD
;
735 /* load j atom coordinates */
736 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
737 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
738 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
739 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
741 /* Calculate displacement vector */
742 dx00
= _mm256_sub_pd(ix0
,jx0
);
743 dy00
= _mm256_sub_pd(iy0
,jy0
);
744 dz00
= _mm256_sub_pd(iz0
,jz0
);
745 dx11
= _mm256_sub_pd(ix1
,jx1
);
746 dy11
= _mm256_sub_pd(iy1
,jy1
);
747 dz11
= _mm256_sub_pd(iz1
,jz1
);
748 dx12
= _mm256_sub_pd(ix1
,jx2
);
749 dy12
= _mm256_sub_pd(iy1
,jy2
);
750 dz12
= _mm256_sub_pd(iz1
,jz2
);
751 dx13
= _mm256_sub_pd(ix1
,jx3
);
752 dy13
= _mm256_sub_pd(iy1
,jy3
);
753 dz13
= _mm256_sub_pd(iz1
,jz3
);
754 dx21
= _mm256_sub_pd(ix2
,jx1
);
755 dy21
= _mm256_sub_pd(iy2
,jy1
);
756 dz21
= _mm256_sub_pd(iz2
,jz1
);
757 dx22
= _mm256_sub_pd(ix2
,jx2
);
758 dy22
= _mm256_sub_pd(iy2
,jy2
);
759 dz22
= _mm256_sub_pd(iz2
,jz2
);
760 dx23
= _mm256_sub_pd(ix2
,jx3
);
761 dy23
= _mm256_sub_pd(iy2
,jy3
);
762 dz23
= _mm256_sub_pd(iz2
,jz3
);
763 dx31
= _mm256_sub_pd(ix3
,jx1
);
764 dy31
= _mm256_sub_pd(iy3
,jy1
);
765 dz31
= _mm256_sub_pd(iz3
,jz1
);
766 dx32
= _mm256_sub_pd(ix3
,jx2
);
767 dy32
= _mm256_sub_pd(iy3
,jy2
);
768 dz32
= _mm256_sub_pd(iz3
,jz2
);
769 dx33
= _mm256_sub_pd(ix3
,jx3
);
770 dy33
= _mm256_sub_pd(iy3
,jy3
);
771 dz33
= _mm256_sub_pd(iz3
,jz3
);
773 /* Calculate squared distance and things based on it */
774 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
775 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
776 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
777 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
778 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
779 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
780 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
781 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
782 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
783 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
785 rinv11
= avx256_invsqrt_d(rsq11
);
786 rinv12
= avx256_invsqrt_d(rsq12
);
787 rinv13
= avx256_invsqrt_d(rsq13
);
788 rinv21
= avx256_invsqrt_d(rsq21
);
789 rinv22
= avx256_invsqrt_d(rsq22
);
790 rinv23
= avx256_invsqrt_d(rsq23
);
791 rinv31
= avx256_invsqrt_d(rsq31
);
792 rinv32
= avx256_invsqrt_d(rsq32
);
793 rinv33
= avx256_invsqrt_d(rsq33
);
795 rinvsq00
= avx256_inv_d(rsq00
);
796 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
797 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
798 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
799 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
800 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
801 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
802 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
803 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
804 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
806 fjx0
= _mm256_setzero_pd();
807 fjy0
= _mm256_setzero_pd();
808 fjz0
= _mm256_setzero_pd();
809 fjx1
= _mm256_setzero_pd();
810 fjy1
= _mm256_setzero_pd();
811 fjz1
= _mm256_setzero_pd();
812 fjx2
= _mm256_setzero_pd();
813 fjy2
= _mm256_setzero_pd();
814 fjz2
= _mm256_setzero_pd();
815 fjx3
= _mm256_setzero_pd();
816 fjy3
= _mm256_setzero_pd();
817 fjz3
= _mm256_setzero_pd();
819 /**************************
820 * CALCULATE INTERACTIONS *
821 **************************/
823 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
826 /* LENNARD-JONES DISPERSION/REPULSION */
828 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
829 vvdw6
= _mm256_mul_pd(c6_00
,rinvsix
);
830 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
831 vvdw
= _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12
, _mm256_mul_pd(c12_00
,_mm256_mul_pd(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
832 _mm256_mul_pd( _mm256_sub_pd(vvdw6
,_mm256_mul_pd(c6_00
,sh_vdw_invrcut6
)),one_sixth
));
833 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
835 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
837 /* Update potential sum for this i atom from the interaction with this j atom. */
838 vvdw
= _mm256_and_pd(vvdw
,cutoff_mask
);
839 vvdw
= _mm256_andnot_pd(dummy_mask
,vvdw
);
840 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
844 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
846 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
848 /* Calculate temporary vectorial force */
849 tx
= _mm256_mul_pd(fscal
,dx00
);
850 ty
= _mm256_mul_pd(fscal
,dy00
);
851 tz
= _mm256_mul_pd(fscal
,dz00
);
853 /* Update vectorial force */
854 fix0
= _mm256_add_pd(fix0
,tx
);
855 fiy0
= _mm256_add_pd(fiy0
,ty
);
856 fiz0
= _mm256_add_pd(fiz0
,tz
);
858 fjx0
= _mm256_add_pd(fjx0
,tx
);
859 fjy0
= _mm256_add_pd(fjy0
,ty
);
860 fjz0
= _mm256_add_pd(fjz0
,tz
);
864 /**************************
865 * CALCULATE INTERACTIONS *
866 **************************/
868 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
871 /* REACTION-FIELD ELECTROSTATICS */
872 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_add_pd(rinv11
,_mm256_mul_pd(krf
,rsq11
)),crf
));
873 felec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_mul_pd(rinv11
,rinvsq11
),krf2
));
875 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
877 /* Update potential sum for this i atom from the interaction with this j atom. */
878 velec
= _mm256_and_pd(velec
,cutoff_mask
);
879 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
880 velecsum
= _mm256_add_pd(velecsum
,velec
);
884 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
886 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
888 /* Calculate temporary vectorial force */
889 tx
= _mm256_mul_pd(fscal
,dx11
);
890 ty
= _mm256_mul_pd(fscal
,dy11
);
891 tz
= _mm256_mul_pd(fscal
,dz11
);
893 /* Update vectorial force */
894 fix1
= _mm256_add_pd(fix1
,tx
);
895 fiy1
= _mm256_add_pd(fiy1
,ty
);
896 fiz1
= _mm256_add_pd(fiz1
,tz
);
898 fjx1
= _mm256_add_pd(fjx1
,tx
);
899 fjy1
= _mm256_add_pd(fjy1
,ty
);
900 fjz1
= _mm256_add_pd(fjz1
,tz
);
904 /**************************
905 * CALCULATE INTERACTIONS *
906 **************************/
908 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
911 /* REACTION-FIELD ELECTROSTATICS */
912 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_add_pd(rinv12
,_mm256_mul_pd(krf
,rsq12
)),crf
));
913 felec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_mul_pd(rinv12
,rinvsq12
),krf2
));
915 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
917 /* Update potential sum for this i atom from the interaction with this j atom. */
918 velec
= _mm256_and_pd(velec
,cutoff_mask
);
919 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
920 velecsum
= _mm256_add_pd(velecsum
,velec
);
924 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
926 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
928 /* Calculate temporary vectorial force */
929 tx
= _mm256_mul_pd(fscal
,dx12
);
930 ty
= _mm256_mul_pd(fscal
,dy12
);
931 tz
= _mm256_mul_pd(fscal
,dz12
);
933 /* Update vectorial force */
934 fix1
= _mm256_add_pd(fix1
,tx
);
935 fiy1
= _mm256_add_pd(fiy1
,ty
);
936 fiz1
= _mm256_add_pd(fiz1
,tz
);
938 fjx2
= _mm256_add_pd(fjx2
,tx
);
939 fjy2
= _mm256_add_pd(fjy2
,ty
);
940 fjz2
= _mm256_add_pd(fjz2
,tz
);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
951 /* REACTION-FIELD ELECTROSTATICS */
952 velec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(_mm256_add_pd(rinv13
,_mm256_mul_pd(krf
,rsq13
)),crf
));
953 felec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(_mm256_mul_pd(rinv13
,rinvsq13
),krf2
));
955 cutoff_mask
= _mm256_cmp_pd(rsq13
,rcutoff2
,_CMP_LT_OQ
);
957 /* Update potential sum for this i atom from the interaction with this j atom. */
958 velec
= _mm256_and_pd(velec
,cutoff_mask
);
959 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
960 velecsum
= _mm256_add_pd(velecsum
,velec
);
964 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
966 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
968 /* Calculate temporary vectorial force */
969 tx
= _mm256_mul_pd(fscal
,dx13
);
970 ty
= _mm256_mul_pd(fscal
,dy13
);
971 tz
= _mm256_mul_pd(fscal
,dz13
);
973 /* Update vectorial force */
974 fix1
= _mm256_add_pd(fix1
,tx
);
975 fiy1
= _mm256_add_pd(fiy1
,ty
);
976 fiz1
= _mm256_add_pd(fiz1
,tz
);
978 fjx3
= _mm256_add_pd(fjx3
,tx
);
979 fjy3
= _mm256_add_pd(fjy3
,ty
);
980 fjz3
= _mm256_add_pd(fjz3
,tz
);
984 /**************************
985 * CALCULATE INTERACTIONS *
986 **************************/
988 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
991 /* REACTION-FIELD ELECTROSTATICS */
992 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_add_pd(rinv21
,_mm256_mul_pd(krf
,rsq21
)),crf
));
993 felec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_mul_pd(rinv21
,rinvsq21
),krf2
));
995 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
997 /* Update potential sum for this i atom from the interaction with this j atom. */
998 velec
= _mm256_and_pd(velec
,cutoff_mask
);
999 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1000 velecsum
= _mm256_add_pd(velecsum
,velec
);
1004 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1006 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1008 /* Calculate temporary vectorial force */
1009 tx
= _mm256_mul_pd(fscal
,dx21
);
1010 ty
= _mm256_mul_pd(fscal
,dy21
);
1011 tz
= _mm256_mul_pd(fscal
,dz21
);
1013 /* Update vectorial force */
1014 fix2
= _mm256_add_pd(fix2
,tx
);
1015 fiy2
= _mm256_add_pd(fiy2
,ty
);
1016 fiz2
= _mm256_add_pd(fiz2
,tz
);
1018 fjx1
= _mm256_add_pd(fjx1
,tx
);
1019 fjy1
= _mm256_add_pd(fjy1
,ty
);
1020 fjz1
= _mm256_add_pd(fjz1
,tz
);
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
1031 /* REACTION-FIELD ELECTROSTATICS */
1032 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_add_pd(rinv22
,_mm256_mul_pd(krf
,rsq22
)),crf
));
1033 felec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_mul_pd(rinv22
,rinvsq22
),krf2
));
1035 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
1037 /* Update potential sum for this i atom from the interaction with this j atom. */
1038 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1039 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1040 velecsum
= _mm256_add_pd(velecsum
,velec
);
1044 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1046 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1048 /* Calculate temporary vectorial force */
1049 tx
= _mm256_mul_pd(fscal
,dx22
);
1050 ty
= _mm256_mul_pd(fscal
,dy22
);
1051 tz
= _mm256_mul_pd(fscal
,dz22
);
1053 /* Update vectorial force */
1054 fix2
= _mm256_add_pd(fix2
,tx
);
1055 fiy2
= _mm256_add_pd(fiy2
,ty
);
1056 fiz2
= _mm256_add_pd(fiz2
,tz
);
1058 fjx2
= _mm256_add_pd(fjx2
,tx
);
1059 fjy2
= _mm256_add_pd(fjy2
,ty
);
1060 fjz2
= _mm256_add_pd(fjz2
,tz
);
1064 /**************************
1065 * CALCULATE INTERACTIONS *
1066 **************************/
1068 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
1071 /* REACTION-FIELD ELECTROSTATICS */
1072 velec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(_mm256_add_pd(rinv23
,_mm256_mul_pd(krf
,rsq23
)),crf
));
1073 felec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(_mm256_mul_pd(rinv23
,rinvsq23
),krf2
));
1075 cutoff_mask
= _mm256_cmp_pd(rsq23
,rcutoff2
,_CMP_LT_OQ
);
1077 /* Update potential sum for this i atom from the interaction with this j atom. */
1078 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1079 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1080 velecsum
= _mm256_add_pd(velecsum
,velec
);
1084 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1086 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1088 /* Calculate temporary vectorial force */
1089 tx
= _mm256_mul_pd(fscal
,dx23
);
1090 ty
= _mm256_mul_pd(fscal
,dy23
);
1091 tz
= _mm256_mul_pd(fscal
,dz23
);
1093 /* Update vectorial force */
1094 fix2
= _mm256_add_pd(fix2
,tx
);
1095 fiy2
= _mm256_add_pd(fiy2
,ty
);
1096 fiz2
= _mm256_add_pd(fiz2
,tz
);
1098 fjx3
= _mm256_add_pd(fjx3
,tx
);
1099 fjy3
= _mm256_add_pd(fjy3
,ty
);
1100 fjz3
= _mm256_add_pd(fjz3
,tz
);
1104 /**************************
1105 * CALCULATE INTERACTIONS *
1106 **************************/
1108 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
1111 /* REACTION-FIELD ELECTROSTATICS */
1112 velec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(_mm256_add_pd(rinv31
,_mm256_mul_pd(krf
,rsq31
)),crf
));
1113 felec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(_mm256_mul_pd(rinv31
,rinvsq31
),krf2
));
1115 cutoff_mask
= _mm256_cmp_pd(rsq31
,rcutoff2
,_CMP_LT_OQ
);
1117 /* Update potential sum for this i atom from the interaction with this j atom. */
1118 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1119 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1120 velecsum
= _mm256_add_pd(velecsum
,velec
);
1124 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1126 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1128 /* Calculate temporary vectorial force */
1129 tx
= _mm256_mul_pd(fscal
,dx31
);
1130 ty
= _mm256_mul_pd(fscal
,dy31
);
1131 tz
= _mm256_mul_pd(fscal
,dz31
);
1133 /* Update vectorial force */
1134 fix3
= _mm256_add_pd(fix3
,tx
);
1135 fiy3
= _mm256_add_pd(fiy3
,ty
);
1136 fiz3
= _mm256_add_pd(fiz3
,tz
);
1138 fjx1
= _mm256_add_pd(fjx1
,tx
);
1139 fjy1
= _mm256_add_pd(fjy1
,ty
);
1140 fjz1
= _mm256_add_pd(fjz1
,tz
);
1144 /**************************
1145 * CALCULATE INTERACTIONS *
1146 **************************/
1148 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
1151 /* REACTION-FIELD ELECTROSTATICS */
1152 velec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(_mm256_add_pd(rinv32
,_mm256_mul_pd(krf
,rsq32
)),crf
));
1153 felec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(_mm256_mul_pd(rinv32
,rinvsq32
),krf2
));
1155 cutoff_mask
= _mm256_cmp_pd(rsq32
,rcutoff2
,_CMP_LT_OQ
);
1157 /* Update potential sum for this i atom from the interaction with this j atom. */
1158 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1159 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1160 velecsum
= _mm256_add_pd(velecsum
,velec
);
1164 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1166 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1168 /* Calculate temporary vectorial force */
1169 tx
= _mm256_mul_pd(fscal
,dx32
);
1170 ty
= _mm256_mul_pd(fscal
,dy32
);
1171 tz
= _mm256_mul_pd(fscal
,dz32
);
1173 /* Update vectorial force */
1174 fix3
= _mm256_add_pd(fix3
,tx
);
1175 fiy3
= _mm256_add_pd(fiy3
,ty
);
1176 fiz3
= _mm256_add_pd(fiz3
,tz
);
1178 fjx2
= _mm256_add_pd(fjx2
,tx
);
1179 fjy2
= _mm256_add_pd(fjy2
,ty
);
1180 fjz2
= _mm256_add_pd(fjz2
,tz
);
1184 /**************************
1185 * CALCULATE INTERACTIONS *
1186 **************************/
1188 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
1191 /* REACTION-FIELD ELECTROSTATICS */
1192 velec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(_mm256_add_pd(rinv33
,_mm256_mul_pd(krf
,rsq33
)),crf
));
1193 felec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(_mm256_mul_pd(rinv33
,rinvsq33
),krf2
));
1195 cutoff_mask
= _mm256_cmp_pd(rsq33
,rcutoff2
,_CMP_LT_OQ
);
1197 /* Update potential sum for this i atom from the interaction with this j atom. */
1198 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1199 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1200 velecsum
= _mm256_add_pd(velecsum
,velec
);
1204 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1206 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1208 /* Calculate temporary vectorial force */
1209 tx
= _mm256_mul_pd(fscal
,dx33
);
1210 ty
= _mm256_mul_pd(fscal
,dy33
);
1211 tz
= _mm256_mul_pd(fscal
,dz33
);
1213 /* Update vectorial force */
1214 fix3
= _mm256_add_pd(fix3
,tx
);
1215 fiy3
= _mm256_add_pd(fiy3
,ty
);
1216 fiz3
= _mm256_add_pd(fiz3
,tz
);
1218 fjx3
= _mm256_add_pd(fjx3
,tx
);
1219 fjy3
= _mm256_add_pd(fjy3
,ty
);
1220 fjz3
= _mm256_add_pd(fjz3
,tz
);
1224 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1225 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1226 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1227 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1229 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1230 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1231 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1233 /* Inner loop uses 368 flops */
1236 /* End of innermost loop */
1238 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1239 f
+i_coord_offset
,fshift
+i_shift_offset
);
1242 /* Update potential energies */
1243 gmx_mm256_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1244 gmx_mm256_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1246 /* Increment number of inner iterations */
1247 inneriter
+= j_index_end
- j_index_start
;
1249 /* Outer loop uses 26 flops */
1252 /* Increment number of outer iterations */
1255 /* Update outer/inner flops */
1257 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*368);
1260 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1261 * Electrostatics interaction: ReactionField
1262 * VdW interaction: LennardJones
1263 * Geometry: Water4-Water4
1264 * Calculate force/pot: Force
1267 nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_avx_256_double
1268 (t_nblist
* gmx_restrict nlist
,
1269 rvec
* gmx_restrict xx
,
1270 rvec
* gmx_restrict ff
,
1271 struct t_forcerec
* gmx_restrict fr
,
1272 t_mdatoms
* gmx_restrict mdatoms
,
1273 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1274 t_nrnb
* gmx_restrict nrnb
)
1276 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1277 * just 0 for non-waters.
1278 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1279 * jnr indices corresponding to data put in the four positions in the SIMD register.
1281 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1282 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1283 int jnrA
,jnrB
,jnrC
,jnrD
;
1284 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1285 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1286 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1287 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1288 real rcutoff_scalar
;
1289 real
*shiftvec
,*fshift
,*x
,*f
;
1290 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1291 real scratch
[4*DIM
];
1292 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1293 real
* vdwioffsetptr0
;
1294 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1295 real
* vdwioffsetptr1
;
1296 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1297 real
* vdwioffsetptr2
;
1298 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1299 real
* vdwioffsetptr3
;
1300 __m256d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1301 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1302 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1303 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1304 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1305 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1306 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1307 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1308 __m256d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1309 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1310 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1311 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1312 __m256d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1313 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1314 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1315 __m256d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1316 __m256d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1317 __m256d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1318 __m256d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1319 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1322 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1325 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
1326 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
1327 __m256d dummy_mask
,cutoff_mask
;
1328 __m128 tmpmask0
,tmpmask1
;
1329 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1330 __m256d one
= _mm256_set1_pd(1.0);
1331 __m256d two
= _mm256_set1_pd(2.0);
1337 jindex
= nlist
->jindex
;
1339 shiftidx
= nlist
->shift
;
1341 shiftvec
= fr
->shift_vec
[0];
1342 fshift
= fr
->fshift
[0];
1343 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
1344 charge
= mdatoms
->chargeA
;
1345 krf
= _mm256_set1_pd(fr
->ic
->k_rf
);
1346 krf2
= _mm256_set1_pd(fr
->ic
->k_rf
*2.0);
1347 crf
= _mm256_set1_pd(fr
->ic
->c_rf
);
1348 nvdwtype
= fr
->ntype
;
1349 vdwparam
= fr
->nbfp
;
1350 vdwtype
= mdatoms
->typeA
;
1352 /* Setup water-specific parameters */
1353 inr
= nlist
->iinr
[0];
1354 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
1355 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
1356 iq3
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+3]));
1357 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1359 jq1
= _mm256_set1_pd(charge
[inr
+1]);
1360 jq2
= _mm256_set1_pd(charge
[inr
+2]);
1361 jq3
= _mm256_set1_pd(charge
[inr
+3]);
1362 vdwjidx0A
= 2*vdwtype
[inr
+0];
1363 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
1364 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
1365 qq11
= _mm256_mul_pd(iq1
,jq1
);
1366 qq12
= _mm256_mul_pd(iq1
,jq2
);
1367 qq13
= _mm256_mul_pd(iq1
,jq3
);
1368 qq21
= _mm256_mul_pd(iq2
,jq1
);
1369 qq22
= _mm256_mul_pd(iq2
,jq2
);
1370 qq23
= _mm256_mul_pd(iq2
,jq3
);
1371 qq31
= _mm256_mul_pd(iq3
,jq1
);
1372 qq32
= _mm256_mul_pd(iq3
,jq2
);
1373 qq33
= _mm256_mul_pd(iq3
,jq3
);
1375 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1376 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1377 rcutoff
= _mm256_set1_pd(rcutoff_scalar
);
1378 rcutoff2
= _mm256_mul_pd(rcutoff
,rcutoff
);
1380 sh_vdw_invrcut6
= _mm256_set1_pd(fr
->ic
->sh_invrc6
);
1381 rvdw
= _mm256_set1_pd(fr
->ic
->rvdw
);
1383 /* Zero-initialize the j indices and offsets to silence compiler warnings */
1384 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1385 j_coord_offsetA
= 0;
1386 j_coord_offsetB
= 0;
1387 j_coord_offsetC
= 0;
1388 j_coord_offsetD
= 0;
1393 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1395 scratch
[iidx
] = 0.0;
1398 /* Start outer loop over neighborlists */
1399 for(iidx
=0; iidx
<nri
; iidx
++)
1401 /* Load shift vector for this list */
1402 i_shift_offset
= DIM
*shiftidx
[iidx
];
1404 /* Load limits for loop over neighbors */
1405 j_index_start
= jindex
[iidx
];
1406 j_index_end
= jindex
[iidx
+1];
1408 /* Get outer coordinate index */
1410 i_coord_offset
= DIM
*inr
;
1412 /* Load i particle coords and add shift vector */
1413 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1414 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1416 fix0
= _mm256_setzero_pd();
1417 fiy0
= _mm256_setzero_pd();
1418 fiz0
= _mm256_setzero_pd();
1419 fix1
= _mm256_setzero_pd();
1420 fiy1
= _mm256_setzero_pd();
1421 fiz1
= _mm256_setzero_pd();
1422 fix2
= _mm256_setzero_pd();
1423 fiy2
= _mm256_setzero_pd();
1424 fiz2
= _mm256_setzero_pd();
1425 fix3
= _mm256_setzero_pd();
1426 fiy3
= _mm256_setzero_pd();
1427 fiz3
= _mm256_setzero_pd();
1429 /* Start inner kernel loop */
1430 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1433 /* Get j neighbor index, and coordinate index */
1435 jnrB
= jjnr
[jidx
+1];
1436 jnrC
= jjnr
[jidx
+2];
1437 jnrD
= jjnr
[jidx
+3];
1438 j_coord_offsetA
= DIM
*jnrA
;
1439 j_coord_offsetB
= DIM
*jnrB
;
1440 j_coord_offsetC
= DIM
*jnrC
;
1441 j_coord_offsetD
= DIM
*jnrD
;
1443 /* load j atom coordinates */
1444 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1445 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1446 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1447 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1449 /* Calculate displacement vector */
1450 dx00
= _mm256_sub_pd(ix0
,jx0
);
1451 dy00
= _mm256_sub_pd(iy0
,jy0
);
1452 dz00
= _mm256_sub_pd(iz0
,jz0
);
1453 dx11
= _mm256_sub_pd(ix1
,jx1
);
1454 dy11
= _mm256_sub_pd(iy1
,jy1
);
1455 dz11
= _mm256_sub_pd(iz1
,jz1
);
1456 dx12
= _mm256_sub_pd(ix1
,jx2
);
1457 dy12
= _mm256_sub_pd(iy1
,jy2
);
1458 dz12
= _mm256_sub_pd(iz1
,jz2
);
1459 dx13
= _mm256_sub_pd(ix1
,jx3
);
1460 dy13
= _mm256_sub_pd(iy1
,jy3
);
1461 dz13
= _mm256_sub_pd(iz1
,jz3
);
1462 dx21
= _mm256_sub_pd(ix2
,jx1
);
1463 dy21
= _mm256_sub_pd(iy2
,jy1
);
1464 dz21
= _mm256_sub_pd(iz2
,jz1
);
1465 dx22
= _mm256_sub_pd(ix2
,jx2
);
1466 dy22
= _mm256_sub_pd(iy2
,jy2
);
1467 dz22
= _mm256_sub_pd(iz2
,jz2
);
1468 dx23
= _mm256_sub_pd(ix2
,jx3
);
1469 dy23
= _mm256_sub_pd(iy2
,jy3
);
1470 dz23
= _mm256_sub_pd(iz2
,jz3
);
1471 dx31
= _mm256_sub_pd(ix3
,jx1
);
1472 dy31
= _mm256_sub_pd(iy3
,jy1
);
1473 dz31
= _mm256_sub_pd(iz3
,jz1
);
1474 dx32
= _mm256_sub_pd(ix3
,jx2
);
1475 dy32
= _mm256_sub_pd(iy3
,jy2
);
1476 dz32
= _mm256_sub_pd(iz3
,jz2
);
1477 dx33
= _mm256_sub_pd(ix3
,jx3
);
1478 dy33
= _mm256_sub_pd(iy3
,jy3
);
1479 dz33
= _mm256_sub_pd(iz3
,jz3
);
1481 /* Calculate squared distance and things based on it */
1482 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1483 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1484 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
1485 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
1486 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
1487 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
1488 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
1489 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
1490 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
1491 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
1493 rinv11
= avx256_invsqrt_d(rsq11
);
1494 rinv12
= avx256_invsqrt_d(rsq12
);
1495 rinv13
= avx256_invsqrt_d(rsq13
);
1496 rinv21
= avx256_invsqrt_d(rsq21
);
1497 rinv22
= avx256_invsqrt_d(rsq22
);
1498 rinv23
= avx256_invsqrt_d(rsq23
);
1499 rinv31
= avx256_invsqrt_d(rsq31
);
1500 rinv32
= avx256_invsqrt_d(rsq32
);
1501 rinv33
= avx256_invsqrt_d(rsq33
);
1503 rinvsq00
= avx256_inv_d(rsq00
);
1504 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
1505 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
1506 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
1507 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
1508 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
1509 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
1510 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
1511 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
1512 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
1514 fjx0
= _mm256_setzero_pd();
1515 fjy0
= _mm256_setzero_pd();
1516 fjz0
= _mm256_setzero_pd();
1517 fjx1
= _mm256_setzero_pd();
1518 fjy1
= _mm256_setzero_pd();
1519 fjz1
= _mm256_setzero_pd();
1520 fjx2
= _mm256_setzero_pd();
1521 fjy2
= _mm256_setzero_pd();
1522 fjz2
= _mm256_setzero_pd();
1523 fjx3
= _mm256_setzero_pd();
1524 fjy3
= _mm256_setzero_pd();
1525 fjz3
= _mm256_setzero_pd();
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1534 /* LENNARD-JONES DISPERSION/REPULSION */
1536 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1537 fvdw
= _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),c6_00
),_mm256_mul_pd(rinvsix
,rinvsq00
));
1539 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1543 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1545 /* Calculate temporary vectorial force */
1546 tx
= _mm256_mul_pd(fscal
,dx00
);
1547 ty
= _mm256_mul_pd(fscal
,dy00
);
1548 tz
= _mm256_mul_pd(fscal
,dz00
);
1550 /* Update vectorial force */
1551 fix0
= _mm256_add_pd(fix0
,tx
);
1552 fiy0
= _mm256_add_pd(fiy0
,ty
);
1553 fiz0
= _mm256_add_pd(fiz0
,tz
);
1555 fjx0
= _mm256_add_pd(fjx0
,tx
);
1556 fjy0
= _mm256_add_pd(fjy0
,ty
);
1557 fjz0
= _mm256_add_pd(fjz0
,tz
);
1561 /**************************
1562 * CALCULATE INTERACTIONS *
1563 **************************/
1565 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
1568 /* REACTION-FIELD ELECTROSTATICS */
1569 felec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_mul_pd(rinv11
,rinvsq11
),krf2
));
1571 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
1575 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1577 /* Calculate temporary vectorial force */
1578 tx
= _mm256_mul_pd(fscal
,dx11
);
1579 ty
= _mm256_mul_pd(fscal
,dy11
);
1580 tz
= _mm256_mul_pd(fscal
,dz11
);
1582 /* Update vectorial force */
1583 fix1
= _mm256_add_pd(fix1
,tx
);
1584 fiy1
= _mm256_add_pd(fiy1
,ty
);
1585 fiz1
= _mm256_add_pd(fiz1
,tz
);
1587 fjx1
= _mm256_add_pd(fjx1
,tx
);
1588 fjy1
= _mm256_add_pd(fjy1
,ty
);
1589 fjz1
= _mm256_add_pd(fjz1
,tz
);
1593 /**************************
1594 * CALCULATE INTERACTIONS *
1595 **************************/
1597 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
1600 /* REACTION-FIELD ELECTROSTATICS */
1601 felec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_mul_pd(rinv12
,rinvsq12
),krf2
));
1603 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
1607 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1609 /* Calculate temporary vectorial force */
1610 tx
= _mm256_mul_pd(fscal
,dx12
);
1611 ty
= _mm256_mul_pd(fscal
,dy12
);
1612 tz
= _mm256_mul_pd(fscal
,dz12
);
1614 /* Update vectorial force */
1615 fix1
= _mm256_add_pd(fix1
,tx
);
1616 fiy1
= _mm256_add_pd(fiy1
,ty
);
1617 fiz1
= _mm256_add_pd(fiz1
,tz
);
1619 fjx2
= _mm256_add_pd(fjx2
,tx
);
1620 fjy2
= _mm256_add_pd(fjy2
,ty
);
1621 fjz2
= _mm256_add_pd(fjz2
,tz
);
1625 /**************************
1626 * CALCULATE INTERACTIONS *
1627 **************************/
1629 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
1632 /* REACTION-FIELD ELECTROSTATICS */
1633 felec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(_mm256_mul_pd(rinv13
,rinvsq13
),krf2
));
1635 cutoff_mask
= _mm256_cmp_pd(rsq13
,rcutoff2
,_CMP_LT_OQ
);
1639 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1641 /* Calculate temporary vectorial force */
1642 tx
= _mm256_mul_pd(fscal
,dx13
);
1643 ty
= _mm256_mul_pd(fscal
,dy13
);
1644 tz
= _mm256_mul_pd(fscal
,dz13
);
1646 /* Update vectorial force */
1647 fix1
= _mm256_add_pd(fix1
,tx
);
1648 fiy1
= _mm256_add_pd(fiy1
,ty
);
1649 fiz1
= _mm256_add_pd(fiz1
,tz
);
1651 fjx3
= _mm256_add_pd(fjx3
,tx
);
1652 fjy3
= _mm256_add_pd(fjy3
,ty
);
1653 fjz3
= _mm256_add_pd(fjz3
,tz
);
1657 /**************************
1658 * CALCULATE INTERACTIONS *
1659 **************************/
1661 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
1664 /* REACTION-FIELD ELECTROSTATICS */
1665 felec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_mul_pd(rinv21
,rinvsq21
),krf2
));
1667 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
1671 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1673 /* Calculate temporary vectorial force */
1674 tx
= _mm256_mul_pd(fscal
,dx21
);
1675 ty
= _mm256_mul_pd(fscal
,dy21
);
1676 tz
= _mm256_mul_pd(fscal
,dz21
);
1678 /* Update vectorial force */
1679 fix2
= _mm256_add_pd(fix2
,tx
);
1680 fiy2
= _mm256_add_pd(fiy2
,ty
);
1681 fiz2
= _mm256_add_pd(fiz2
,tz
);
1683 fjx1
= _mm256_add_pd(fjx1
,tx
);
1684 fjy1
= _mm256_add_pd(fjy1
,ty
);
1685 fjz1
= _mm256_add_pd(fjz1
,tz
);
1689 /**************************
1690 * CALCULATE INTERACTIONS *
1691 **************************/
1693 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
1696 /* REACTION-FIELD ELECTROSTATICS */
1697 felec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_mul_pd(rinv22
,rinvsq22
),krf2
));
1699 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
1703 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1705 /* Calculate temporary vectorial force */
1706 tx
= _mm256_mul_pd(fscal
,dx22
);
1707 ty
= _mm256_mul_pd(fscal
,dy22
);
1708 tz
= _mm256_mul_pd(fscal
,dz22
);
1710 /* Update vectorial force */
1711 fix2
= _mm256_add_pd(fix2
,tx
);
1712 fiy2
= _mm256_add_pd(fiy2
,ty
);
1713 fiz2
= _mm256_add_pd(fiz2
,tz
);
1715 fjx2
= _mm256_add_pd(fjx2
,tx
);
1716 fjy2
= _mm256_add_pd(fjy2
,ty
);
1717 fjz2
= _mm256_add_pd(fjz2
,tz
);
1721 /**************************
1722 * CALCULATE INTERACTIONS *
1723 **************************/
1725 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
1728 /* REACTION-FIELD ELECTROSTATICS */
1729 felec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(_mm256_mul_pd(rinv23
,rinvsq23
),krf2
));
1731 cutoff_mask
= _mm256_cmp_pd(rsq23
,rcutoff2
,_CMP_LT_OQ
);
1735 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1737 /* Calculate temporary vectorial force */
1738 tx
= _mm256_mul_pd(fscal
,dx23
);
1739 ty
= _mm256_mul_pd(fscal
,dy23
);
1740 tz
= _mm256_mul_pd(fscal
,dz23
);
1742 /* Update vectorial force */
1743 fix2
= _mm256_add_pd(fix2
,tx
);
1744 fiy2
= _mm256_add_pd(fiy2
,ty
);
1745 fiz2
= _mm256_add_pd(fiz2
,tz
);
1747 fjx3
= _mm256_add_pd(fjx3
,tx
);
1748 fjy3
= _mm256_add_pd(fjy3
,ty
);
1749 fjz3
= _mm256_add_pd(fjz3
,tz
);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
1760 /* REACTION-FIELD ELECTROSTATICS */
1761 felec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(_mm256_mul_pd(rinv31
,rinvsq31
),krf2
));
1763 cutoff_mask
= _mm256_cmp_pd(rsq31
,rcutoff2
,_CMP_LT_OQ
);
1767 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1769 /* Calculate temporary vectorial force */
1770 tx
= _mm256_mul_pd(fscal
,dx31
);
1771 ty
= _mm256_mul_pd(fscal
,dy31
);
1772 tz
= _mm256_mul_pd(fscal
,dz31
);
1774 /* Update vectorial force */
1775 fix3
= _mm256_add_pd(fix3
,tx
);
1776 fiy3
= _mm256_add_pd(fiy3
,ty
);
1777 fiz3
= _mm256_add_pd(fiz3
,tz
);
1779 fjx1
= _mm256_add_pd(fjx1
,tx
);
1780 fjy1
= _mm256_add_pd(fjy1
,ty
);
1781 fjz1
= _mm256_add_pd(fjz1
,tz
);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
1792 /* REACTION-FIELD ELECTROSTATICS */
1793 felec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(_mm256_mul_pd(rinv32
,rinvsq32
),krf2
));
1795 cutoff_mask
= _mm256_cmp_pd(rsq32
,rcutoff2
,_CMP_LT_OQ
);
1799 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1801 /* Calculate temporary vectorial force */
1802 tx
= _mm256_mul_pd(fscal
,dx32
);
1803 ty
= _mm256_mul_pd(fscal
,dy32
);
1804 tz
= _mm256_mul_pd(fscal
,dz32
);
1806 /* Update vectorial force */
1807 fix3
= _mm256_add_pd(fix3
,tx
);
1808 fiy3
= _mm256_add_pd(fiy3
,ty
);
1809 fiz3
= _mm256_add_pd(fiz3
,tz
);
1811 fjx2
= _mm256_add_pd(fjx2
,tx
);
1812 fjy2
= _mm256_add_pd(fjy2
,ty
);
1813 fjz2
= _mm256_add_pd(fjz2
,tz
);
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
1824 /* REACTION-FIELD ELECTROSTATICS */
1825 felec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(_mm256_mul_pd(rinv33
,rinvsq33
),krf2
));
1827 cutoff_mask
= _mm256_cmp_pd(rsq33
,rcutoff2
,_CMP_LT_OQ
);
1831 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1833 /* Calculate temporary vectorial force */
1834 tx
= _mm256_mul_pd(fscal
,dx33
);
1835 ty
= _mm256_mul_pd(fscal
,dy33
);
1836 tz
= _mm256_mul_pd(fscal
,dz33
);
1838 /* Update vectorial force */
1839 fix3
= _mm256_add_pd(fix3
,tx
);
1840 fiy3
= _mm256_add_pd(fiy3
,ty
);
1841 fiz3
= _mm256_add_pd(fiz3
,tz
);
1843 fjx3
= _mm256_add_pd(fjx3
,tx
);
1844 fjy3
= _mm256_add_pd(fjy3
,ty
);
1845 fjz3
= _mm256_add_pd(fjz3
,tz
);
1849 fjptrA
= f
+j_coord_offsetA
;
1850 fjptrB
= f
+j_coord_offsetB
;
1851 fjptrC
= f
+j_coord_offsetC
;
1852 fjptrD
= f
+j_coord_offsetD
;
1854 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1855 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1856 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1858 /* Inner loop uses 303 flops */
1861 if(jidx
<j_index_end
)
1864 /* Get j neighbor index, and coordinate index */
1865 jnrlistA
= jjnr
[jidx
];
1866 jnrlistB
= jjnr
[jidx
+1];
1867 jnrlistC
= jjnr
[jidx
+2];
1868 jnrlistD
= jjnr
[jidx
+3];
1869 /* Sign of each element will be negative for non-real atoms.
1870 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1871 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1873 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1875 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
1876 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
1877 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
1879 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1880 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1881 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1882 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1883 j_coord_offsetA
= DIM
*jnrA
;
1884 j_coord_offsetB
= DIM
*jnrB
;
1885 j_coord_offsetC
= DIM
*jnrC
;
1886 j_coord_offsetD
= DIM
*jnrD
;
1888 /* load j atom coordinates */
1889 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1890 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1891 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1892 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1894 /* Calculate displacement vector */
1895 dx00
= _mm256_sub_pd(ix0
,jx0
);
1896 dy00
= _mm256_sub_pd(iy0
,jy0
);
1897 dz00
= _mm256_sub_pd(iz0
,jz0
);
1898 dx11
= _mm256_sub_pd(ix1
,jx1
);
1899 dy11
= _mm256_sub_pd(iy1
,jy1
);
1900 dz11
= _mm256_sub_pd(iz1
,jz1
);
1901 dx12
= _mm256_sub_pd(ix1
,jx2
);
1902 dy12
= _mm256_sub_pd(iy1
,jy2
);
1903 dz12
= _mm256_sub_pd(iz1
,jz2
);
1904 dx13
= _mm256_sub_pd(ix1
,jx3
);
1905 dy13
= _mm256_sub_pd(iy1
,jy3
);
1906 dz13
= _mm256_sub_pd(iz1
,jz3
);
1907 dx21
= _mm256_sub_pd(ix2
,jx1
);
1908 dy21
= _mm256_sub_pd(iy2
,jy1
);
1909 dz21
= _mm256_sub_pd(iz2
,jz1
);
1910 dx22
= _mm256_sub_pd(ix2
,jx2
);
1911 dy22
= _mm256_sub_pd(iy2
,jy2
);
1912 dz22
= _mm256_sub_pd(iz2
,jz2
);
1913 dx23
= _mm256_sub_pd(ix2
,jx3
);
1914 dy23
= _mm256_sub_pd(iy2
,jy3
);
1915 dz23
= _mm256_sub_pd(iz2
,jz3
);
1916 dx31
= _mm256_sub_pd(ix3
,jx1
);
1917 dy31
= _mm256_sub_pd(iy3
,jy1
);
1918 dz31
= _mm256_sub_pd(iz3
,jz1
);
1919 dx32
= _mm256_sub_pd(ix3
,jx2
);
1920 dy32
= _mm256_sub_pd(iy3
,jy2
);
1921 dz32
= _mm256_sub_pd(iz3
,jz2
);
1922 dx33
= _mm256_sub_pd(ix3
,jx3
);
1923 dy33
= _mm256_sub_pd(iy3
,jy3
);
1924 dz33
= _mm256_sub_pd(iz3
,jz3
);
1926 /* Calculate squared distance and things based on it */
1927 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1928 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1929 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
1930 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
1931 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
1932 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
1933 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
1934 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
1935 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
1936 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
1938 rinv11
= avx256_invsqrt_d(rsq11
);
1939 rinv12
= avx256_invsqrt_d(rsq12
);
1940 rinv13
= avx256_invsqrt_d(rsq13
);
1941 rinv21
= avx256_invsqrt_d(rsq21
);
1942 rinv22
= avx256_invsqrt_d(rsq22
);
1943 rinv23
= avx256_invsqrt_d(rsq23
);
1944 rinv31
= avx256_invsqrt_d(rsq31
);
1945 rinv32
= avx256_invsqrt_d(rsq32
);
1946 rinv33
= avx256_invsqrt_d(rsq33
);
1948 rinvsq00
= avx256_inv_d(rsq00
);
1949 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
1950 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
1951 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
1952 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
1953 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
1954 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
1955 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
1956 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
1957 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
1959 fjx0
= _mm256_setzero_pd();
1960 fjy0
= _mm256_setzero_pd();
1961 fjz0
= _mm256_setzero_pd();
1962 fjx1
= _mm256_setzero_pd();
1963 fjy1
= _mm256_setzero_pd();
1964 fjz1
= _mm256_setzero_pd();
1965 fjx2
= _mm256_setzero_pd();
1966 fjy2
= _mm256_setzero_pd();
1967 fjz2
= _mm256_setzero_pd();
1968 fjx3
= _mm256_setzero_pd();
1969 fjy3
= _mm256_setzero_pd();
1970 fjz3
= _mm256_setzero_pd();
1972 /**************************
1973 * CALCULATE INTERACTIONS *
1974 **************************/
1976 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1979 /* LENNARD-JONES DISPERSION/REPULSION */
1981 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1982 fvdw
= _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),c6_00
),_mm256_mul_pd(rinvsix
,rinvsq00
));
1984 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1988 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1990 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1992 /* Calculate temporary vectorial force */
1993 tx
= _mm256_mul_pd(fscal
,dx00
);
1994 ty
= _mm256_mul_pd(fscal
,dy00
);
1995 tz
= _mm256_mul_pd(fscal
,dz00
);
1997 /* Update vectorial force */
1998 fix0
= _mm256_add_pd(fix0
,tx
);
1999 fiy0
= _mm256_add_pd(fiy0
,ty
);
2000 fiz0
= _mm256_add_pd(fiz0
,tz
);
2002 fjx0
= _mm256_add_pd(fjx0
,tx
);
2003 fjy0
= _mm256_add_pd(fjy0
,ty
);
2004 fjz0
= _mm256_add_pd(fjz0
,tz
);
2008 /**************************
2009 * CALCULATE INTERACTIONS *
2010 **************************/
2012 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
2015 /* REACTION-FIELD ELECTROSTATICS */
2016 felec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_mul_pd(rinv11
,rinvsq11
),krf2
));
2018 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
2022 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2024 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2026 /* Calculate temporary vectorial force */
2027 tx
= _mm256_mul_pd(fscal
,dx11
);
2028 ty
= _mm256_mul_pd(fscal
,dy11
);
2029 tz
= _mm256_mul_pd(fscal
,dz11
);
2031 /* Update vectorial force */
2032 fix1
= _mm256_add_pd(fix1
,tx
);
2033 fiy1
= _mm256_add_pd(fiy1
,ty
);
2034 fiz1
= _mm256_add_pd(fiz1
,tz
);
2036 fjx1
= _mm256_add_pd(fjx1
,tx
);
2037 fjy1
= _mm256_add_pd(fjy1
,ty
);
2038 fjz1
= _mm256_add_pd(fjz1
,tz
);
2042 /**************************
2043 * CALCULATE INTERACTIONS *
2044 **************************/
2046 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
2049 /* REACTION-FIELD ELECTROSTATICS */
2050 felec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_mul_pd(rinv12
,rinvsq12
),krf2
));
2052 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
2056 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2058 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2060 /* Calculate temporary vectorial force */
2061 tx
= _mm256_mul_pd(fscal
,dx12
);
2062 ty
= _mm256_mul_pd(fscal
,dy12
);
2063 tz
= _mm256_mul_pd(fscal
,dz12
);
2065 /* Update vectorial force */
2066 fix1
= _mm256_add_pd(fix1
,tx
);
2067 fiy1
= _mm256_add_pd(fiy1
,ty
);
2068 fiz1
= _mm256_add_pd(fiz1
,tz
);
2070 fjx2
= _mm256_add_pd(fjx2
,tx
);
2071 fjy2
= _mm256_add_pd(fjy2
,ty
);
2072 fjz2
= _mm256_add_pd(fjz2
,tz
);
2076 /**************************
2077 * CALCULATE INTERACTIONS *
2078 **************************/
2080 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
2083 /* REACTION-FIELD ELECTROSTATICS */
2084 felec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(_mm256_mul_pd(rinv13
,rinvsq13
),krf2
));
2086 cutoff_mask
= _mm256_cmp_pd(rsq13
,rcutoff2
,_CMP_LT_OQ
);
2090 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2092 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2094 /* Calculate temporary vectorial force */
2095 tx
= _mm256_mul_pd(fscal
,dx13
);
2096 ty
= _mm256_mul_pd(fscal
,dy13
);
2097 tz
= _mm256_mul_pd(fscal
,dz13
);
2099 /* Update vectorial force */
2100 fix1
= _mm256_add_pd(fix1
,tx
);
2101 fiy1
= _mm256_add_pd(fiy1
,ty
);
2102 fiz1
= _mm256_add_pd(fiz1
,tz
);
2104 fjx3
= _mm256_add_pd(fjx3
,tx
);
2105 fjy3
= _mm256_add_pd(fjy3
,ty
);
2106 fjz3
= _mm256_add_pd(fjz3
,tz
);
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
2117 /* REACTION-FIELD ELECTROSTATICS */
2118 felec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_mul_pd(rinv21
,rinvsq21
),krf2
));
2120 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
2124 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2126 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2128 /* Calculate temporary vectorial force */
2129 tx
= _mm256_mul_pd(fscal
,dx21
);
2130 ty
= _mm256_mul_pd(fscal
,dy21
);
2131 tz
= _mm256_mul_pd(fscal
,dz21
);
2133 /* Update vectorial force */
2134 fix2
= _mm256_add_pd(fix2
,tx
);
2135 fiy2
= _mm256_add_pd(fiy2
,ty
);
2136 fiz2
= _mm256_add_pd(fiz2
,tz
);
2138 fjx1
= _mm256_add_pd(fjx1
,tx
);
2139 fjy1
= _mm256_add_pd(fjy1
,ty
);
2140 fjz1
= _mm256_add_pd(fjz1
,tz
);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
2151 /* REACTION-FIELD ELECTROSTATICS */
2152 felec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_mul_pd(rinv22
,rinvsq22
),krf2
));
2154 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
2158 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2160 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2162 /* Calculate temporary vectorial force */
2163 tx
= _mm256_mul_pd(fscal
,dx22
);
2164 ty
= _mm256_mul_pd(fscal
,dy22
);
2165 tz
= _mm256_mul_pd(fscal
,dz22
);
2167 /* Update vectorial force */
2168 fix2
= _mm256_add_pd(fix2
,tx
);
2169 fiy2
= _mm256_add_pd(fiy2
,ty
);
2170 fiz2
= _mm256_add_pd(fiz2
,tz
);
2172 fjx2
= _mm256_add_pd(fjx2
,tx
);
2173 fjy2
= _mm256_add_pd(fjy2
,ty
);
2174 fjz2
= _mm256_add_pd(fjz2
,tz
);
2178 /**************************
2179 * CALCULATE INTERACTIONS *
2180 **************************/
2182 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
2185 /* REACTION-FIELD ELECTROSTATICS */
2186 felec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(_mm256_mul_pd(rinv23
,rinvsq23
),krf2
));
2188 cutoff_mask
= _mm256_cmp_pd(rsq23
,rcutoff2
,_CMP_LT_OQ
);
2192 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2194 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2196 /* Calculate temporary vectorial force */
2197 tx
= _mm256_mul_pd(fscal
,dx23
);
2198 ty
= _mm256_mul_pd(fscal
,dy23
);
2199 tz
= _mm256_mul_pd(fscal
,dz23
);
2201 /* Update vectorial force */
2202 fix2
= _mm256_add_pd(fix2
,tx
);
2203 fiy2
= _mm256_add_pd(fiy2
,ty
);
2204 fiz2
= _mm256_add_pd(fiz2
,tz
);
2206 fjx3
= _mm256_add_pd(fjx3
,tx
);
2207 fjy3
= _mm256_add_pd(fjy3
,ty
);
2208 fjz3
= _mm256_add_pd(fjz3
,tz
);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
2219 /* REACTION-FIELD ELECTROSTATICS */
2220 felec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(_mm256_mul_pd(rinv31
,rinvsq31
),krf2
));
2222 cutoff_mask
= _mm256_cmp_pd(rsq31
,rcutoff2
,_CMP_LT_OQ
);
2226 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2228 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2230 /* Calculate temporary vectorial force */
2231 tx
= _mm256_mul_pd(fscal
,dx31
);
2232 ty
= _mm256_mul_pd(fscal
,dy31
);
2233 tz
= _mm256_mul_pd(fscal
,dz31
);
2235 /* Update vectorial force */
2236 fix3
= _mm256_add_pd(fix3
,tx
);
2237 fiy3
= _mm256_add_pd(fiy3
,ty
);
2238 fiz3
= _mm256_add_pd(fiz3
,tz
);
2240 fjx1
= _mm256_add_pd(fjx1
,tx
);
2241 fjy1
= _mm256_add_pd(fjy1
,ty
);
2242 fjz1
= _mm256_add_pd(fjz1
,tz
);
2246 /**************************
2247 * CALCULATE INTERACTIONS *
2248 **************************/
2250 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
2253 /* REACTION-FIELD ELECTROSTATICS */
2254 felec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(_mm256_mul_pd(rinv32
,rinvsq32
),krf2
));
2256 cutoff_mask
= _mm256_cmp_pd(rsq32
,rcutoff2
,_CMP_LT_OQ
);
2260 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2262 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2264 /* Calculate temporary vectorial force */
2265 tx
= _mm256_mul_pd(fscal
,dx32
);
2266 ty
= _mm256_mul_pd(fscal
,dy32
);
2267 tz
= _mm256_mul_pd(fscal
,dz32
);
2269 /* Update vectorial force */
2270 fix3
= _mm256_add_pd(fix3
,tx
);
2271 fiy3
= _mm256_add_pd(fiy3
,ty
);
2272 fiz3
= _mm256_add_pd(fiz3
,tz
);
2274 fjx2
= _mm256_add_pd(fjx2
,tx
);
2275 fjy2
= _mm256_add_pd(fjy2
,ty
);
2276 fjz2
= _mm256_add_pd(fjz2
,tz
);
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2284 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
2287 /* REACTION-FIELD ELECTROSTATICS */
2288 felec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(_mm256_mul_pd(rinv33
,rinvsq33
),krf2
));
2290 cutoff_mask
= _mm256_cmp_pd(rsq33
,rcutoff2
,_CMP_LT_OQ
);
2294 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2296 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2298 /* Calculate temporary vectorial force */
2299 tx
= _mm256_mul_pd(fscal
,dx33
);
2300 ty
= _mm256_mul_pd(fscal
,dy33
);
2301 tz
= _mm256_mul_pd(fscal
,dz33
);
2303 /* Update vectorial force */
2304 fix3
= _mm256_add_pd(fix3
,tx
);
2305 fiy3
= _mm256_add_pd(fiy3
,ty
);
2306 fiz3
= _mm256_add_pd(fiz3
,tz
);
2308 fjx3
= _mm256_add_pd(fjx3
,tx
);
2309 fjy3
= _mm256_add_pd(fjy3
,ty
);
2310 fjz3
= _mm256_add_pd(fjz3
,tz
);
2314 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2315 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2316 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2317 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2319 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2320 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2321 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2323 /* Inner loop uses 303 flops */
2326 /* End of innermost loop */
2328 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2329 f
+i_coord_offset
,fshift
+i_shift_offset
);
2331 /* Increment number of inner iterations */
2332 inneriter
+= j_index_end
- j_index_start
;
2334 /* Outer loop uses 24 flops */
2337 /* Increment number of outer iterations */
2340 /* Update outer/inner flops */
2342 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*303);