2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_avx_256_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrE
,jnrF
,jnrG
,jnrH
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
79 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
81 real
*shiftvec
,*fshift
,*x
,*f
;
82 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
84 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 real
* vdwioffsetptr0
;
86 real
* vdwgridioffsetptr0
;
87 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
88 real
* vdwioffsetptr1
;
89 real
* vdwgridioffsetptr1
;
90 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
91 real
* vdwioffsetptr2
;
92 real
* vdwgridioffsetptr2
;
93 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
94 real
* vdwioffsetptr3
;
95 real
* vdwgridioffsetptr3
;
96 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
97 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
98 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
99 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
,vdwjidx1E
,vdwjidx1F
,vdwjidx1G
,vdwjidx1H
;
100 __m256 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
101 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
,vdwjidx2E
,vdwjidx2F
,vdwjidx2G
,vdwjidx2H
;
102 __m256 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
103 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
,vdwjidx3E
,vdwjidx3F
,vdwjidx3G
,vdwjidx3H
;
104 __m256 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
105 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
106 __m256 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
107 __m256 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
108 __m256 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
109 __m256 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
110 __m256 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
111 __m256 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
112 __m256 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
113 __m256 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
114 __m256 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
115 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
118 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
121 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
122 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
134 __m256 ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
135 __m256 one_half
= _mm256_set1_ps(0.5);
136 __m256 minus_one
= _mm256_set1_ps(-1.0);
138 __m128i ewitab_lo
,ewitab_hi
;
139 __m256 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
140 __m256 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
142 __m256 dummy_mask
,cutoff_mask
;
143 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
144 __m256 one
= _mm256_set1_ps(1.0);
145 __m256 two
= _mm256_set1_ps(2.0);
151 jindex
= nlist
->jindex
;
153 shiftidx
= nlist
->shift
;
155 shiftvec
= fr
->shift_vec
[0];
156 fshift
= fr
->fshift
[0];
157 facel
= _mm256_set1_ps(fr
->ic
->epsfac
);
158 charge
= mdatoms
->chargeA
;
159 nvdwtype
= fr
->ntype
;
161 vdwtype
= mdatoms
->typeA
;
162 vdwgridparam
= fr
->ljpme_c6grid
;
163 sh_lj_ewald
= _mm256_set1_ps(fr
->ic
->sh_lj_ewald
);
164 ewclj
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_lj
);
165 ewclj2
= _mm256_mul_ps(minus_one
,_mm256_mul_ps(ewclj
,ewclj
));
167 sh_ewald
= _mm256_set1_ps(fr
->ic
->sh_ewald
);
168 beta
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_q
);
169 beta2
= _mm256_mul_ps(beta
,beta
);
170 beta3
= _mm256_mul_ps(beta
,beta2
);
172 ewtab
= fr
->ic
->tabq_coul_FDV0
;
173 ewtabscale
= _mm256_set1_ps(fr
->ic
->tabq_scale
);
174 ewtabhalfspace
= _mm256_set1_ps(0.5/fr
->ic
->tabq_scale
);
176 /* Setup water-specific parameters */
177 inr
= nlist
->iinr
[0];
178 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
179 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
180 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
181 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
182 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
184 jq1
= _mm256_set1_ps(charge
[inr
+1]);
185 jq2
= _mm256_set1_ps(charge
[inr
+2]);
186 jq3
= _mm256_set1_ps(charge
[inr
+3]);
187 vdwjidx0A
= 2*vdwtype
[inr
+0];
188 c6_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
]);
189 c12_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
+1]);
190 c6grid_00
= _mm256_set1_ps(vdwgridioffsetptr0
[vdwjidx0A
]);
191 qq11
= _mm256_mul_ps(iq1
,jq1
);
192 qq12
= _mm256_mul_ps(iq1
,jq2
);
193 qq13
= _mm256_mul_ps(iq1
,jq3
);
194 qq21
= _mm256_mul_ps(iq2
,jq1
);
195 qq22
= _mm256_mul_ps(iq2
,jq2
);
196 qq23
= _mm256_mul_ps(iq2
,jq3
);
197 qq31
= _mm256_mul_ps(iq3
,jq1
);
198 qq32
= _mm256_mul_ps(iq3
,jq2
);
199 qq33
= _mm256_mul_ps(iq3
,jq3
);
201 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
202 rcutoff_scalar
= fr
->ic
->rcoulomb
;
203 rcutoff
= _mm256_set1_ps(rcutoff_scalar
);
204 rcutoff2
= _mm256_mul_ps(rcutoff
,rcutoff
);
206 sh_vdw_invrcut6
= _mm256_set1_ps(fr
->ic
->sh_invrc6
);
207 rvdw
= _mm256_set1_ps(fr
->ic
->rvdw
);
209 /* Avoid stupid compiler warnings */
210 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
223 for(iidx
=0;iidx
<4*DIM
;iidx
++)
228 /* Start outer loop over neighborlists */
229 for(iidx
=0; iidx
<nri
; iidx
++)
231 /* Load shift vector for this list */
232 i_shift_offset
= DIM
*shiftidx
[iidx
];
234 /* Load limits for loop over neighbors */
235 j_index_start
= jindex
[iidx
];
236 j_index_end
= jindex
[iidx
+1];
238 /* Get outer coordinate index */
240 i_coord_offset
= DIM
*inr
;
242 /* Load i particle coords and add shift vector */
243 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
244 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
246 fix0
= _mm256_setzero_ps();
247 fiy0
= _mm256_setzero_ps();
248 fiz0
= _mm256_setzero_ps();
249 fix1
= _mm256_setzero_ps();
250 fiy1
= _mm256_setzero_ps();
251 fiz1
= _mm256_setzero_ps();
252 fix2
= _mm256_setzero_ps();
253 fiy2
= _mm256_setzero_ps();
254 fiz2
= _mm256_setzero_ps();
255 fix3
= _mm256_setzero_ps();
256 fiy3
= _mm256_setzero_ps();
257 fiz3
= _mm256_setzero_ps();
259 /* Reset potential sums */
260 velecsum
= _mm256_setzero_ps();
261 vvdwsum
= _mm256_setzero_ps();
263 /* Start inner kernel loop */
264 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
267 /* Get j neighbor index, and coordinate index */
276 j_coord_offsetA
= DIM
*jnrA
;
277 j_coord_offsetB
= DIM
*jnrB
;
278 j_coord_offsetC
= DIM
*jnrC
;
279 j_coord_offsetD
= DIM
*jnrD
;
280 j_coord_offsetE
= DIM
*jnrE
;
281 j_coord_offsetF
= DIM
*jnrF
;
282 j_coord_offsetG
= DIM
*jnrG
;
283 j_coord_offsetH
= DIM
*jnrH
;
285 /* load j atom coordinates */
286 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
287 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
288 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
289 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
290 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
291 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
293 /* Calculate displacement vector */
294 dx00
= _mm256_sub_ps(ix0
,jx0
);
295 dy00
= _mm256_sub_ps(iy0
,jy0
);
296 dz00
= _mm256_sub_ps(iz0
,jz0
);
297 dx11
= _mm256_sub_ps(ix1
,jx1
);
298 dy11
= _mm256_sub_ps(iy1
,jy1
);
299 dz11
= _mm256_sub_ps(iz1
,jz1
);
300 dx12
= _mm256_sub_ps(ix1
,jx2
);
301 dy12
= _mm256_sub_ps(iy1
,jy2
);
302 dz12
= _mm256_sub_ps(iz1
,jz2
);
303 dx13
= _mm256_sub_ps(ix1
,jx3
);
304 dy13
= _mm256_sub_ps(iy1
,jy3
);
305 dz13
= _mm256_sub_ps(iz1
,jz3
);
306 dx21
= _mm256_sub_ps(ix2
,jx1
);
307 dy21
= _mm256_sub_ps(iy2
,jy1
);
308 dz21
= _mm256_sub_ps(iz2
,jz1
);
309 dx22
= _mm256_sub_ps(ix2
,jx2
);
310 dy22
= _mm256_sub_ps(iy2
,jy2
);
311 dz22
= _mm256_sub_ps(iz2
,jz2
);
312 dx23
= _mm256_sub_ps(ix2
,jx3
);
313 dy23
= _mm256_sub_ps(iy2
,jy3
);
314 dz23
= _mm256_sub_ps(iz2
,jz3
);
315 dx31
= _mm256_sub_ps(ix3
,jx1
);
316 dy31
= _mm256_sub_ps(iy3
,jy1
);
317 dz31
= _mm256_sub_ps(iz3
,jz1
);
318 dx32
= _mm256_sub_ps(ix3
,jx2
);
319 dy32
= _mm256_sub_ps(iy3
,jy2
);
320 dz32
= _mm256_sub_ps(iz3
,jz2
);
321 dx33
= _mm256_sub_ps(ix3
,jx3
);
322 dy33
= _mm256_sub_ps(iy3
,jy3
);
323 dz33
= _mm256_sub_ps(iz3
,jz3
);
325 /* Calculate squared distance and things based on it */
326 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
327 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
328 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
329 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
330 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
331 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
332 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
333 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
334 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
335 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
337 rinv00
= avx256_invsqrt_f(rsq00
);
338 rinv11
= avx256_invsqrt_f(rsq11
);
339 rinv12
= avx256_invsqrt_f(rsq12
);
340 rinv13
= avx256_invsqrt_f(rsq13
);
341 rinv21
= avx256_invsqrt_f(rsq21
);
342 rinv22
= avx256_invsqrt_f(rsq22
);
343 rinv23
= avx256_invsqrt_f(rsq23
);
344 rinv31
= avx256_invsqrt_f(rsq31
);
345 rinv32
= avx256_invsqrt_f(rsq32
);
346 rinv33
= avx256_invsqrt_f(rsq33
);
348 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
349 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
350 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
351 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
352 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
353 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
354 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
355 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
356 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
357 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
359 fjx0
= _mm256_setzero_ps();
360 fjy0
= _mm256_setzero_ps();
361 fjz0
= _mm256_setzero_ps();
362 fjx1
= _mm256_setzero_ps();
363 fjy1
= _mm256_setzero_ps();
364 fjz1
= _mm256_setzero_ps();
365 fjx2
= _mm256_setzero_ps();
366 fjy2
= _mm256_setzero_ps();
367 fjz2
= _mm256_setzero_ps();
368 fjx3
= _mm256_setzero_ps();
369 fjy3
= _mm256_setzero_ps();
370 fjz3
= _mm256_setzero_ps();
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
379 r00
= _mm256_mul_ps(rsq00
,rinv00
);
381 /* Analytical LJ-PME */
382 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
383 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
384 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
385 exponent
= avx256_exp_f(ewcljrsq
);
386 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
387 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
388 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
389 vvdw6
= _mm256_mul_ps(_mm256_sub_ps(c6_00
,_mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
))),rinvsix
);
390 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
391 vvdw
= _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12
, _mm256_mul_ps(c12_00
,_mm256_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
392 _mm256_mul_ps( _mm256_sub_ps(vvdw6
,_mm256_add_ps(_mm256_mul_ps(c6_00
,sh_vdw_invrcut6
),_mm256_mul_ps(c6grid_00
,sh_lj_ewald
))),one_sixth
));
393 /* fvdw = (vvdw12 - (vvdw6 + (C6grid * exponent * beta^6)/6)) / r^2 */
394 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,_mm256_sub_ps(vvdw6
,_mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
)))),rinvsq00
);
396 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 vvdw
= _mm256_and_ps(vvdw
,cutoff_mask
);
400 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
404 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
406 /* Calculate temporary vectorial force */
407 tx
= _mm256_mul_ps(fscal
,dx00
);
408 ty
= _mm256_mul_ps(fscal
,dy00
);
409 tz
= _mm256_mul_ps(fscal
,dz00
);
411 /* Update vectorial force */
412 fix0
= _mm256_add_ps(fix0
,tx
);
413 fiy0
= _mm256_add_ps(fiy0
,ty
);
414 fiz0
= _mm256_add_ps(fiz0
,tz
);
416 fjx0
= _mm256_add_ps(fjx0
,tx
);
417 fjy0
= _mm256_add_ps(fjy0
,ty
);
418 fjz0
= _mm256_add_ps(fjz0
,tz
);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
429 r11
= _mm256_mul_ps(rsq11
,rinv11
);
431 /* EWALD ELECTROSTATICS */
433 /* Analytical PME correction */
434 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
435 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
436 pmecorrF
= avx256_pmecorrF_f(zeta2
);
437 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
438 felec
= _mm256_mul_ps(qq11
,felec
);
439 pmecorrV
= avx256_pmecorrV_f(zeta2
);
440 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
441 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv11
,sh_ewald
),pmecorrV
);
442 velec
= _mm256_mul_ps(qq11
,velec
);
444 cutoff_mask
= _mm256_cmp_ps(rsq11
,rcutoff2
,_CMP_LT_OQ
);
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velec
= _mm256_and_ps(velec
,cutoff_mask
);
448 velecsum
= _mm256_add_ps(velecsum
,velec
);
452 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
454 /* Calculate temporary vectorial force */
455 tx
= _mm256_mul_ps(fscal
,dx11
);
456 ty
= _mm256_mul_ps(fscal
,dy11
);
457 tz
= _mm256_mul_ps(fscal
,dz11
);
459 /* Update vectorial force */
460 fix1
= _mm256_add_ps(fix1
,tx
);
461 fiy1
= _mm256_add_ps(fiy1
,ty
);
462 fiz1
= _mm256_add_ps(fiz1
,tz
);
464 fjx1
= _mm256_add_ps(fjx1
,tx
);
465 fjy1
= _mm256_add_ps(fjy1
,ty
);
466 fjz1
= _mm256_add_ps(fjz1
,tz
);
470 /**************************
471 * CALCULATE INTERACTIONS *
472 **************************/
474 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
477 r12
= _mm256_mul_ps(rsq12
,rinv12
);
479 /* EWALD ELECTROSTATICS */
481 /* Analytical PME correction */
482 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
483 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
484 pmecorrF
= avx256_pmecorrF_f(zeta2
);
485 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
486 felec
= _mm256_mul_ps(qq12
,felec
);
487 pmecorrV
= avx256_pmecorrV_f(zeta2
);
488 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
489 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv12
,sh_ewald
),pmecorrV
);
490 velec
= _mm256_mul_ps(qq12
,velec
);
492 cutoff_mask
= _mm256_cmp_ps(rsq12
,rcutoff2
,_CMP_LT_OQ
);
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velec
= _mm256_and_ps(velec
,cutoff_mask
);
496 velecsum
= _mm256_add_ps(velecsum
,velec
);
500 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
502 /* Calculate temporary vectorial force */
503 tx
= _mm256_mul_ps(fscal
,dx12
);
504 ty
= _mm256_mul_ps(fscal
,dy12
);
505 tz
= _mm256_mul_ps(fscal
,dz12
);
507 /* Update vectorial force */
508 fix1
= _mm256_add_ps(fix1
,tx
);
509 fiy1
= _mm256_add_ps(fiy1
,ty
);
510 fiz1
= _mm256_add_ps(fiz1
,tz
);
512 fjx2
= _mm256_add_ps(fjx2
,tx
);
513 fjy2
= _mm256_add_ps(fjy2
,ty
);
514 fjz2
= _mm256_add_ps(fjz2
,tz
);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
525 r13
= _mm256_mul_ps(rsq13
,rinv13
);
527 /* EWALD ELECTROSTATICS */
529 /* Analytical PME correction */
530 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
531 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
532 pmecorrF
= avx256_pmecorrF_f(zeta2
);
533 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
534 felec
= _mm256_mul_ps(qq13
,felec
);
535 pmecorrV
= avx256_pmecorrV_f(zeta2
);
536 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
537 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv13
,sh_ewald
),pmecorrV
);
538 velec
= _mm256_mul_ps(qq13
,velec
);
540 cutoff_mask
= _mm256_cmp_ps(rsq13
,rcutoff2
,_CMP_LT_OQ
);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velec
= _mm256_and_ps(velec
,cutoff_mask
);
544 velecsum
= _mm256_add_ps(velecsum
,velec
);
548 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
550 /* Calculate temporary vectorial force */
551 tx
= _mm256_mul_ps(fscal
,dx13
);
552 ty
= _mm256_mul_ps(fscal
,dy13
);
553 tz
= _mm256_mul_ps(fscal
,dz13
);
555 /* Update vectorial force */
556 fix1
= _mm256_add_ps(fix1
,tx
);
557 fiy1
= _mm256_add_ps(fiy1
,ty
);
558 fiz1
= _mm256_add_ps(fiz1
,tz
);
560 fjx3
= _mm256_add_ps(fjx3
,tx
);
561 fjy3
= _mm256_add_ps(fjy3
,ty
);
562 fjz3
= _mm256_add_ps(fjz3
,tz
);
566 /**************************
567 * CALCULATE INTERACTIONS *
568 **************************/
570 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
573 r21
= _mm256_mul_ps(rsq21
,rinv21
);
575 /* EWALD ELECTROSTATICS */
577 /* Analytical PME correction */
578 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
579 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
580 pmecorrF
= avx256_pmecorrF_f(zeta2
);
581 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
582 felec
= _mm256_mul_ps(qq21
,felec
);
583 pmecorrV
= avx256_pmecorrV_f(zeta2
);
584 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
585 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv21
,sh_ewald
),pmecorrV
);
586 velec
= _mm256_mul_ps(qq21
,velec
);
588 cutoff_mask
= _mm256_cmp_ps(rsq21
,rcutoff2
,_CMP_LT_OQ
);
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velec
= _mm256_and_ps(velec
,cutoff_mask
);
592 velecsum
= _mm256_add_ps(velecsum
,velec
);
596 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
598 /* Calculate temporary vectorial force */
599 tx
= _mm256_mul_ps(fscal
,dx21
);
600 ty
= _mm256_mul_ps(fscal
,dy21
);
601 tz
= _mm256_mul_ps(fscal
,dz21
);
603 /* Update vectorial force */
604 fix2
= _mm256_add_ps(fix2
,tx
);
605 fiy2
= _mm256_add_ps(fiy2
,ty
);
606 fiz2
= _mm256_add_ps(fiz2
,tz
);
608 fjx1
= _mm256_add_ps(fjx1
,tx
);
609 fjy1
= _mm256_add_ps(fjy1
,ty
);
610 fjz1
= _mm256_add_ps(fjz1
,tz
);
614 /**************************
615 * CALCULATE INTERACTIONS *
616 **************************/
618 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
621 r22
= _mm256_mul_ps(rsq22
,rinv22
);
623 /* EWALD ELECTROSTATICS */
625 /* Analytical PME correction */
626 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
627 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
628 pmecorrF
= avx256_pmecorrF_f(zeta2
);
629 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
630 felec
= _mm256_mul_ps(qq22
,felec
);
631 pmecorrV
= avx256_pmecorrV_f(zeta2
);
632 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
633 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv22
,sh_ewald
),pmecorrV
);
634 velec
= _mm256_mul_ps(qq22
,velec
);
636 cutoff_mask
= _mm256_cmp_ps(rsq22
,rcutoff2
,_CMP_LT_OQ
);
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velec
= _mm256_and_ps(velec
,cutoff_mask
);
640 velecsum
= _mm256_add_ps(velecsum
,velec
);
644 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
646 /* Calculate temporary vectorial force */
647 tx
= _mm256_mul_ps(fscal
,dx22
);
648 ty
= _mm256_mul_ps(fscal
,dy22
);
649 tz
= _mm256_mul_ps(fscal
,dz22
);
651 /* Update vectorial force */
652 fix2
= _mm256_add_ps(fix2
,tx
);
653 fiy2
= _mm256_add_ps(fiy2
,ty
);
654 fiz2
= _mm256_add_ps(fiz2
,tz
);
656 fjx2
= _mm256_add_ps(fjx2
,tx
);
657 fjy2
= _mm256_add_ps(fjy2
,ty
);
658 fjz2
= _mm256_add_ps(fjz2
,tz
);
662 /**************************
663 * CALCULATE INTERACTIONS *
664 **************************/
666 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
669 r23
= _mm256_mul_ps(rsq23
,rinv23
);
671 /* EWALD ELECTROSTATICS */
673 /* Analytical PME correction */
674 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
675 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
676 pmecorrF
= avx256_pmecorrF_f(zeta2
);
677 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
678 felec
= _mm256_mul_ps(qq23
,felec
);
679 pmecorrV
= avx256_pmecorrV_f(zeta2
);
680 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
681 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv23
,sh_ewald
),pmecorrV
);
682 velec
= _mm256_mul_ps(qq23
,velec
);
684 cutoff_mask
= _mm256_cmp_ps(rsq23
,rcutoff2
,_CMP_LT_OQ
);
686 /* Update potential sum for this i atom from the interaction with this j atom. */
687 velec
= _mm256_and_ps(velec
,cutoff_mask
);
688 velecsum
= _mm256_add_ps(velecsum
,velec
);
692 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
694 /* Calculate temporary vectorial force */
695 tx
= _mm256_mul_ps(fscal
,dx23
);
696 ty
= _mm256_mul_ps(fscal
,dy23
);
697 tz
= _mm256_mul_ps(fscal
,dz23
);
699 /* Update vectorial force */
700 fix2
= _mm256_add_ps(fix2
,tx
);
701 fiy2
= _mm256_add_ps(fiy2
,ty
);
702 fiz2
= _mm256_add_ps(fiz2
,tz
);
704 fjx3
= _mm256_add_ps(fjx3
,tx
);
705 fjy3
= _mm256_add_ps(fjy3
,ty
);
706 fjz3
= _mm256_add_ps(fjz3
,tz
);
710 /**************************
711 * CALCULATE INTERACTIONS *
712 **************************/
714 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
717 r31
= _mm256_mul_ps(rsq31
,rinv31
);
719 /* EWALD ELECTROSTATICS */
721 /* Analytical PME correction */
722 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
723 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
724 pmecorrF
= avx256_pmecorrF_f(zeta2
);
725 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
726 felec
= _mm256_mul_ps(qq31
,felec
);
727 pmecorrV
= avx256_pmecorrV_f(zeta2
);
728 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
729 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv31
,sh_ewald
),pmecorrV
);
730 velec
= _mm256_mul_ps(qq31
,velec
);
732 cutoff_mask
= _mm256_cmp_ps(rsq31
,rcutoff2
,_CMP_LT_OQ
);
734 /* Update potential sum for this i atom from the interaction with this j atom. */
735 velec
= _mm256_and_ps(velec
,cutoff_mask
);
736 velecsum
= _mm256_add_ps(velecsum
,velec
);
740 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
742 /* Calculate temporary vectorial force */
743 tx
= _mm256_mul_ps(fscal
,dx31
);
744 ty
= _mm256_mul_ps(fscal
,dy31
);
745 tz
= _mm256_mul_ps(fscal
,dz31
);
747 /* Update vectorial force */
748 fix3
= _mm256_add_ps(fix3
,tx
);
749 fiy3
= _mm256_add_ps(fiy3
,ty
);
750 fiz3
= _mm256_add_ps(fiz3
,tz
);
752 fjx1
= _mm256_add_ps(fjx1
,tx
);
753 fjy1
= _mm256_add_ps(fjy1
,ty
);
754 fjz1
= _mm256_add_ps(fjz1
,tz
);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
765 r32
= _mm256_mul_ps(rsq32
,rinv32
);
767 /* EWALD ELECTROSTATICS */
769 /* Analytical PME correction */
770 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
771 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
772 pmecorrF
= avx256_pmecorrF_f(zeta2
);
773 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
774 felec
= _mm256_mul_ps(qq32
,felec
);
775 pmecorrV
= avx256_pmecorrV_f(zeta2
);
776 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
777 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv32
,sh_ewald
),pmecorrV
);
778 velec
= _mm256_mul_ps(qq32
,velec
);
780 cutoff_mask
= _mm256_cmp_ps(rsq32
,rcutoff2
,_CMP_LT_OQ
);
782 /* Update potential sum for this i atom from the interaction with this j atom. */
783 velec
= _mm256_and_ps(velec
,cutoff_mask
);
784 velecsum
= _mm256_add_ps(velecsum
,velec
);
788 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
790 /* Calculate temporary vectorial force */
791 tx
= _mm256_mul_ps(fscal
,dx32
);
792 ty
= _mm256_mul_ps(fscal
,dy32
);
793 tz
= _mm256_mul_ps(fscal
,dz32
);
795 /* Update vectorial force */
796 fix3
= _mm256_add_ps(fix3
,tx
);
797 fiy3
= _mm256_add_ps(fiy3
,ty
);
798 fiz3
= _mm256_add_ps(fiz3
,tz
);
800 fjx2
= _mm256_add_ps(fjx2
,tx
);
801 fjy2
= _mm256_add_ps(fjy2
,ty
);
802 fjz2
= _mm256_add_ps(fjz2
,tz
);
806 /**************************
807 * CALCULATE INTERACTIONS *
808 **************************/
810 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
813 r33
= _mm256_mul_ps(rsq33
,rinv33
);
815 /* EWALD ELECTROSTATICS */
817 /* Analytical PME correction */
818 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
819 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
820 pmecorrF
= avx256_pmecorrF_f(zeta2
);
821 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
822 felec
= _mm256_mul_ps(qq33
,felec
);
823 pmecorrV
= avx256_pmecorrV_f(zeta2
);
824 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
825 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv33
,sh_ewald
),pmecorrV
);
826 velec
= _mm256_mul_ps(qq33
,velec
);
828 cutoff_mask
= _mm256_cmp_ps(rsq33
,rcutoff2
,_CMP_LT_OQ
);
830 /* Update potential sum for this i atom from the interaction with this j atom. */
831 velec
= _mm256_and_ps(velec
,cutoff_mask
);
832 velecsum
= _mm256_add_ps(velecsum
,velec
);
836 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
838 /* Calculate temporary vectorial force */
839 tx
= _mm256_mul_ps(fscal
,dx33
);
840 ty
= _mm256_mul_ps(fscal
,dy33
);
841 tz
= _mm256_mul_ps(fscal
,dz33
);
843 /* Update vectorial force */
844 fix3
= _mm256_add_ps(fix3
,tx
);
845 fiy3
= _mm256_add_ps(fiy3
,ty
);
846 fiz3
= _mm256_add_ps(fiz3
,tz
);
848 fjx3
= _mm256_add_ps(fjx3
,tx
);
849 fjy3
= _mm256_add_ps(fjy3
,ty
);
850 fjz3
= _mm256_add_ps(fjz3
,tz
);
854 fjptrA
= f
+j_coord_offsetA
;
855 fjptrB
= f
+j_coord_offsetB
;
856 fjptrC
= f
+j_coord_offsetC
;
857 fjptrD
= f
+j_coord_offsetD
;
858 fjptrE
= f
+j_coord_offsetE
;
859 fjptrF
= f
+j_coord_offsetF
;
860 fjptrG
= f
+j_coord_offsetG
;
861 fjptrH
= f
+j_coord_offsetH
;
863 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
864 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
865 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
867 /* Inner loop uses 1046 flops */
873 /* Get j neighbor index, and coordinate index */
874 jnrlistA
= jjnr
[jidx
];
875 jnrlistB
= jjnr
[jidx
+1];
876 jnrlistC
= jjnr
[jidx
+2];
877 jnrlistD
= jjnr
[jidx
+3];
878 jnrlistE
= jjnr
[jidx
+4];
879 jnrlistF
= jjnr
[jidx
+5];
880 jnrlistG
= jjnr
[jidx
+6];
881 jnrlistH
= jjnr
[jidx
+7];
882 /* Sign of each element will be negative for non-real atoms.
883 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
884 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
886 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
887 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
889 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
890 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
891 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
892 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
893 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
894 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
895 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
896 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
897 j_coord_offsetA
= DIM
*jnrA
;
898 j_coord_offsetB
= DIM
*jnrB
;
899 j_coord_offsetC
= DIM
*jnrC
;
900 j_coord_offsetD
= DIM
*jnrD
;
901 j_coord_offsetE
= DIM
*jnrE
;
902 j_coord_offsetF
= DIM
*jnrF
;
903 j_coord_offsetG
= DIM
*jnrG
;
904 j_coord_offsetH
= DIM
*jnrH
;
906 /* load j atom coordinates */
907 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
908 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
909 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
910 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
911 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
912 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
914 /* Calculate displacement vector */
915 dx00
= _mm256_sub_ps(ix0
,jx0
);
916 dy00
= _mm256_sub_ps(iy0
,jy0
);
917 dz00
= _mm256_sub_ps(iz0
,jz0
);
918 dx11
= _mm256_sub_ps(ix1
,jx1
);
919 dy11
= _mm256_sub_ps(iy1
,jy1
);
920 dz11
= _mm256_sub_ps(iz1
,jz1
);
921 dx12
= _mm256_sub_ps(ix1
,jx2
);
922 dy12
= _mm256_sub_ps(iy1
,jy2
);
923 dz12
= _mm256_sub_ps(iz1
,jz2
);
924 dx13
= _mm256_sub_ps(ix1
,jx3
);
925 dy13
= _mm256_sub_ps(iy1
,jy3
);
926 dz13
= _mm256_sub_ps(iz1
,jz3
);
927 dx21
= _mm256_sub_ps(ix2
,jx1
);
928 dy21
= _mm256_sub_ps(iy2
,jy1
);
929 dz21
= _mm256_sub_ps(iz2
,jz1
);
930 dx22
= _mm256_sub_ps(ix2
,jx2
);
931 dy22
= _mm256_sub_ps(iy2
,jy2
);
932 dz22
= _mm256_sub_ps(iz2
,jz2
);
933 dx23
= _mm256_sub_ps(ix2
,jx3
);
934 dy23
= _mm256_sub_ps(iy2
,jy3
);
935 dz23
= _mm256_sub_ps(iz2
,jz3
);
936 dx31
= _mm256_sub_ps(ix3
,jx1
);
937 dy31
= _mm256_sub_ps(iy3
,jy1
);
938 dz31
= _mm256_sub_ps(iz3
,jz1
);
939 dx32
= _mm256_sub_ps(ix3
,jx2
);
940 dy32
= _mm256_sub_ps(iy3
,jy2
);
941 dz32
= _mm256_sub_ps(iz3
,jz2
);
942 dx33
= _mm256_sub_ps(ix3
,jx3
);
943 dy33
= _mm256_sub_ps(iy3
,jy3
);
944 dz33
= _mm256_sub_ps(iz3
,jz3
);
946 /* Calculate squared distance and things based on it */
947 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
948 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
949 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
950 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
951 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
952 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
953 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
954 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
955 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
956 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
958 rinv00
= avx256_invsqrt_f(rsq00
);
959 rinv11
= avx256_invsqrt_f(rsq11
);
960 rinv12
= avx256_invsqrt_f(rsq12
);
961 rinv13
= avx256_invsqrt_f(rsq13
);
962 rinv21
= avx256_invsqrt_f(rsq21
);
963 rinv22
= avx256_invsqrt_f(rsq22
);
964 rinv23
= avx256_invsqrt_f(rsq23
);
965 rinv31
= avx256_invsqrt_f(rsq31
);
966 rinv32
= avx256_invsqrt_f(rsq32
);
967 rinv33
= avx256_invsqrt_f(rsq33
);
969 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
970 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
971 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
972 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
973 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
974 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
975 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
976 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
977 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
978 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
980 fjx0
= _mm256_setzero_ps();
981 fjy0
= _mm256_setzero_ps();
982 fjz0
= _mm256_setzero_ps();
983 fjx1
= _mm256_setzero_ps();
984 fjy1
= _mm256_setzero_ps();
985 fjz1
= _mm256_setzero_ps();
986 fjx2
= _mm256_setzero_ps();
987 fjy2
= _mm256_setzero_ps();
988 fjz2
= _mm256_setzero_ps();
989 fjx3
= _mm256_setzero_ps();
990 fjy3
= _mm256_setzero_ps();
991 fjz3
= _mm256_setzero_ps();
993 /**************************
994 * CALCULATE INTERACTIONS *
995 **************************/
997 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1000 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1001 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
1003 /* Analytical LJ-PME */
1004 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1005 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
1006 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
1007 exponent
= avx256_exp_f(ewcljrsq
);
1008 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1009 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
1010 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
1011 vvdw6
= _mm256_mul_ps(_mm256_sub_ps(c6_00
,_mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
))),rinvsix
);
1012 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
1013 vvdw
= _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12
, _mm256_mul_ps(c12_00
,_mm256_mul_ps(sh_vdw_invrcut6
,sh_vdw_invrcut6
))), one_twelfth
) ,
1014 _mm256_mul_ps( _mm256_sub_ps(vvdw6
,_mm256_add_ps(_mm256_mul_ps(c6_00
,sh_vdw_invrcut6
),_mm256_mul_ps(c6grid_00
,sh_lj_ewald
))),one_sixth
));
1015 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
1016 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,_mm256_sub_ps(vvdw6
,_mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
)))),rinvsq00
);
1018 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1020 /* Update potential sum for this i atom from the interaction with this j atom. */
1021 vvdw
= _mm256_and_ps(vvdw
,cutoff_mask
);
1022 vvdw
= _mm256_andnot_ps(dummy_mask
,vvdw
);
1023 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
1027 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1029 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1031 /* Calculate temporary vectorial force */
1032 tx
= _mm256_mul_ps(fscal
,dx00
);
1033 ty
= _mm256_mul_ps(fscal
,dy00
);
1034 tz
= _mm256_mul_ps(fscal
,dz00
);
1036 /* Update vectorial force */
1037 fix0
= _mm256_add_ps(fix0
,tx
);
1038 fiy0
= _mm256_add_ps(fiy0
,ty
);
1039 fiz0
= _mm256_add_ps(fiz0
,tz
);
1041 fjx0
= _mm256_add_ps(fjx0
,tx
);
1042 fjy0
= _mm256_add_ps(fjy0
,ty
);
1043 fjz0
= _mm256_add_ps(fjz0
,tz
);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
1054 r11
= _mm256_mul_ps(rsq11
,rinv11
);
1055 r11
= _mm256_andnot_ps(dummy_mask
,r11
);
1057 /* EWALD ELECTROSTATICS */
1059 /* Analytical PME correction */
1060 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
1061 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
1062 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1063 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1064 felec
= _mm256_mul_ps(qq11
,felec
);
1065 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1066 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1067 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv11
,sh_ewald
),pmecorrV
);
1068 velec
= _mm256_mul_ps(qq11
,velec
);
1070 cutoff_mask
= _mm256_cmp_ps(rsq11
,rcutoff2
,_CMP_LT_OQ
);
1072 /* Update potential sum for this i atom from the interaction with this j atom. */
1073 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1074 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1075 velecsum
= _mm256_add_ps(velecsum
,velec
);
1079 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1081 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1083 /* Calculate temporary vectorial force */
1084 tx
= _mm256_mul_ps(fscal
,dx11
);
1085 ty
= _mm256_mul_ps(fscal
,dy11
);
1086 tz
= _mm256_mul_ps(fscal
,dz11
);
1088 /* Update vectorial force */
1089 fix1
= _mm256_add_ps(fix1
,tx
);
1090 fiy1
= _mm256_add_ps(fiy1
,ty
);
1091 fiz1
= _mm256_add_ps(fiz1
,tz
);
1093 fjx1
= _mm256_add_ps(fjx1
,tx
);
1094 fjy1
= _mm256_add_ps(fjy1
,ty
);
1095 fjz1
= _mm256_add_ps(fjz1
,tz
);
1099 /**************************
1100 * CALCULATE INTERACTIONS *
1101 **************************/
1103 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
1106 r12
= _mm256_mul_ps(rsq12
,rinv12
);
1107 r12
= _mm256_andnot_ps(dummy_mask
,r12
);
1109 /* EWALD ELECTROSTATICS */
1111 /* Analytical PME correction */
1112 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
1113 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
1114 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1115 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1116 felec
= _mm256_mul_ps(qq12
,felec
);
1117 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1118 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1119 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv12
,sh_ewald
),pmecorrV
);
1120 velec
= _mm256_mul_ps(qq12
,velec
);
1122 cutoff_mask
= _mm256_cmp_ps(rsq12
,rcutoff2
,_CMP_LT_OQ
);
1124 /* Update potential sum for this i atom from the interaction with this j atom. */
1125 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1126 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1127 velecsum
= _mm256_add_ps(velecsum
,velec
);
1131 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1133 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1135 /* Calculate temporary vectorial force */
1136 tx
= _mm256_mul_ps(fscal
,dx12
);
1137 ty
= _mm256_mul_ps(fscal
,dy12
);
1138 tz
= _mm256_mul_ps(fscal
,dz12
);
1140 /* Update vectorial force */
1141 fix1
= _mm256_add_ps(fix1
,tx
);
1142 fiy1
= _mm256_add_ps(fiy1
,ty
);
1143 fiz1
= _mm256_add_ps(fiz1
,tz
);
1145 fjx2
= _mm256_add_ps(fjx2
,tx
);
1146 fjy2
= _mm256_add_ps(fjy2
,ty
);
1147 fjz2
= _mm256_add_ps(fjz2
,tz
);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
1158 r13
= _mm256_mul_ps(rsq13
,rinv13
);
1159 r13
= _mm256_andnot_ps(dummy_mask
,r13
);
1161 /* EWALD ELECTROSTATICS */
1163 /* Analytical PME correction */
1164 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
1165 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
1166 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1167 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1168 felec
= _mm256_mul_ps(qq13
,felec
);
1169 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1170 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1171 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv13
,sh_ewald
),pmecorrV
);
1172 velec
= _mm256_mul_ps(qq13
,velec
);
1174 cutoff_mask
= _mm256_cmp_ps(rsq13
,rcutoff2
,_CMP_LT_OQ
);
1176 /* Update potential sum for this i atom from the interaction with this j atom. */
1177 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1178 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1179 velecsum
= _mm256_add_ps(velecsum
,velec
);
1183 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1185 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1187 /* Calculate temporary vectorial force */
1188 tx
= _mm256_mul_ps(fscal
,dx13
);
1189 ty
= _mm256_mul_ps(fscal
,dy13
);
1190 tz
= _mm256_mul_ps(fscal
,dz13
);
1192 /* Update vectorial force */
1193 fix1
= _mm256_add_ps(fix1
,tx
);
1194 fiy1
= _mm256_add_ps(fiy1
,ty
);
1195 fiz1
= _mm256_add_ps(fiz1
,tz
);
1197 fjx3
= _mm256_add_ps(fjx3
,tx
);
1198 fjy3
= _mm256_add_ps(fjy3
,ty
);
1199 fjz3
= _mm256_add_ps(fjz3
,tz
);
1203 /**************************
1204 * CALCULATE INTERACTIONS *
1205 **************************/
1207 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
1210 r21
= _mm256_mul_ps(rsq21
,rinv21
);
1211 r21
= _mm256_andnot_ps(dummy_mask
,r21
);
1213 /* EWALD ELECTROSTATICS */
1215 /* Analytical PME correction */
1216 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
1217 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
1218 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1219 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1220 felec
= _mm256_mul_ps(qq21
,felec
);
1221 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1222 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1223 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv21
,sh_ewald
),pmecorrV
);
1224 velec
= _mm256_mul_ps(qq21
,velec
);
1226 cutoff_mask
= _mm256_cmp_ps(rsq21
,rcutoff2
,_CMP_LT_OQ
);
1228 /* Update potential sum for this i atom from the interaction with this j atom. */
1229 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1230 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1231 velecsum
= _mm256_add_ps(velecsum
,velec
);
1235 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1237 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1239 /* Calculate temporary vectorial force */
1240 tx
= _mm256_mul_ps(fscal
,dx21
);
1241 ty
= _mm256_mul_ps(fscal
,dy21
);
1242 tz
= _mm256_mul_ps(fscal
,dz21
);
1244 /* Update vectorial force */
1245 fix2
= _mm256_add_ps(fix2
,tx
);
1246 fiy2
= _mm256_add_ps(fiy2
,ty
);
1247 fiz2
= _mm256_add_ps(fiz2
,tz
);
1249 fjx1
= _mm256_add_ps(fjx1
,tx
);
1250 fjy1
= _mm256_add_ps(fjy1
,ty
);
1251 fjz1
= _mm256_add_ps(fjz1
,tz
);
1255 /**************************
1256 * CALCULATE INTERACTIONS *
1257 **************************/
1259 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
1262 r22
= _mm256_mul_ps(rsq22
,rinv22
);
1263 r22
= _mm256_andnot_ps(dummy_mask
,r22
);
1265 /* EWALD ELECTROSTATICS */
1267 /* Analytical PME correction */
1268 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
1269 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
1270 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1271 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1272 felec
= _mm256_mul_ps(qq22
,felec
);
1273 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1274 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1275 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv22
,sh_ewald
),pmecorrV
);
1276 velec
= _mm256_mul_ps(qq22
,velec
);
1278 cutoff_mask
= _mm256_cmp_ps(rsq22
,rcutoff2
,_CMP_LT_OQ
);
1280 /* Update potential sum for this i atom from the interaction with this j atom. */
1281 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1282 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1283 velecsum
= _mm256_add_ps(velecsum
,velec
);
1287 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1289 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1291 /* Calculate temporary vectorial force */
1292 tx
= _mm256_mul_ps(fscal
,dx22
);
1293 ty
= _mm256_mul_ps(fscal
,dy22
);
1294 tz
= _mm256_mul_ps(fscal
,dz22
);
1296 /* Update vectorial force */
1297 fix2
= _mm256_add_ps(fix2
,tx
);
1298 fiy2
= _mm256_add_ps(fiy2
,ty
);
1299 fiz2
= _mm256_add_ps(fiz2
,tz
);
1301 fjx2
= _mm256_add_ps(fjx2
,tx
);
1302 fjy2
= _mm256_add_ps(fjy2
,ty
);
1303 fjz2
= _mm256_add_ps(fjz2
,tz
);
1307 /**************************
1308 * CALCULATE INTERACTIONS *
1309 **************************/
1311 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
1314 r23
= _mm256_mul_ps(rsq23
,rinv23
);
1315 r23
= _mm256_andnot_ps(dummy_mask
,r23
);
1317 /* EWALD ELECTROSTATICS */
1319 /* Analytical PME correction */
1320 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
1321 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
1322 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1323 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1324 felec
= _mm256_mul_ps(qq23
,felec
);
1325 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1326 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1327 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv23
,sh_ewald
),pmecorrV
);
1328 velec
= _mm256_mul_ps(qq23
,velec
);
1330 cutoff_mask
= _mm256_cmp_ps(rsq23
,rcutoff2
,_CMP_LT_OQ
);
1332 /* Update potential sum for this i atom from the interaction with this j atom. */
1333 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1334 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1335 velecsum
= _mm256_add_ps(velecsum
,velec
);
1339 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1341 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1343 /* Calculate temporary vectorial force */
1344 tx
= _mm256_mul_ps(fscal
,dx23
);
1345 ty
= _mm256_mul_ps(fscal
,dy23
);
1346 tz
= _mm256_mul_ps(fscal
,dz23
);
1348 /* Update vectorial force */
1349 fix2
= _mm256_add_ps(fix2
,tx
);
1350 fiy2
= _mm256_add_ps(fiy2
,ty
);
1351 fiz2
= _mm256_add_ps(fiz2
,tz
);
1353 fjx3
= _mm256_add_ps(fjx3
,tx
);
1354 fjy3
= _mm256_add_ps(fjy3
,ty
);
1355 fjz3
= _mm256_add_ps(fjz3
,tz
);
1359 /**************************
1360 * CALCULATE INTERACTIONS *
1361 **************************/
1363 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
1366 r31
= _mm256_mul_ps(rsq31
,rinv31
);
1367 r31
= _mm256_andnot_ps(dummy_mask
,r31
);
1369 /* EWALD ELECTROSTATICS */
1371 /* Analytical PME correction */
1372 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
1373 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
1374 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1375 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1376 felec
= _mm256_mul_ps(qq31
,felec
);
1377 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1378 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1379 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv31
,sh_ewald
),pmecorrV
);
1380 velec
= _mm256_mul_ps(qq31
,velec
);
1382 cutoff_mask
= _mm256_cmp_ps(rsq31
,rcutoff2
,_CMP_LT_OQ
);
1384 /* Update potential sum for this i atom from the interaction with this j atom. */
1385 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1386 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1387 velecsum
= _mm256_add_ps(velecsum
,velec
);
1391 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1393 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1395 /* Calculate temporary vectorial force */
1396 tx
= _mm256_mul_ps(fscal
,dx31
);
1397 ty
= _mm256_mul_ps(fscal
,dy31
);
1398 tz
= _mm256_mul_ps(fscal
,dz31
);
1400 /* Update vectorial force */
1401 fix3
= _mm256_add_ps(fix3
,tx
);
1402 fiy3
= _mm256_add_ps(fiy3
,ty
);
1403 fiz3
= _mm256_add_ps(fiz3
,tz
);
1405 fjx1
= _mm256_add_ps(fjx1
,tx
);
1406 fjy1
= _mm256_add_ps(fjy1
,ty
);
1407 fjz1
= _mm256_add_ps(fjz1
,tz
);
1411 /**************************
1412 * CALCULATE INTERACTIONS *
1413 **************************/
1415 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
1418 r32
= _mm256_mul_ps(rsq32
,rinv32
);
1419 r32
= _mm256_andnot_ps(dummy_mask
,r32
);
1421 /* EWALD ELECTROSTATICS */
1423 /* Analytical PME correction */
1424 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
1425 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
1426 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1427 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1428 felec
= _mm256_mul_ps(qq32
,felec
);
1429 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1430 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1431 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv32
,sh_ewald
),pmecorrV
);
1432 velec
= _mm256_mul_ps(qq32
,velec
);
1434 cutoff_mask
= _mm256_cmp_ps(rsq32
,rcutoff2
,_CMP_LT_OQ
);
1436 /* Update potential sum for this i atom from the interaction with this j atom. */
1437 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1438 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1439 velecsum
= _mm256_add_ps(velecsum
,velec
);
1443 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1445 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1447 /* Calculate temporary vectorial force */
1448 tx
= _mm256_mul_ps(fscal
,dx32
);
1449 ty
= _mm256_mul_ps(fscal
,dy32
);
1450 tz
= _mm256_mul_ps(fscal
,dz32
);
1452 /* Update vectorial force */
1453 fix3
= _mm256_add_ps(fix3
,tx
);
1454 fiy3
= _mm256_add_ps(fiy3
,ty
);
1455 fiz3
= _mm256_add_ps(fiz3
,tz
);
1457 fjx2
= _mm256_add_ps(fjx2
,tx
);
1458 fjy2
= _mm256_add_ps(fjy2
,ty
);
1459 fjz2
= _mm256_add_ps(fjz2
,tz
);
1463 /**************************
1464 * CALCULATE INTERACTIONS *
1465 **************************/
1467 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
1470 r33
= _mm256_mul_ps(rsq33
,rinv33
);
1471 r33
= _mm256_andnot_ps(dummy_mask
,r33
);
1473 /* EWALD ELECTROSTATICS */
1475 /* Analytical PME correction */
1476 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
1477 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
1478 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1479 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1480 felec
= _mm256_mul_ps(qq33
,felec
);
1481 pmecorrV
= avx256_pmecorrV_f(zeta2
);
1482 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1483 velec
= _mm256_sub_ps(_mm256_sub_ps(rinv33
,sh_ewald
),pmecorrV
);
1484 velec
= _mm256_mul_ps(qq33
,velec
);
1486 cutoff_mask
= _mm256_cmp_ps(rsq33
,rcutoff2
,_CMP_LT_OQ
);
1488 /* Update potential sum for this i atom from the interaction with this j atom. */
1489 velec
= _mm256_and_ps(velec
,cutoff_mask
);
1490 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1491 velecsum
= _mm256_add_ps(velecsum
,velec
);
1495 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1497 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1499 /* Calculate temporary vectorial force */
1500 tx
= _mm256_mul_ps(fscal
,dx33
);
1501 ty
= _mm256_mul_ps(fscal
,dy33
);
1502 tz
= _mm256_mul_ps(fscal
,dz33
);
1504 /* Update vectorial force */
1505 fix3
= _mm256_add_ps(fix3
,tx
);
1506 fiy3
= _mm256_add_ps(fiy3
,ty
);
1507 fiz3
= _mm256_add_ps(fiz3
,tz
);
1509 fjx3
= _mm256_add_ps(fjx3
,tx
);
1510 fjy3
= _mm256_add_ps(fjy3
,ty
);
1511 fjz3
= _mm256_add_ps(fjz3
,tz
);
1515 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1516 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1517 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1518 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1519 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
1520 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
1521 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
1522 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
1524 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
1525 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1526 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1528 /* Inner loop uses 1056 flops */
1531 /* End of innermost loop */
1533 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1534 f
+i_coord_offset
,fshift
+i_shift_offset
);
1537 /* Update potential energies */
1538 gmx_mm256_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1539 gmx_mm256_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1541 /* Increment number of inner iterations */
1542 inneriter
+= j_index_end
- j_index_start
;
1544 /* Outer loop uses 26 flops */
1547 /* Increment number of outer iterations */
1550 /* Update outer/inner flops */
1552 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*1056);
1555 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single
1556 * Electrostatics interaction: Ewald
1557 * VdW interaction: LJEwald
1558 * Geometry: Water4-Water4
1559 * Calculate force/pot: Force
1562 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_avx_256_single
1563 (t_nblist
* gmx_restrict nlist
,
1564 rvec
* gmx_restrict xx
,
1565 rvec
* gmx_restrict ff
,
1566 struct t_forcerec
* gmx_restrict fr
,
1567 t_mdatoms
* gmx_restrict mdatoms
,
1568 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1569 t_nrnb
* gmx_restrict nrnb
)
1571 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1572 * just 0 for non-waters.
1573 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1574 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1576 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1577 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1578 int jnrA
,jnrB
,jnrC
,jnrD
;
1579 int jnrE
,jnrF
,jnrG
,jnrH
;
1580 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1581 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1582 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1583 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
1584 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1585 real rcutoff_scalar
;
1586 real
*shiftvec
,*fshift
,*x
,*f
;
1587 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
1588 real scratch
[4*DIM
];
1589 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1590 real
* vdwioffsetptr0
;
1591 real
* vdwgridioffsetptr0
;
1592 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1593 real
* vdwioffsetptr1
;
1594 real
* vdwgridioffsetptr1
;
1595 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1596 real
* vdwioffsetptr2
;
1597 real
* vdwgridioffsetptr2
;
1598 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1599 real
* vdwioffsetptr3
;
1600 real
* vdwgridioffsetptr3
;
1601 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1602 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
1603 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1604 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
,vdwjidx1E
,vdwjidx1F
,vdwjidx1G
,vdwjidx1H
;
1605 __m256 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1606 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
,vdwjidx2E
,vdwjidx2F
,vdwjidx2G
,vdwjidx2H
;
1607 __m256 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1608 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
,vdwjidx3E
,vdwjidx3F
,vdwjidx3G
,vdwjidx3H
;
1609 __m256 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1610 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1611 __m256 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1612 __m256 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1613 __m256 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1614 __m256 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1615 __m256 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1616 __m256 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1617 __m256 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1618 __m256 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1619 __m256 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1620 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1623 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1626 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
1627 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
1639 __m256 ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
1640 __m256 one_half
= _mm256_set1_ps(0.5);
1641 __m256 minus_one
= _mm256_set1_ps(-1.0);
1643 __m128i ewitab_lo
,ewitab_hi
;
1644 __m256 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1645 __m256 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1647 __m256 dummy_mask
,cutoff_mask
;
1648 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1649 __m256 one
= _mm256_set1_ps(1.0);
1650 __m256 two
= _mm256_set1_ps(2.0);
1656 jindex
= nlist
->jindex
;
1658 shiftidx
= nlist
->shift
;
1660 shiftvec
= fr
->shift_vec
[0];
1661 fshift
= fr
->fshift
[0];
1662 facel
= _mm256_set1_ps(fr
->ic
->epsfac
);
1663 charge
= mdatoms
->chargeA
;
1664 nvdwtype
= fr
->ntype
;
1665 vdwparam
= fr
->nbfp
;
1666 vdwtype
= mdatoms
->typeA
;
1667 vdwgridparam
= fr
->ljpme_c6grid
;
1668 sh_lj_ewald
= _mm256_set1_ps(fr
->ic
->sh_lj_ewald
);
1669 ewclj
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_lj
);
1670 ewclj2
= _mm256_mul_ps(minus_one
,_mm256_mul_ps(ewclj
,ewclj
));
1672 sh_ewald
= _mm256_set1_ps(fr
->ic
->sh_ewald
);
1673 beta
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_q
);
1674 beta2
= _mm256_mul_ps(beta
,beta
);
1675 beta3
= _mm256_mul_ps(beta
,beta2
);
1677 ewtab
= fr
->ic
->tabq_coul_F
;
1678 ewtabscale
= _mm256_set1_ps(fr
->ic
->tabq_scale
);
1679 ewtabhalfspace
= _mm256_set1_ps(0.5/fr
->ic
->tabq_scale
);
1681 /* Setup water-specific parameters */
1682 inr
= nlist
->iinr
[0];
1683 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
1684 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
1685 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
1686 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1687 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
1689 jq1
= _mm256_set1_ps(charge
[inr
+1]);
1690 jq2
= _mm256_set1_ps(charge
[inr
+2]);
1691 jq3
= _mm256_set1_ps(charge
[inr
+3]);
1692 vdwjidx0A
= 2*vdwtype
[inr
+0];
1693 c6_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
]);
1694 c12_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
+1]);
1695 c6grid_00
= _mm256_set1_ps(vdwgridioffsetptr0
[vdwjidx0A
]);
1696 qq11
= _mm256_mul_ps(iq1
,jq1
);
1697 qq12
= _mm256_mul_ps(iq1
,jq2
);
1698 qq13
= _mm256_mul_ps(iq1
,jq3
);
1699 qq21
= _mm256_mul_ps(iq2
,jq1
);
1700 qq22
= _mm256_mul_ps(iq2
,jq2
);
1701 qq23
= _mm256_mul_ps(iq2
,jq3
);
1702 qq31
= _mm256_mul_ps(iq3
,jq1
);
1703 qq32
= _mm256_mul_ps(iq3
,jq2
);
1704 qq33
= _mm256_mul_ps(iq3
,jq3
);
1706 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1707 rcutoff_scalar
= fr
->ic
->rcoulomb
;
1708 rcutoff
= _mm256_set1_ps(rcutoff_scalar
);
1709 rcutoff2
= _mm256_mul_ps(rcutoff
,rcutoff
);
1711 sh_vdw_invrcut6
= _mm256_set1_ps(fr
->ic
->sh_invrc6
);
1712 rvdw
= _mm256_set1_ps(fr
->ic
->rvdw
);
1714 /* Avoid stupid compiler warnings */
1715 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
1716 j_coord_offsetA
= 0;
1717 j_coord_offsetB
= 0;
1718 j_coord_offsetC
= 0;
1719 j_coord_offsetD
= 0;
1720 j_coord_offsetE
= 0;
1721 j_coord_offsetF
= 0;
1722 j_coord_offsetG
= 0;
1723 j_coord_offsetH
= 0;
1728 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1730 scratch
[iidx
] = 0.0;
1733 /* Start outer loop over neighborlists */
1734 for(iidx
=0; iidx
<nri
; iidx
++)
1736 /* Load shift vector for this list */
1737 i_shift_offset
= DIM
*shiftidx
[iidx
];
1739 /* Load limits for loop over neighbors */
1740 j_index_start
= jindex
[iidx
];
1741 j_index_end
= jindex
[iidx
+1];
1743 /* Get outer coordinate index */
1745 i_coord_offset
= DIM
*inr
;
1747 /* Load i particle coords and add shift vector */
1748 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1749 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1751 fix0
= _mm256_setzero_ps();
1752 fiy0
= _mm256_setzero_ps();
1753 fiz0
= _mm256_setzero_ps();
1754 fix1
= _mm256_setzero_ps();
1755 fiy1
= _mm256_setzero_ps();
1756 fiz1
= _mm256_setzero_ps();
1757 fix2
= _mm256_setzero_ps();
1758 fiy2
= _mm256_setzero_ps();
1759 fiz2
= _mm256_setzero_ps();
1760 fix3
= _mm256_setzero_ps();
1761 fiy3
= _mm256_setzero_ps();
1762 fiz3
= _mm256_setzero_ps();
1764 /* Start inner kernel loop */
1765 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
1768 /* Get j neighbor index, and coordinate index */
1770 jnrB
= jjnr
[jidx
+1];
1771 jnrC
= jjnr
[jidx
+2];
1772 jnrD
= jjnr
[jidx
+3];
1773 jnrE
= jjnr
[jidx
+4];
1774 jnrF
= jjnr
[jidx
+5];
1775 jnrG
= jjnr
[jidx
+6];
1776 jnrH
= jjnr
[jidx
+7];
1777 j_coord_offsetA
= DIM
*jnrA
;
1778 j_coord_offsetB
= DIM
*jnrB
;
1779 j_coord_offsetC
= DIM
*jnrC
;
1780 j_coord_offsetD
= DIM
*jnrD
;
1781 j_coord_offsetE
= DIM
*jnrE
;
1782 j_coord_offsetF
= DIM
*jnrF
;
1783 j_coord_offsetG
= DIM
*jnrG
;
1784 j_coord_offsetH
= DIM
*jnrH
;
1786 /* load j atom coordinates */
1787 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1788 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1789 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1790 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1791 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1792 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1794 /* Calculate displacement vector */
1795 dx00
= _mm256_sub_ps(ix0
,jx0
);
1796 dy00
= _mm256_sub_ps(iy0
,jy0
);
1797 dz00
= _mm256_sub_ps(iz0
,jz0
);
1798 dx11
= _mm256_sub_ps(ix1
,jx1
);
1799 dy11
= _mm256_sub_ps(iy1
,jy1
);
1800 dz11
= _mm256_sub_ps(iz1
,jz1
);
1801 dx12
= _mm256_sub_ps(ix1
,jx2
);
1802 dy12
= _mm256_sub_ps(iy1
,jy2
);
1803 dz12
= _mm256_sub_ps(iz1
,jz2
);
1804 dx13
= _mm256_sub_ps(ix1
,jx3
);
1805 dy13
= _mm256_sub_ps(iy1
,jy3
);
1806 dz13
= _mm256_sub_ps(iz1
,jz3
);
1807 dx21
= _mm256_sub_ps(ix2
,jx1
);
1808 dy21
= _mm256_sub_ps(iy2
,jy1
);
1809 dz21
= _mm256_sub_ps(iz2
,jz1
);
1810 dx22
= _mm256_sub_ps(ix2
,jx2
);
1811 dy22
= _mm256_sub_ps(iy2
,jy2
);
1812 dz22
= _mm256_sub_ps(iz2
,jz2
);
1813 dx23
= _mm256_sub_ps(ix2
,jx3
);
1814 dy23
= _mm256_sub_ps(iy2
,jy3
);
1815 dz23
= _mm256_sub_ps(iz2
,jz3
);
1816 dx31
= _mm256_sub_ps(ix3
,jx1
);
1817 dy31
= _mm256_sub_ps(iy3
,jy1
);
1818 dz31
= _mm256_sub_ps(iz3
,jz1
);
1819 dx32
= _mm256_sub_ps(ix3
,jx2
);
1820 dy32
= _mm256_sub_ps(iy3
,jy2
);
1821 dz32
= _mm256_sub_ps(iz3
,jz2
);
1822 dx33
= _mm256_sub_ps(ix3
,jx3
);
1823 dy33
= _mm256_sub_ps(iy3
,jy3
);
1824 dz33
= _mm256_sub_ps(iz3
,jz3
);
1826 /* Calculate squared distance and things based on it */
1827 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1828 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
1829 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
1830 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
1831 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
1832 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
1833 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
1834 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
1835 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
1836 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
1838 rinv00
= avx256_invsqrt_f(rsq00
);
1839 rinv11
= avx256_invsqrt_f(rsq11
);
1840 rinv12
= avx256_invsqrt_f(rsq12
);
1841 rinv13
= avx256_invsqrt_f(rsq13
);
1842 rinv21
= avx256_invsqrt_f(rsq21
);
1843 rinv22
= avx256_invsqrt_f(rsq22
);
1844 rinv23
= avx256_invsqrt_f(rsq23
);
1845 rinv31
= avx256_invsqrt_f(rsq31
);
1846 rinv32
= avx256_invsqrt_f(rsq32
);
1847 rinv33
= avx256_invsqrt_f(rsq33
);
1849 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
1850 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
1851 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
1852 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
1853 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
1854 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
1855 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
1856 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
1857 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
1858 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
1860 fjx0
= _mm256_setzero_ps();
1861 fjy0
= _mm256_setzero_ps();
1862 fjz0
= _mm256_setzero_ps();
1863 fjx1
= _mm256_setzero_ps();
1864 fjy1
= _mm256_setzero_ps();
1865 fjz1
= _mm256_setzero_ps();
1866 fjx2
= _mm256_setzero_ps();
1867 fjy2
= _mm256_setzero_ps();
1868 fjz2
= _mm256_setzero_ps();
1869 fjx3
= _mm256_setzero_ps();
1870 fjy3
= _mm256_setzero_ps();
1871 fjz3
= _mm256_setzero_ps();
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1880 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1882 /* Analytical LJ-PME */
1883 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1884 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
1885 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
1886 exponent
= avx256_exp_f(ewcljrsq
);
1887 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1888 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
1889 /* f6A = 6 * C6grid * (1 - poly) */
1890 f6A
= _mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
));
1891 /* f6B = C6grid * exponent * beta^6 */
1892 f6B
= _mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
));
1893 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1894 fvdw
= _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),_mm256_sub_ps(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1896 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1900 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1902 /* Calculate temporary vectorial force */
1903 tx
= _mm256_mul_ps(fscal
,dx00
);
1904 ty
= _mm256_mul_ps(fscal
,dy00
);
1905 tz
= _mm256_mul_ps(fscal
,dz00
);
1907 /* Update vectorial force */
1908 fix0
= _mm256_add_ps(fix0
,tx
);
1909 fiy0
= _mm256_add_ps(fiy0
,ty
);
1910 fiz0
= _mm256_add_ps(fiz0
,tz
);
1912 fjx0
= _mm256_add_ps(fjx0
,tx
);
1913 fjy0
= _mm256_add_ps(fjy0
,ty
);
1914 fjz0
= _mm256_add_ps(fjz0
,tz
);
1918 /**************************
1919 * CALCULATE INTERACTIONS *
1920 **************************/
1922 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
1925 r11
= _mm256_mul_ps(rsq11
,rinv11
);
1927 /* EWALD ELECTROSTATICS */
1929 /* Analytical PME correction */
1930 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
1931 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
1932 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1933 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1934 felec
= _mm256_mul_ps(qq11
,felec
);
1936 cutoff_mask
= _mm256_cmp_ps(rsq11
,rcutoff2
,_CMP_LT_OQ
);
1940 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1942 /* Calculate temporary vectorial force */
1943 tx
= _mm256_mul_ps(fscal
,dx11
);
1944 ty
= _mm256_mul_ps(fscal
,dy11
);
1945 tz
= _mm256_mul_ps(fscal
,dz11
);
1947 /* Update vectorial force */
1948 fix1
= _mm256_add_ps(fix1
,tx
);
1949 fiy1
= _mm256_add_ps(fiy1
,ty
);
1950 fiz1
= _mm256_add_ps(fiz1
,tz
);
1952 fjx1
= _mm256_add_ps(fjx1
,tx
);
1953 fjy1
= _mm256_add_ps(fjy1
,ty
);
1954 fjz1
= _mm256_add_ps(fjz1
,tz
);
1958 /**************************
1959 * CALCULATE INTERACTIONS *
1960 **************************/
1962 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
1965 r12
= _mm256_mul_ps(rsq12
,rinv12
);
1967 /* EWALD ELECTROSTATICS */
1969 /* Analytical PME correction */
1970 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
1971 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
1972 pmecorrF
= avx256_pmecorrF_f(zeta2
);
1973 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1974 felec
= _mm256_mul_ps(qq12
,felec
);
1976 cutoff_mask
= _mm256_cmp_ps(rsq12
,rcutoff2
,_CMP_LT_OQ
);
1980 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
1982 /* Calculate temporary vectorial force */
1983 tx
= _mm256_mul_ps(fscal
,dx12
);
1984 ty
= _mm256_mul_ps(fscal
,dy12
);
1985 tz
= _mm256_mul_ps(fscal
,dz12
);
1987 /* Update vectorial force */
1988 fix1
= _mm256_add_ps(fix1
,tx
);
1989 fiy1
= _mm256_add_ps(fiy1
,ty
);
1990 fiz1
= _mm256_add_ps(fiz1
,tz
);
1992 fjx2
= _mm256_add_ps(fjx2
,tx
);
1993 fjy2
= _mm256_add_ps(fjy2
,ty
);
1994 fjz2
= _mm256_add_ps(fjz2
,tz
);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
2005 r13
= _mm256_mul_ps(rsq13
,rinv13
);
2007 /* EWALD ELECTROSTATICS */
2009 /* Analytical PME correction */
2010 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
2011 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
2012 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2013 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2014 felec
= _mm256_mul_ps(qq13
,felec
);
2016 cutoff_mask
= _mm256_cmp_ps(rsq13
,rcutoff2
,_CMP_LT_OQ
);
2020 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2022 /* Calculate temporary vectorial force */
2023 tx
= _mm256_mul_ps(fscal
,dx13
);
2024 ty
= _mm256_mul_ps(fscal
,dy13
);
2025 tz
= _mm256_mul_ps(fscal
,dz13
);
2027 /* Update vectorial force */
2028 fix1
= _mm256_add_ps(fix1
,tx
);
2029 fiy1
= _mm256_add_ps(fiy1
,ty
);
2030 fiz1
= _mm256_add_ps(fiz1
,tz
);
2032 fjx3
= _mm256_add_ps(fjx3
,tx
);
2033 fjy3
= _mm256_add_ps(fjy3
,ty
);
2034 fjz3
= _mm256_add_ps(fjz3
,tz
);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
2045 r21
= _mm256_mul_ps(rsq21
,rinv21
);
2047 /* EWALD ELECTROSTATICS */
2049 /* Analytical PME correction */
2050 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
2051 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
2052 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2053 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2054 felec
= _mm256_mul_ps(qq21
,felec
);
2056 cutoff_mask
= _mm256_cmp_ps(rsq21
,rcutoff2
,_CMP_LT_OQ
);
2060 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2062 /* Calculate temporary vectorial force */
2063 tx
= _mm256_mul_ps(fscal
,dx21
);
2064 ty
= _mm256_mul_ps(fscal
,dy21
);
2065 tz
= _mm256_mul_ps(fscal
,dz21
);
2067 /* Update vectorial force */
2068 fix2
= _mm256_add_ps(fix2
,tx
);
2069 fiy2
= _mm256_add_ps(fiy2
,ty
);
2070 fiz2
= _mm256_add_ps(fiz2
,tz
);
2072 fjx1
= _mm256_add_ps(fjx1
,tx
);
2073 fjy1
= _mm256_add_ps(fjy1
,ty
);
2074 fjz1
= _mm256_add_ps(fjz1
,tz
);
2078 /**************************
2079 * CALCULATE INTERACTIONS *
2080 **************************/
2082 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
2085 r22
= _mm256_mul_ps(rsq22
,rinv22
);
2087 /* EWALD ELECTROSTATICS */
2089 /* Analytical PME correction */
2090 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
2091 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
2092 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2093 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2094 felec
= _mm256_mul_ps(qq22
,felec
);
2096 cutoff_mask
= _mm256_cmp_ps(rsq22
,rcutoff2
,_CMP_LT_OQ
);
2100 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2102 /* Calculate temporary vectorial force */
2103 tx
= _mm256_mul_ps(fscal
,dx22
);
2104 ty
= _mm256_mul_ps(fscal
,dy22
);
2105 tz
= _mm256_mul_ps(fscal
,dz22
);
2107 /* Update vectorial force */
2108 fix2
= _mm256_add_ps(fix2
,tx
);
2109 fiy2
= _mm256_add_ps(fiy2
,ty
);
2110 fiz2
= _mm256_add_ps(fiz2
,tz
);
2112 fjx2
= _mm256_add_ps(fjx2
,tx
);
2113 fjy2
= _mm256_add_ps(fjy2
,ty
);
2114 fjz2
= _mm256_add_ps(fjz2
,tz
);
2118 /**************************
2119 * CALCULATE INTERACTIONS *
2120 **************************/
2122 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
2125 r23
= _mm256_mul_ps(rsq23
,rinv23
);
2127 /* EWALD ELECTROSTATICS */
2129 /* Analytical PME correction */
2130 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
2131 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
2132 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2133 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2134 felec
= _mm256_mul_ps(qq23
,felec
);
2136 cutoff_mask
= _mm256_cmp_ps(rsq23
,rcutoff2
,_CMP_LT_OQ
);
2140 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2142 /* Calculate temporary vectorial force */
2143 tx
= _mm256_mul_ps(fscal
,dx23
);
2144 ty
= _mm256_mul_ps(fscal
,dy23
);
2145 tz
= _mm256_mul_ps(fscal
,dz23
);
2147 /* Update vectorial force */
2148 fix2
= _mm256_add_ps(fix2
,tx
);
2149 fiy2
= _mm256_add_ps(fiy2
,ty
);
2150 fiz2
= _mm256_add_ps(fiz2
,tz
);
2152 fjx3
= _mm256_add_ps(fjx3
,tx
);
2153 fjy3
= _mm256_add_ps(fjy3
,ty
);
2154 fjz3
= _mm256_add_ps(fjz3
,tz
);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
2165 r31
= _mm256_mul_ps(rsq31
,rinv31
);
2167 /* EWALD ELECTROSTATICS */
2169 /* Analytical PME correction */
2170 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
2171 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
2172 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2173 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2174 felec
= _mm256_mul_ps(qq31
,felec
);
2176 cutoff_mask
= _mm256_cmp_ps(rsq31
,rcutoff2
,_CMP_LT_OQ
);
2180 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2182 /* Calculate temporary vectorial force */
2183 tx
= _mm256_mul_ps(fscal
,dx31
);
2184 ty
= _mm256_mul_ps(fscal
,dy31
);
2185 tz
= _mm256_mul_ps(fscal
,dz31
);
2187 /* Update vectorial force */
2188 fix3
= _mm256_add_ps(fix3
,tx
);
2189 fiy3
= _mm256_add_ps(fiy3
,ty
);
2190 fiz3
= _mm256_add_ps(fiz3
,tz
);
2192 fjx1
= _mm256_add_ps(fjx1
,tx
);
2193 fjy1
= _mm256_add_ps(fjy1
,ty
);
2194 fjz1
= _mm256_add_ps(fjz1
,tz
);
2198 /**************************
2199 * CALCULATE INTERACTIONS *
2200 **************************/
2202 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
2205 r32
= _mm256_mul_ps(rsq32
,rinv32
);
2207 /* EWALD ELECTROSTATICS */
2209 /* Analytical PME correction */
2210 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
2211 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
2212 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2213 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2214 felec
= _mm256_mul_ps(qq32
,felec
);
2216 cutoff_mask
= _mm256_cmp_ps(rsq32
,rcutoff2
,_CMP_LT_OQ
);
2220 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2222 /* Calculate temporary vectorial force */
2223 tx
= _mm256_mul_ps(fscal
,dx32
);
2224 ty
= _mm256_mul_ps(fscal
,dy32
);
2225 tz
= _mm256_mul_ps(fscal
,dz32
);
2227 /* Update vectorial force */
2228 fix3
= _mm256_add_ps(fix3
,tx
);
2229 fiy3
= _mm256_add_ps(fiy3
,ty
);
2230 fiz3
= _mm256_add_ps(fiz3
,tz
);
2232 fjx2
= _mm256_add_ps(fjx2
,tx
);
2233 fjy2
= _mm256_add_ps(fjy2
,ty
);
2234 fjz2
= _mm256_add_ps(fjz2
,tz
);
2238 /**************************
2239 * CALCULATE INTERACTIONS *
2240 **************************/
2242 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
2245 r33
= _mm256_mul_ps(rsq33
,rinv33
);
2247 /* EWALD ELECTROSTATICS */
2249 /* Analytical PME correction */
2250 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
2251 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
2252 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2253 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2254 felec
= _mm256_mul_ps(qq33
,felec
);
2256 cutoff_mask
= _mm256_cmp_ps(rsq33
,rcutoff2
,_CMP_LT_OQ
);
2260 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2262 /* Calculate temporary vectorial force */
2263 tx
= _mm256_mul_ps(fscal
,dx33
);
2264 ty
= _mm256_mul_ps(fscal
,dy33
);
2265 tz
= _mm256_mul_ps(fscal
,dz33
);
2267 /* Update vectorial force */
2268 fix3
= _mm256_add_ps(fix3
,tx
);
2269 fiy3
= _mm256_add_ps(fiy3
,ty
);
2270 fiz3
= _mm256_add_ps(fiz3
,tz
);
2272 fjx3
= _mm256_add_ps(fjx3
,tx
);
2273 fjy3
= _mm256_add_ps(fjy3
,ty
);
2274 fjz3
= _mm256_add_ps(fjz3
,tz
);
2278 fjptrA
= f
+j_coord_offsetA
;
2279 fjptrB
= f
+j_coord_offsetB
;
2280 fjptrC
= f
+j_coord_offsetC
;
2281 fjptrD
= f
+j_coord_offsetD
;
2282 fjptrE
= f
+j_coord_offsetE
;
2283 fjptrF
= f
+j_coord_offsetF
;
2284 fjptrG
= f
+j_coord_offsetG
;
2285 fjptrH
= f
+j_coord_offsetH
;
2287 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
2288 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2289 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2291 /* Inner loop uses 583 flops */
2294 if(jidx
<j_index_end
)
2297 /* Get j neighbor index, and coordinate index */
2298 jnrlistA
= jjnr
[jidx
];
2299 jnrlistB
= jjnr
[jidx
+1];
2300 jnrlistC
= jjnr
[jidx
+2];
2301 jnrlistD
= jjnr
[jidx
+3];
2302 jnrlistE
= jjnr
[jidx
+4];
2303 jnrlistF
= jjnr
[jidx
+5];
2304 jnrlistG
= jjnr
[jidx
+6];
2305 jnrlistH
= jjnr
[jidx
+7];
2306 /* Sign of each element will be negative for non-real atoms.
2307 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2308 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2310 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
2311 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
2313 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2314 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2315 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2316 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2317 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
2318 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
2319 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
2320 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
2321 j_coord_offsetA
= DIM
*jnrA
;
2322 j_coord_offsetB
= DIM
*jnrB
;
2323 j_coord_offsetC
= DIM
*jnrC
;
2324 j_coord_offsetD
= DIM
*jnrD
;
2325 j_coord_offsetE
= DIM
*jnrE
;
2326 j_coord_offsetF
= DIM
*jnrF
;
2327 j_coord_offsetG
= DIM
*jnrG
;
2328 j_coord_offsetH
= DIM
*jnrH
;
2330 /* load j atom coordinates */
2331 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2332 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2333 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
2334 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
2335 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2336 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2338 /* Calculate displacement vector */
2339 dx00
= _mm256_sub_ps(ix0
,jx0
);
2340 dy00
= _mm256_sub_ps(iy0
,jy0
);
2341 dz00
= _mm256_sub_ps(iz0
,jz0
);
2342 dx11
= _mm256_sub_ps(ix1
,jx1
);
2343 dy11
= _mm256_sub_ps(iy1
,jy1
);
2344 dz11
= _mm256_sub_ps(iz1
,jz1
);
2345 dx12
= _mm256_sub_ps(ix1
,jx2
);
2346 dy12
= _mm256_sub_ps(iy1
,jy2
);
2347 dz12
= _mm256_sub_ps(iz1
,jz2
);
2348 dx13
= _mm256_sub_ps(ix1
,jx3
);
2349 dy13
= _mm256_sub_ps(iy1
,jy3
);
2350 dz13
= _mm256_sub_ps(iz1
,jz3
);
2351 dx21
= _mm256_sub_ps(ix2
,jx1
);
2352 dy21
= _mm256_sub_ps(iy2
,jy1
);
2353 dz21
= _mm256_sub_ps(iz2
,jz1
);
2354 dx22
= _mm256_sub_ps(ix2
,jx2
);
2355 dy22
= _mm256_sub_ps(iy2
,jy2
);
2356 dz22
= _mm256_sub_ps(iz2
,jz2
);
2357 dx23
= _mm256_sub_ps(ix2
,jx3
);
2358 dy23
= _mm256_sub_ps(iy2
,jy3
);
2359 dz23
= _mm256_sub_ps(iz2
,jz3
);
2360 dx31
= _mm256_sub_ps(ix3
,jx1
);
2361 dy31
= _mm256_sub_ps(iy3
,jy1
);
2362 dz31
= _mm256_sub_ps(iz3
,jz1
);
2363 dx32
= _mm256_sub_ps(ix3
,jx2
);
2364 dy32
= _mm256_sub_ps(iy3
,jy2
);
2365 dz32
= _mm256_sub_ps(iz3
,jz2
);
2366 dx33
= _mm256_sub_ps(ix3
,jx3
);
2367 dy33
= _mm256_sub_ps(iy3
,jy3
);
2368 dz33
= _mm256_sub_ps(iz3
,jz3
);
2370 /* Calculate squared distance and things based on it */
2371 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
2372 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
2373 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
2374 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
2375 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
2376 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
2377 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
2378 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
2379 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
2380 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
2382 rinv00
= avx256_invsqrt_f(rsq00
);
2383 rinv11
= avx256_invsqrt_f(rsq11
);
2384 rinv12
= avx256_invsqrt_f(rsq12
);
2385 rinv13
= avx256_invsqrt_f(rsq13
);
2386 rinv21
= avx256_invsqrt_f(rsq21
);
2387 rinv22
= avx256_invsqrt_f(rsq22
);
2388 rinv23
= avx256_invsqrt_f(rsq23
);
2389 rinv31
= avx256_invsqrt_f(rsq31
);
2390 rinv32
= avx256_invsqrt_f(rsq32
);
2391 rinv33
= avx256_invsqrt_f(rsq33
);
2393 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
2394 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
2395 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
2396 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
2397 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
2398 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
2399 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
2400 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
2401 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
2402 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
2404 fjx0
= _mm256_setzero_ps();
2405 fjy0
= _mm256_setzero_ps();
2406 fjz0
= _mm256_setzero_ps();
2407 fjx1
= _mm256_setzero_ps();
2408 fjy1
= _mm256_setzero_ps();
2409 fjz1
= _mm256_setzero_ps();
2410 fjx2
= _mm256_setzero_ps();
2411 fjy2
= _mm256_setzero_ps();
2412 fjz2
= _mm256_setzero_ps();
2413 fjx3
= _mm256_setzero_ps();
2414 fjy3
= _mm256_setzero_ps();
2415 fjz3
= _mm256_setzero_ps();
2417 /**************************
2418 * CALCULATE INTERACTIONS *
2419 **************************/
2421 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
2424 r00
= _mm256_mul_ps(rsq00
,rinv00
);
2425 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
2427 /* Analytical LJ-PME */
2428 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
2429 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
2430 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
2431 exponent
= avx256_exp_f(ewcljrsq
);
2432 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2433 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
2434 /* f6A = 6 * C6grid * (1 - poly) */
2435 f6A
= _mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
));
2436 /* f6B = C6grid * exponent * beta^6 */
2437 f6B
= _mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
));
2438 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2439 fvdw
= _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),_mm256_sub_ps(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
2441 cutoff_mask
= _mm256_cmp_ps(rsq00
,rcutoff2
,_CMP_LT_OQ
);
2445 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2447 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2449 /* Calculate temporary vectorial force */
2450 tx
= _mm256_mul_ps(fscal
,dx00
);
2451 ty
= _mm256_mul_ps(fscal
,dy00
);
2452 tz
= _mm256_mul_ps(fscal
,dz00
);
2454 /* Update vectorial force */
2455 fix0
= _mm256_add_ps(fix0
,tx
);
2456 fiy0
= _mm256_add_ps(fiy0
,ty
);
2457 fiz0
= _mm256_add_ps(fiz0
,tz
);
2459 fjx0
= _mm256_add_ps(fjx0
,tx
);
2460 fjy0
= _mm256_add_ps(fjy0
,ty
);
2461 fjz0
= _mm256_add_ps(fjz0
,tz
);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
2472 r11
= _mm256_mul_ps(rsq11
,rinv11
);
2473 r11
= _mm256_andnot_ps(dummy_mask
,r11
);
2475 /* EWALD ELECTROSTATICS */
2477 /* Analytical PME correction */
2478 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
2479 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
2480 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2481 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2482 felec
= _mm256_mul_ps(qq11
,felec
);
2484 cutoff_mask
= _mm256_cmp_ps(rsq11
,rcutoff2
,_CMP_LT_OQ
);
2488 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2490 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2492 /* Calculate temporary vectorial force */
2493 tx
= _mm256_mul_ps(fscal
,dx11
);
2494 ty
= _mm256_mul_ps(fscal
,dy11
);
2495 tz
= _mm256_mul_ps(fscal
,dz11
);
2497 /* Update vectorial force */
2498 fix1
= _mm256_add_ps(fix1
,tx
);
2499 fiy1
= _mm256_add_ps(fiy1
,ty
);
2500 fiz1
= _mm256_add_ps(fiz1
,tz
);
2502 fjx1
= _mm256_add_ps(fjx1
,tx
);
2503 fjy1
= _mm256_add_ps(fjy1
,ty
);
2504 fjz1
= _mm256_add_ps(fjz1
,tz
);
2508 /**************************
2509 * CALCULATE INTERACTIONS *
2510 **************************/
2512 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
2515 r12
= _mm256_mul_ps(rsq12
,rinv12
);
2516 r12
= _mm256_andnot_ps(dummy_mask
,r12
);
2518 /* EWALD ELECTROSTATICS */
2520 /* Analytical PME correction */
2521 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
2522 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
2523 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2524 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2525 felec
= _mm256_mul_ps(qq12
,felec
);
2527 cutoff_mask
= _mm256_cmp_ps(rsq12
,rcutoff2
,_CMP_LT_OQ
);
2531 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2533 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2535 /* Calculate temporary vectorial force */
2536 tx
= _mm256_mul_ps(fscal
,dx12
);
2537 ty
= _mm256_mul_ps(fscal
,dy12
);
2538 tz
= _mm256_mul_ps(fscal
,dz12
);
2540 /* Update vectorial force */
2541 fix1
= _mm256_add_ps(fix1
,tx
);
2542 fiy1
= _mm256_add_ps(fiy1
,ty
);
2543 fiz1
= _mm256_add_ps(fiz1
,tz
);
2545 fjx2
= _mm256_add_ps(fjx2
,tx
);
2546 fjy2
= _mm256_add_ps(fjy2
,ty
);
2547 fjz2
= _mm256_add_ps(fjz2
,tz
);
2551 /**************************
2552 * CALCULATE INTERACTIONS *
2553 **************************/
2555 if (gmx_mm256_any_lt(rsq13
,rcutoff2
))
2558 r13
= _mm256_mul_ps(rsq13
,rinv13
);
2559 r13
= _mm256_andnot_ps(dummy_mask
,r13
);
2561 /* EWALD ELECTROSTATICS */
2563 /* Analytical PME correction */
2564 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
2565 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
2566 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2567 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2568 felec
= _mm256_mul_ps(qq13
,felec
);
2570 cutoff_mask
= _mm256_cmp_ps(rsq13
,rcutoff2
,_CMP_LT_OQ
);
2574 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2576 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2578 /* Calculate temporary vectorial force */
2579 tx
= _mm256_mul_ps(fscal
,dx13
);
2580 ty
= _mm256_mul_ps(fscal
,dy13
);
2581 tz
= _mm256_mul_ps(fscal
,dz13
);
2583 /* Update vectorial force */
2584 fix1
= _mm256_add_ps(fix1
,tx
);
2585 fiy1
= _mm256_add_ps(fiy1
,ty
);
2586 fiz1
= _mm256_add_ps(fiz1
,tz
);
2588 fjx3
= _mm256_add_ps(fjx3
,tx
);
2589 fjy3
= _mm256_add_ps(fjy3
,ty
);
2590 fjz3
= _mm256_add_ps(fjz3
,tz
);
2594 /**************************
2595 * CALCULATE INTERACTIONS *
2596 **************************/
2598 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
2601 r21
= _mm256_mul_ps(rsq21
,rinv21
);
2602 r21
= _mm256_andnot_ps(dummy_mask
,r21
);
2604 /* EWALD ELECTROSTATICS */
2606 /* Analytical PME correction */
2607 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
2608 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
2609 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2610 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2611 felec
= _mm256_mul_ps(qq21
,felec
);
2613 cutoff_mask
= _mm256_cmp_ps(rsq21
,rcutoff2
,_CMP_LT_OQ
);
2617 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2619 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2621 /* Calculate temporary vectorial force */
2622 tx
= _mm256_mul_ps(fscal
,dx21
);
2623 ty
= _mm256_mul_ps(fscal
,dy21
);
2624 tz
= _mm256_mul_ps(fscal
,dz21
);
2626 /* Update vectorial force */
2627 fix2
= _mm256_add_ps(fix2
,tx
);
2628 fiy2
= _mm256_add_ps(fiy2
,ty
);
2629 fiz2
= _mm256_add_ps(fiz2
,tz
);
2631 fjx1
= _mm256_add_ps(fjx1
,tx
);
2632 fjy1
= _mm256_add_ps(fjy1
,ty
);
2633 fjz1
= _mm256_add_ps(fjz1
,tz
);
2637 /**************************
2638 * CALCULATE INTERACTIONS *
2639 **************************/
2641 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
2644 r22
= _mm256_mul_ps(rsq22
,rinv22
);
2645 r22
= _mm256_andnot_ps(dummy_mask
,r22
);
2647 /* EWALD ELECTROSTATICS */
2649 /* Analytical PME correction */
2650 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
2651 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
2652 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2653 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2654 felec
= _mm256_mul_ps(qq22
,felec
);
2656 cutoff_mask
= _mm256_cmp_ps(rsq22
,rcutoff2
,_CMP_LT_OQ
);
2660 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2662 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2664 /* Calculate temporary vectorial force */
2665 tx
= _mm256_mul_ps(fscal
,dx22
);
2666 ty
= _mm256_mul_ps(fscal
,dy22
);
2667 tz
= _mm256_mul_ps(fscal
,dz22
);
2669 /* Update vectorial force */
2670 fix2
= _mm256_add_ps(fix2
,tx
);
2671 fiy2
= _mm256_add_ps(fiy2
,ty
);
2672 fiz2
= _mm256_add_ps(fiz2
,tz
);
2674 fjx2
= _mm256_add_ps(fjx2
,tx
);
2675 fjy2
= _mm256_add_ps(fjy2
,ty
);
2676 fjz2
= _mm256_add_ps(fjz2
,tz
);
2680 /**************************
2681 * CALCULATE INTERACTIONS *
2682 **************************/
2684 if (gmx_mm256_any_lt(rsq23
,rcutoff2
))
2687 r23
= _mm256_mul_ps(rsq23
,rinv23
);
2688 r23
= _mm256_andnot_ps(dummy_mask
,r23
);
2690 /* EWALD ELECTROSTATICS */
2692 /* Analytical PME correction */
2693 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
2694 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
2695 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2696 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2697 felec
= _mm256_mul_ps(qq23
,felec
);
2699 cutoff_mask
= _mm256_cmp_ps(rsq23
,rcutoff2
,_CMP_LT_OQ
);
2703 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2705 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2707 /* Calculate temporary vectorial force */
2708 tx
= _mm256_mul_ps(fscal
,dx23
);
2709 ty
= _mm256_mul_ps(fscal
,dy23
);
2710 tz
= _mm256_mul_ps(fscal
,dz23
);
2712 /* Update vectorial force */
2713 fix2
= _mm256_add_ps(fix2
,tx
);
2714 fiy2
= _mm256_add_ps(fiy2
,ty
);
2715 fiz2
= _mm256_add_ps(fiz2
,tz
);
2717 fjx3
= _mm256_add_ps(fjx3
,tx
);
2718 fjy3
= _mm256_add_ps(fjy3
,ty
);
2719 fjz3
= _mm256_add_ps(fjz3
,tz
);
2723 /**************************
2724 * CALCULATE INTERACTIONS *
2725 **************************/
2727 if (gmx_mm256_any_lt(rsq31
,rcutoff2
))
2730 r31
= _mm256_mul_ps(rsq31
,rinv31
);
2731 r31
= _mm256_andnot_ps(dummy_mask
,r31
);
2733 /* EWALD ELECTROSTATICS */
2735 /* Analytical PME correction */
2736 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
2737 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
2738 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2739 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2740 felec
= _mm256_mul_ps(qq31
,felec
);
2742 cutoff_mask
= _mm256_cmp_ps(rsq31
,rcutoff2
,_CMP_LT_OQ
);
2746 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2748 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2750 /* Calculate temporary vectorial force */
2751 tx
= _mm256_mul_ps(fscal
,dx31
);
2752 ty
= _mm256_mul_ps(fscal
,dy31
);
2753 tz
= _mm256_mul_ps(fscal
,dz31
);
2755 /* Update vectorial force */
2756 fix3
= _mm256_add_ps(fix3
,tx
);
2757 fiy3
= _mm256_add_ps(fiy3
,ty
);
2758 fiz3
= _mm256_add_ps(fiz3
,tz
);
2760 fjx1
= _mm256_add_ps(fjx1
,tx
);
2761 fjy1
= _mm256_add_ps(fjy1
,ty
);
2762 fjz1
= _mm256_add_ps(fjz1
,tz
);
2766 /**************************
2767 * CALCULATE INTERACTIONS *
2768 **************************/
2770 if (gmx_mm256_any_lt(rsq32
,rcutoff2
))
2773 r32
= _mm256_mul_ps(rsq32
,rinv32
);
2774 r32
= _mm256_andnot_ps(dummy_mask
,r32
);
2776 /* EWALD ELECTROSTATICS */
2778 /* Analytical PME correction */
2779 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
2780 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
2781 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2782 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2783 felec
= _mm256_mul_ps(qq32
,felec
);
2785 cutoff_mask
= _mm256_cmp_ps(rsq32
,rcutoff2
,_CMP_LT_OQ
);
2789 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2791 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2793 /* Calculate temporary vectorial force */
2794 tx
= _mm256_mul_ps(fscal
,dx32
);
2795 ty
= _mm256_mul_ps(fscal
,dy32
);
2796 tz
= _mm256_mul_ps(fscal
,dz32
);
2798 /* Update vectorial force */
2799 fix3
= _mm256_add_ps(fix3
,tx
);
2800 fiy3
= _mm256_add_ps(fiy3
,ty
);
2801 fiz3
= _mm256_add_ps(fiz3
,tz
);
2803 fjx2
= _mm256_add_ps(fjx2
,tx
);
2804 fjy2
= _mm256_add_ps(fjy2
,ty
);
2805 fjz2
= _mm256_add_ps(fjz2
,tz
);
2809 /**************************
2810 * CALCULATE INTERACTIONS *
2811 **************************/
2813 if (gmx_mm256_any_lt(rsq33
,rcutoff2
))
2816 r33
= _mm256_mul_ps(rsq33
,rinv33
);
2817 r33
= _mm256_andnot_ps(dummy_mask
,r33
);
2819 /* EWALD ELECTROSTATICS */
2821 /* Analytical PME correction */
2822 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
2823 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
2824 pmecorrF
= avx256_pmecorrF_f(zeta2
);
2825 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2826 felec
= _mm256_mul_ps(qq33
,felec
);
2828 cutoff_mask
= _mm256_cmp_ps(rsq33
,rcutoff2
,_CMP_LT_OQ
);
2832 fscal
= _mm256_and_ps(fscal
,cutoff_mask
);
2834 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2836 /* Calculate temporary vectorial force */
2837 tx
= _mm256_mul_ps(fscal
,dx33
);
2838 ty
= _mm256_mul_ps(fscal
,dy33
);
2839 tz
= _mm256_mul_ps(fscal
,dz33
);
2841 /* Update vectorial force */
2842 fix3
= _mm256_add_ps(fix3
,tx
);
2843 fiy3
= _mm256_add_ps(fiy3
,ty
);
2844 fiz3
= _mm256_add_ps(fiz3
,tz
);
2846 fjx3
= _mm256_add_ps(fjx3
,tx
);
2847 fjy3
= _mm256_add_ps(fjy3
,ty
);
2848 fjz3
= _mm256_add_ps(fjz3
,tz
);
2852 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2853 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2854 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2855 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2856 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
2857 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
2858 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
2859 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
2861 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
2862 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2863 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2865 /* Inner loop uses 593 flops */
2868 /* End of innermost loop */
2870 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2871 f
+i_coord_offset
,fshift
+i_shift_offset
);
2873 /* Increment number of inner iterations */
2874 inneriter
+= j_index_end
- j_index_start
;
2876 /* Outer loop uses 24 flops */
2879 /* Increment number of outer iterations */
2882 /* Update outer/inner flops */
2884 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*593);