2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrE
,jnrF
,jnrG
,jnrH
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
78 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
79 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
80 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
82 real
*shiftvec
,*fshift
,*x
,*f
;
83 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
85 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
86 real
* vdwioffsetptr0
;
87 real
* vdwgridioffsetptr0
;
88 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
89 real
* vdwioffsetptr1
;
90 real
* vdwgridioffsetptr1
;
91 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
92 real
* vdwioffsetptr2
;
93 real
* vdwgridioffsetptr2
;
94 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
95 real
* vdwioffsetptr3
;
96 real
* vdwgridioffsetptr3
;
97 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
98 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
99 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
100 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
,vdwjidx1E
,vdwjidx1F
,vdwjidx1G
,vdwjidx1H
;
101 __m256 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
102 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
,vdwjidx2E
,vdwjidx2F
,vdwjidx2G
,vdwjidx2H
;
103 __m256 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
104 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
,vdwjidx3E
,vdwjidx3F
,vdwjidx3G
,vdwjidx3H
;
105 __m256 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
106 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
107 __m256 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
108 __m256 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
109 __m256 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
110 __m256 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
111 __m256 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
112 __m256 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
113 __m256 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
114 __m256 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
115 __m256 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
116 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
119 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
122 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
123 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
135 __m256 ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
136 __m256 one_half
= _mm256_set1_ps(0.5);
137 __m256 minus_one
= _mm256_set1_ps(-1.0);
139 __m128i ewitab_lo
,ewitab_hi
;
140 __m256 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
141 __m256 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
143 __m256 dummy_mask
,cutoff_mask
;
144 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
145 __m256 one
= _mm256_set1_ps(1.0);
146 __m256 two
= _mm256_set1_ps(2.0);
152 jindex
= nlist
->jindex
;
154 shiftidx
= nlist
->shift
;
156 shiftvec
= fr
->shift_vec
[0];
157 fshift
= fr
->fshift
[0];
158 facel
= _mm256_set1_ps(fr
->epsfac
);
159 charge
= mdatoms
->chargeA
;
160 nvdwtype
= fr
->ntype
;
162 vdwtype
= mdatoms
->typeA
;
163 vdwgridparam
= fr
->ljpme_c6grid
;
164 sh_lj_ewald
= _mm256_set1_ps(fr
->ic
->sh_lj_ewald
);
165 ewclj
= _mm256_set1_ps(fr
->ewaldcoeff_lj
);
166 ewclj2
= _mm256_mul_ps(minus_one
,_mm256_mul_ps(ewclj
,ewclj
));
168 sh_ewald
= _mm256_set1_ps(fr
->ic
->sh_ewald
);
169 beta
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_q
);
170 beta2
= _mm256_mul_ps(beta
,beta
);
171 beta3
= _mm256_mul_ps(beta
,beta2
);
173 ewtab
= fr
->ic
->tabq_coul_FDV0
;
174 ewtabscale
= _mm256_set1_ps(fr
->ic
->tabq_scale
);
175 ewtabhalfspace
= _mm256_set1_ps(0.5/fr
->ic
->tabq_scale
);
177 /* Setup water-specific parameters */
178 inr
= nlist
->iinr
[0];
179 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
180 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
181 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
182 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
183 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
185 jq1
= _mm256_set1_ps(charge
[inr
+1]);
186 jq2
= _mm256_set1_ps(charge
[inr
+2]);
187 jq3
= _mm256_set1_ps(charge
[inr
+3]);
188 vdwjidx0A
= 2*vdwtype
[inr
+0];
189 c6_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
]);
190 c12_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
+1]);
191 c6grid_00
= _mm256_set1_ps(vdwgridioffsetptr0
[vdwjidx0A
]);
192 qq11
= _mm256_mul_ps(iq1
,jq1
);
193 qq12
= _mm256_mul_ps(iq1
,jq2
);
194 qq13
= _mm256_mul_ps(iq1
,jq3
);
195 qq21
= _mm256_mul_ps(iq2
,jq1
);
196 qq22
= _mm256_mul_ps(iq2
,jq2
);
197 qq23
= _mm256_mul_ps(iq2
,jq3
);
198 qq31
= _mm256_mul_ps(iq3
,jq1
);
199 qq32
= _mm256_mul_ps(iq3
,jq2
);
200 qq33
= _mm256_mul_ps(iq3
,jq3
);
202 /* Avoid stupid compiler warnings */
203 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
216 for(iidx
=0;iidx
<4*DIM
;iidx
++)
221 /* Start outer loop over neighborlists */
222 for(iidx
=0; iidx
<nri
; iidx
++)
224 /* Load shift vector for this list */
225 i_shift_offset
= DIM
*shiftidx
[iidx
];
227 /* Load limits for loop over neighbors */
228 j_index_start
= jindex
[iidx
];
229 j_index_end
= jindex
[iidx
+1];
231 /* Get outer coordinate index */
233 i_coord_offset
= DIM
*inr
;
235 /* Load i particle coords and add shift vector */
236 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
237 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
239 fix0
= _mm256_setzero_ps();
240 fiy0
= _mm256_setzero_ps();
241 fiz0
= _mm256_setzero_ps();
242 fix1
= _mm256_setzero_ps();
243 fiy1
= _mm256_setzero_ps();
244 fiz1
= _mm256_setzero_ps();
245 fix2
= _mm256_setzero_ps();
246 fiy2
= _mm256_setzero_ps();
247 fiz2
= _mm256_setzero_ps();
248 fix3
= _mm256_setzero_ps();
249 fiy3
= _mm256_setzero_ps();
250 fiz3
= _mm256_setzero_ps();
252 /* Reset potential sums */
253 velecsum
= _mm256_setzero_ps();
254 vvdwsum
= _mm256_setzero_ps();
256 /* Start inner kernel loop */
257 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
260 /* Get j neighbor index, and coordinate index */
269 j_coord_offsetA
= DIM
*jnrA
;
270 j_coord_offsetB
= DIM
*jnrB
;
271 j_coord_offsetC
= DIM
*jnrC
;
272 j_coord_offsetD
= DIM
*jnrD
;
273 j_coord_offsetE
= DIM
*jnrE
;
274 j_coord_offsetF
= DIM
*jnrF
;
275 j_coord_offsetG
= DIM
*jnrG
;
276 j_coord_offsetH
= DIM
*jnrH
;
278 /* load j atom coordinates */
279 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
280 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
281 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
282 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
283 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
284 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
286 /* Calculate displacement vector */
287 dx00
= _mm256_sub_ps(ix0
,jx0
);
288 dy00
= _mm256_sub_ps(iy0
,jy0
);
289 dz00
= _mm256_sub_ps(iz0
,jz0
);
290 dx11
= _mm256_sub_ps(ix1
,jx1
);
291 dy11
= _mm256_sub_ps(iy1
,jy1
);
292 dz11
= _mm256_sub_ps(iz1
,jz1
);
293 dx12
= _mm256_sub_ps(ix1
,jx2
);
294 dy12
= _mm256_sub_ps(iy1
,jy2
);
295 dz12
= _mm256_sub_ps(iz1
,jz2
);
296 dx13
= _mm256_sub_ps(ix1
,jx3
);
297 dy13
= _mm256_sub_ps(iy1
,jy3
);
298 dz13
= _mm256_sub_ps(iz1
,jz3
);
299 dx21
= _mm256_sub_ps(ix2
,jx1
);
300 dy21
= _mm256_sub_ps(iy2
,jy1
);
301 dz21
= _mm256_sub_ps(iz2
,jz1
);
302 dx22
= _mm256_sub_ps(ix2
,jx2
);
303 dy22
= _mm256_sub_ps(iy2
,jy2
);
304 dz22
= _mm256_sub_ps(iz2
,jz2
);
305 dx23
= _mm256_sub_ps(ix2
,jx3
);
306 dy23
= _mm256_sub_ps(iy2
,jy3
);
307 dz23
= _mm256_sub_ps(iz2
,jz3
);
308 dx31
= _mm256_sub_ps(ix3
,jx1
);
309 dy31
= _mm256_sub_ps(iy3
,jy1
);
310 dz31
= _mm256_sub_ps(iz3
,jz1
);
311 dx32
= _mm256_sub_ps(ix3
,jx2
);
312 dy32
= _mm256_sub_ps(iy3
,jy2
);
313 dz32
= _mm256_sub_ps(iz3
,jz2
);
314 dx33
= _mm256_sub_ps(ix3
,jx3
);
315 dy33
= _mm256_sub_ps(iy3
,jy3
);
316 dz33
= _mm256_sub_ps(iz3
,jz3
);
318 /* Calculate squared distance and things based on it */
319 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
320 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
321 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
322 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
323 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
324 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
325 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
326 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
327 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
328 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
330 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
331 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
332 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
333 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
334 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
335 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
336 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
337 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
338 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
339 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
341 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
342 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
343 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
344 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
345 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
346 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
347 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
348 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
349 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
350 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
352 fjx0
= _mm256_setzero_ps();
353 fjy0
= _mm256_setzero_ps();
354 fjz0
= _mm256_setzero_ps();
355 fjx1
= _mm256_setzero_ps();
356 fjy1
= _mm256_setzero_ps();
357 fjz1
= _mm256_setzero_ps();
358 fjx2
= _mm256_setzero_ps();
359 fjy2
= _mm256_setzero_ps();
360 fjz2
= _mm256_setzero_ps();
361 fjx3
= _mm256_setzero_ps();
362 fjy3
= _mm256_setzero_ps();
363 fjz3
= _mm256_setzero_ps();
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 r00
= _mm256_mul_ps(rsq00
,rinv00
);
371 /* Analytical LJ-PME */
372 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
373 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
374 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
375 exponent
= gmx_simd_exp_r(ewcljrsq
);
376 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
377 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
378 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
379 vvdw6
= _mm256_mul_ps(_mm256_sub_ps(c6_00
,_mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
))),rinvsix
);
380 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
381 vvdw
= _mm256_sub_ps(_mm256_mul_ps(vvdw12
,one_twelfth
),_mm256_mul_ps(vvdw6
,one_sixth
));
382 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
383 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,_mm256_sub_ps(vvdw6
,_mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
)))),rinvsq00
);
385 /* Update potential sum for this i atom from the interaction with this j atom. */
386 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
390 /* Calculate temporary vectorial force */
391 tx
= _mm256_mul_ps(fscal
,dx00
);
392 ty
= _mm256_mul_ps(fscal
,dy00
);
393 tz
= _mm256_mul_ps(fscal
,dz00
);
395 /* Update vectorial force */
396 fix0
= _mm256_add_ps(fix0
,tx
);
397 fiy0
= _mm256_add_ps(fiy0
,ty
);
398 fiz0
= _mm256_add_ps(fiz0
,tz
);
400 fjx0
= _mm256_add_ps(fjx0
,tx
);
401 fjy0
= _mm256_add_ps(fjy0
,ty
);
402 fjz0
= _mm256_add_ps(fjz0
,tz
);
404 /**************************
405 * CALCULATE INTERACTIONS *
406 **************************/
408 r11
= _mm256_mul_ps(rsq11
,rinv11
);
410 /* EWALD ELECTROSTATICS */
412 /* Analytical PME correction */
413 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
414 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
415 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
416 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
417 felec
= _mm256_mul_ps(qq11
,felec
);
418 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
419 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
420 velec
= _mm256_sub_ps(rinv11
,pmecorrV
);
421 velec
= _mm256_mul_ps(qq11
,velec
);
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velecsum
= _mm256_add_ps(velecsum
,velec
);
428 /* Calculate temporary vectorial force */
429 tx
= _mm256_mul_ps(fscal
,dx11
);
430 ty
= _mm256_mul_ps(fscal
,dy11
);
431 tz
= _mm256_mul_ps(fscal
,dz11
);
433 /* Update vectorial force */
434 fix1
= _mm256_add_ps(fix1
,tx
);
435 fiy1
= _mm256_add_ps(fiy1
,ty
);
436 fiz1
= _mm256_add_ps(fiz1
,tz
);
438 fjx1
= _mm256_add_ps(fjx1
,tx
);
439 fjy1
= _mm256_add_ps(fjy1
,ty
);
440 fjz1
= _mm256_add_ps(fjz1
,tz
);
442 /**************************
443 * CALCULATE INTERACTIONS *
444 **************************/
446 r12
= _mm256_mul_ps(rsq12
,rinv12
);
448 /* EWALD ELECTROSTATICS */
450 /* Analytical PME correction */
451 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
452 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
453 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
454 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
455 felec
= _mm256_mul_ps(qq12
,felec
);
456 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
457 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
458 velec
= _mm256_sub_ps(rinv12
,pmecorrV
);
459 velec
= _mm256_mul_ps(qq12
,velec
);
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum
= _mm256_add_ps(velecsum
,velec
);
466 /* Calculate temporary vectorial force */
467 tx
= _mm256_mul_ps(fscal
,dx12
);
468 ty
= _mm256_mul_ps(fscal
,dy12
);
469 tz
= _mm256_mul_ps(fscal
,dz12
);
471 /* Update vectorial force */
472 fix1
= _mm256_add_ps(fix1
,tx
);
473 fiy1
= _mm256_add_ps(fiy1
,ty
);
474 fiz1
= _mm256_add_ps(fiz1
,tz
);
476 fjx2
= _mm256_add_ps(fjx2
,tx
);
477 fjy2
= _mm256_add_ps(fjy2
,ty
);
478 fjz2
= _mm256_add_ps(fjz2
,tz
);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 r13
= _mm256_mul_ps(rsq13
,rinv13
);
486 /* EWALD ELECTROSTATICS */
488 /* Analytical PME correction */
489 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
490 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
491 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
492 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
493 felec
= _mm256_mul_ps(qq13
,felec
);
494 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
495 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
496 velec
= _mm256_sub_ps(rinv13
,pmecorrV
);
497 velec
= _mm256_mul_ps(qq13
,velec
);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum
= _mm256_add_ps(velecsum
,velec
);
504 /* Calculate temporary vectorial force */
505 tx
= _mm256_mul_ps(fscal
,dx13
);
506 ty
= _mm256_mul_ps(fscal
,dy13
);
507 tz
= _mm256_mul_ps(fscal
,dz13
);
509 /* Update vectorial force */
510 fix1
= _mm256_add_ps(fix1
,tx
);
511 fiy1
= _mm256_add_ps(fiy1
,ty
);
512 fiz1
= _mm256_add_ps(fiz1
,tz
);
514 fjx3
= _mm256_add_ps(fjx3
,tx
);
515 fjy3
= _mm256_add_ps(fjy3
,ty
);
516 fjz3
= _mm256_add_ps(fjz3
,tz
);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 r21
= _mm256_mul_ps(rsq21
,rinv21
);
524 /* EWALD ELECTROSTATICS */
526 /* Analytical PME correction */
527 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
528 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
529 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
530 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
531 felec
= _mm256_mul_ps(qq21
,felec
);
532 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
533 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
534 velec
= _mm256_sub_ps(rinv21
,pmecorrV
);
535 velec
= _mm256_mul_ps(qq21
,velec
);
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velecsum
= _mm256_add_ps(velecsum
,velec
);
542 /* Calculate temporary vectorial force */
543 tx
= _mm256_mul_ps(fscal
,dx21
);
544 ty
= _mm256_mul_ps(fscal
,dy21
);
545 tz
= _mm256_mul_ps(fscal
,dz21
);
547 /* Update vectorial force */
548 fix2
= _mm256_add_ps(fix2
,tx
);
549 fiy2
= _mm256_add_ps(fiy2
,ty
);
550 fiz2
= _mm256_add_ps(fiz2
,tz
);
552 fjx1
= _mm256_add_ps(fjx1
,tx
);
553 fjy1
= _mm256_add_ps(fjy1
,ty
);
554 fjz1
= _mm256_add_ps(fjz1
,tz
);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 r22
= _mm256_mul_ps(rsq22
,rinv22
);
562 /* EWALD ELECTROSTATICS */
564 /* Analytical PME correction */
565 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
566 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
567 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
568 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
569 felec
= _mm256_mul_ps(qq22
,felec
);
570 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
571 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
572 velec
= _mm256_sub_ps(rinv22
,pmecorrV
);
573 velec
= _mm256_mul_ps(qq22
,velec
);
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velecsum
= _mm256_add_ps(velecsum
,velec
);
580 /* Calculate temporary vectorial force */
581 tx
= _mm256_mul_ps(fscal
,dx22
);
582 ty
= _mm256_mul_ps(fscal
,dy22
);
583 tz
= _mm256_mul_ps(fscal
,dz22
);
585 /* Update vectorial force */
586 fix2
= _mm256_add_ps(fix2
,tx
);
587 fiy2
= _mm256_add_ps(fiy2
,ty
);
588 fiz2
= _mm256_add_ps(fiz2
,tz
);
590 fjx2
= _mm256_add_ps(fjx2
,tx
);
591 fjy2
= _mm256_add_ps(fjy2
,ty
);
592 fjz2
= _mm256_add_ps(fjz2
,tz
);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 r23
= _mm256_mul_ps(rsq23
,rinv23
);
600 /* EWALD ELECTROSTATICS */
602 /* Analytical PME correction */
603 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
604 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
605 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
606 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
607 felec
= _mm256_mul_ps(qq23
,felec
);
608 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
609 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
610 velec
= _mm256_sub_ps(rinv23
,pmecorrV
);
611 velec
= _mm256_mul_ps(qq23
,velec
);
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velecsum
= _mm256_add_ps(velecsum
,velec
);
618 /* Calculate temporary vectorial force */
619 tx
= _mm256_mul_ps(fscal
,dx23
);
620 ty
= _mm256_mul_ps(fscal
,dy23
);
621 tz
= _mm256_mul_ps(fscal
,dz23
);
623 /* Update vectorial force */
624 fix2
= _mm256_add_ps(fix2
,tx
);
625 fiy2
= _mm256_add_ps(fiy2
,ty
);
626 fiz2
= _mm256_add_ps(fiz2
,tz
);
628 fjx3
= _mm256_add_ps(fjx3
,tx
);
629 fjy3
= _mm256_add_ps(fjy3
,ty
);
630 fjz3
= _mm256_add_ps(fjz3
,tz
);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 r31
= _mm256_mul_ps(rsq31
,rinv31
);
638 /* EWALD ELECTROSTATICS */
640 /* Analytical PME correction */
641 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
642 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
643 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
644 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
645 felec
= _mm256_mul_ps(qq31
,felec
);
646 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
647 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
648 velec
= _mm256_sub_ps(rinv31
,pmecorrV
);
649 velec
= _mm256_mul_ps(qq31
,velec
);
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum
= _mm256_add_ps(velecsum
,velec
);
656 /* Calculate temporary vectorial force */
657 tx
= _mm256_mul_ps(fscal
,dx31
);
658 ty
= _mm256_mul_ps(fscal
,dy31
);
659 tz
= _mm256_mul_ps(fscal
,dz31
);
661 /* Update vectorial force */
662 fix3
= _mm256_add_ps(fix3
,tx
);
663 fiy3
= _mm256_add_ps(fiy3
,ty
);
664 fiz3
= _mm256_add_ps(fiz3
,tz
);
666 fjx1
= _mm256_add_ps(fjx1
,tx
);
667 fjy1
= _mm256_add_ps(fjy1
,ty
);
668 fjz1
= _mm256_add_ps(fjz1
,tz
);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r32
= _mm256_mul_ps(rsq32
,rinv32
);
676 /* EWALD ELECTROSTATICS */
678 /* Analytical PME correction */
679 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
680 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
681 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
682 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
683 felec
= _mm256_mul_ps(qq32
,felec
);
684 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
685 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
686 velec
= _mm256_sub_ps(rinv32
,pmecorrV
);
687 velec
= _mm256_mul_ps(qq32
,velec
);
689 /* Update potential sum for this i atom from the interaction with this j atom. */
690 velecsum
= _mm256_add_ps(velecsum
,velec
);
694 /* Calculate temporary vectorial force */
695 tx
= _mm256_mul_ps(fscal
,dx32
);
696 ty
= _mm256_mul_ps(fscal
,dy32
);
697 tz
= _mm256_mul_ps(fscal
,dz32
);
699 /* Update vectorial force */
700 fix3
= _mm256_add_ps(fix3
,tx
);
701 fiy3
= _mm256_add_ps(fiy3
,ty
);
702 fiz3
= _mm256_add_ps(fiz3
,tz
);
704 fjx2
= _mm256_add_ps(fjx2
,tx
);
705 fjy2
= _mm256_add_ps(fjy2
,ty
);
706 fjz2
= _mm256_add_ps(fjz2
,tz
);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 r33
= _mm256_mul_ps(rsq33
,rinv33
);
714 /* EWALD ELECTROSTATICS */
716 /* Analytical PME correction */
717 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
718 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
719 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
720 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
721 felec
= _mm256_mul_ps(qq33
,felec
);
722 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
723 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
724 velec
= _mm256_sub_ps(rinv33
,pmecorrV
);
725 velec
= _mm256_mul_ps(qq33
,velec
);
727 /* Update potential sum for this i atom from the interaction with this j atom. */
728 velecsum
= _mm256_add_ps(velecsum
,velec
);
732 /* Calculate temporary vectorial force */
733 tx
= _mm256_mul_ps(fscal
,dx33
);
734 ty
= _mm256_mul_ps(fscal
,dy33
);
735 tz
= _mm256_mul_ps(fscal
,dz33
);
737 /* Update vectorial force */
738 fix3
= _mm256_add_ps(fix3
,tx
);
739 fiy3
= _mm256_add_ps(fiy3
,ty
);
740 fiz3
= _mm256_add_ps(fiz3
,tz
);
742 fjx3
= _mm256_add_ps(fjx3
,tx
);
743 fjy3
= _mm256_add_ps(fjy3
,ty
);
744 fjz3
= _mm256_add_ps(fjz3
,tz
);
746 fjptrA
= f
+j_coord_offsetA
;
747 fjptrB
= f
+j_coord_offsetB
;
748 fjptrC
= f
+j_coord_offsetC
;
749 fjptrD
= f
+j_coord_offsetD
;
750 fjptrE
= f
+j_coord_offsetE
;
751 fjptrF
= f
+j_coord_offsetF
;
752 fjptrG
= f
+j_coord_offsetG
;
753 fjptrH
= f
+j_coord_offsetH
;
755 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
756 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
757 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
759 /* Inner loop uses 810 flops */
765 /* Get j neighbor index, and coordinate index */
766 jnrlistA
= jjnr
[jidx
];
767 jnrlistB
= jjnr
[jidx
+1];
768 jnrlistC
= jjnr
[jidx
+2];
769 jnrlistD
= jjnr
[jidx
+3];
770 jnrlistE
= jjnr
[jidx
+4];
771 jnrlistF
= jjnr
[jidx
+5];
772 jnrlistG
= jjnr
[jidx
+6];
773 jnrlistH
= jjnr
[jidx
+7];
774 /* Sign of each element will be negative for non-real atoms.
775 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
776 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
778 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
779 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
781 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
782 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
783 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
784 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
785 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
786 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
787 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
788 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
789 j_coord_offsetA
= DIM
*jnrA
;
790 j_coord_offsetB
= DIM
*jnrB
;
791 j_coord_offsetC
= DIM
*jnrC
;
792 j_coord_offsetD
= DIM
*jnrD
;
793 j_coord_offsetE
= DIM
*jnrE
;
794 j_coord_offsetF
= DIM
*jnrF
;
795 j_coord_offsetG
= DIM
*jnrG
;
796 j_coord_offsetH
= DIM
*jnrH
;
798 /* load j atom coordinates */
799 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
800 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
801 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
802 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
803 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
804 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
806 /* Calculate displacement vector */
807 dx00
= _mm256_sub_ps(ix0
,jx0
);
808 dy00
= _mm256_sub_ps(iy0
,jy0
);
809 dz00
= _mm256_sub_ps(iz0
,jz0
);
810 dx11
= _mm256_sub_ps(ix1
,jx1
);
811 dy11
= _mm256_sub_ps(iy1
,jy1
);
812 dz11
= _mm256_sub_ps(iz1
,jz1
);
813 dx12
= _mm256_sub_ps(ix1
,jx2
);
814 dy12
= _mm256_sub_ps(iy1
,jy2
);
815 dz12
= _mm256_sub_ps(iz1
,jz2
);
816 dx13
= _mm256_sub_ps(ix1
,jx3
);
817 dy13
= _mm256_sub_ps(iy1
,jy3
);
818 dz13
= _mm256_sub_ps(iz1
,jz3
);
819 dx21
= _mm256_sub_ps(ix2
,jx1
);
820 dy21
= _mm256_sub_ps(iy2
,jy1
);
821 dz21
= _mm256_sub_ps(iz2
,jz1
);
822 dx22
= _mm256_sub_ps(ix2
,jx2
);
823 dy22
= _mm256_sub_ps(iy2
,jy2
);
824 dz22
= _mm256_sub_ps(iz2
,jz2
);
825 dx23
= _mm256_sub_ps(ix2
,jx3
);
826 dy23
= _mm256_sub_ps(iy2
,jy3
);
827 dz23
= _mm256_sub_ps(iz2
,jz3
);
828 dx31
= _mm256_sub_ps(ix3
,jx1
);
829 dy31
= _mm256_sub_ps(iy3
,jy1
);
830 dz31
= _mm256_sub_ps(iz3
,jz1
);
831 dx32
= _mm256_sub_ps(ix3
,jx2
);
832 dy32
= _mm256_sub_ps(iy3
,jy2
);
833 dz32
= _mm256_sub_ps(iz3
,jz2
);
834 dx33
= _mm256_sub_ps(ix3
,jx3
);
835 dy33
= _mm256_sub_ps(iy3
,jy3
);
836 dz33
= _mm256_sub_ps(iz3
,jz3
);
838 /* Calculate squared distance and things based on it */
839 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
840 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
841 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
842 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
843 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
844 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
845 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
846 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
847 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
848 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
850 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
851 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
852 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
853 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
854 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
855 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
856 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
857 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
858 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
859 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
861 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
862 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
863 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
864 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
865 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
866 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
867 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
868 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
869 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
870 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
872 fjx0
= _mm256_setzero_ps();
873 fjy0
= _mm256_setzero_ps();
874 fjz0
= _mm256_setzero_ps();
875 fjx1
= _mm256_setzero_ps();
876 fjy1
= _mm256_setzero_ps();
877 fjz1
= _mm256_setzero_ps();
878 fjx2
= _mm256_setzero_ps();
879 fjy2
= _mm256_setzero_ps();
880 fjz2
= _mm256_setzero_ps();
881 fjx3
= _mm256_setzero_ps();
882 fjy3
= _mm256_setzero_ps();
883 fjz3
= _mm256_setzero_ps();
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
889 r00
= _mm256_mul_ps(rsq00
,rinv00
);
890 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
892 /* Analytical LJ-PME */
893 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
894 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
895 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
896 exponent
= gmx_simd_exp_r(ewcljrsq
);
897 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
898 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
899 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
900 vvdw6
= _mm256_mul_ps(_mm256_sub_ps(c6_00
,_mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
))),rinvsix
);
901 vvdw12
= _mm256_mul_ps(c12_00
,_mm256_mul_ps(rinvsix
,rinvsix
));
902 vvdw
= _mm256_sub_ps(_mm256_mul_ps(vvdw12
,one_twelfth
),_mm256_mul_ps(vvdw6
,one_sixth
));
903 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
904 fvdw
= _mm256_mul_ps(_mm256_sub_ps(vvdw12
,_mm256_sub_ps(vvdw6
,_mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
)))),rinvsq00
);
906 /* Update potential sum for this i atom from the interaction with this j atom. */
907 vvdw
= _mm256_andnot_ps(dummy_mask
,vvdw
);
908 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
912 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
914 /* Calculate temporary vectorial force */
915 tx
= _mm256_mul_ps(fscal
,dx00
);
916 ty
= _mm256_mul_ps(fscal
,dy00
);
917 tz
= _mm256_mul_ps(fscal
,dz00
);
919 /* Update vectorial force */
920 fix0
= _mm256_add_ps(fix0
,tx
);
921 fiy0
= _mm256_add_ps(fiy0
,ty
);
922 fiz0
= _mm256_add_ps(fiz0
,tz
);
924 fjx0
= _mm256_add_ps(fjx0
,tx
);
925 fjy0
= _mm256_add_ps(fjy0
,ty
);
926 fjz0
= _mm256_add_ps(fjz0
,tz
);
928 /**************************
929 * CALCULATE INTERACTIONS *
930 **************************/
932 r11
= _mm256_mul_ps(rsq11
,rinv11
);
933 r11
= _mm256_andnot_ps(dummy_mask
,r11
);
935 /* EWALD ELECTROSTATICS */
937 /* Analytical PME correction */
938 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
939 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
940 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
941 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
942 felec
= _mm256_mul_ps(qq11
,felec
);
943 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
944 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
945 velec
= _mm256_sub_ps(rinv11
,pmecorrV
);
946 velec
= _mm256_mul_ps(qq11
,velec
);
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
950 velecsum
= _mm256_add_ps(velecsum
,velec
);
954 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
956 /* Calculate temporary vectorial force */
957 tx
= _mm256_mul_ps(fscal
,dx11
);
958 ty
= _mm256_mul_ps(fscal
,dy11
);
959 tz
= _mm256_mul_ps(fscal
,dz11
);
961 /* Update vectorial force */
962 fix1
= _mm256_add_ps(fix1
,tx
);
963 fiy1
= _mm256_add_ps(fiy1
,ty
);
964 fiz1
= _mm256_add_ps(fiz1
,tz
);
966 fjx1
= _mm256_add_ps(fjx1
,tx
);
967 fjy1
= _mm256_add_ps(fjy1
,ty
);
968 fjz1
= _mm256_add_ps(fjz1
,tz
);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r12
= _mm256_mul_ps(rsq12
,rinv12
);
975 r12
= _mm256_andnot_ps(dummy_mask
,r12
);
977 /* EWALD ELECTROSTATICS */
979 /* Analytical PME correction */
980 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
981 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
982 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
983 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
984 felec
= _mm256_mul_ps(qq12
,felec
);
985 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
986 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
987 velec
= _mm256_sub_ps(rinv12
,pmecorrV
);
988 velec
= _mm256_mul_ps(qq12
,velec
);
990 /* Update potential sum for this i atom from the interaction with this j atom. */
991 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
992 velecsum
= _mm256_add_ps(velecsum
,velec
);
996 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
998 /* Calculate temporary vectorial force */
999 tx
= _mm256_mul_ps(fscal
,dx12
);
1000 ty
= _mm256_mul_ps(fscal
,dy12
);
1001 tz
= _mm256_mul_ps(fscal
,dz12
);
1003 /* Update vectorial force */
1004 fix1
= _mm256_add_ps(fix1
,tx
);
1005 fiy1
= _mm256_add_ps(fiy1
,ty
);
1006 fiz1
= _mm256_add_ps(fiz1
,tz
);
1008 fjx2
= _mm256_add_ps(fjx2
,tx
);
1009 fjy2
= _mm256_add_ps(fjy2
,ty
);
1010 fjz2
= _mm256_add_ps(fjz2
,tz
);
1012 /**************************
1013 * CALCULATE INTERACTIONS *
1014 **************************/
1016 r13
= _mm256_mul_ps(rsq13
,rinv13
);
1017 r13
= _mm256_andnot_ps(dummy_mask
,r13
);
1019 /* EWALD ELECTROSTATICS */
1021 /* Analytical PME correction */
1022 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
1023 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
1024 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1025 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1026 felec
= _mm256_mul_ps(qq13
,felec
);
1027 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1028 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1029 velec
= _mm256_sub_ps(rinv13
,pmecorrV
);
1030 velec
= _mm256_mul_ps(qq13
,velec
);
1032 /* Update potential sum for this i atom from the interaction with this j atom. */
1033 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1034 velecsum
= _mm256_add_ps(velecsum
,velec
);
1038 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1040 /* Calculate temporary vectorial force */
1041 tx
= _mm256_mul_ps(fscal
,dx13
);
1042 ty
= _mm256_mul_ps(fscal
,dy13
);
1043 tz
= _mm256_mul_ps(fscal
,dz13
);
1045 /* Update vectorial force */
1046 fix1
= _mm256_add_ps(fix1
,tx
);
1047 fiy1
= _mm256_add_ps(fiy1
,ty
);
1048 fiz1
= _mm256_add_ps(fiz1
,tz
);
1050 fjx3
= _mm256_add_ps(fjx3
,tx
);
1051 fjy3
= _mm256_add_ps(fjy3
,ty
);
1052 fjz3
= _mm256_add_ps(fjz3
,tz
);
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 r21
= _mm256_mul_ps(rsq21
,rinv21
);
1059 r21
= _mm256_andnot_ps(dummy_mask
,r21
);
1061 /* EWALD ELECTROSTATICS */
1063 /* Analytical PME correction */
1064 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
1065 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
1066 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1067 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1068 felec
= _mm256_mul_ps(qq21
,felec
);
1069 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1070 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1071 velec
= _mm256_sub_ps(rinv21
,pmecorrV
);
1072 velec
= _mm256_mul_ps(qq21
,velec
);
1074 /* Update potential sum for this i atom from the interaction with this j atom. */
1075 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1076 velecsum
= _mm256_add_ps(velecsum
,velec
);
1080 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1082 /* Calculate temporary vectorial force */
1083 tx
= _mm256_mul_ps(fscal
,dx21
);
1084 ty
= _mm256_mul_ps(fscal
,dy21
);
1085 tz
= _mm256_mul_ps(fscal
,dz21
);
1087 /* Update vectorial force */
1088 fix2
= _mm256_add_ps(fix2
,tx
);
1089 fiy2
= _mm256_add_ps(fiy2
,ty
);
1090 fiz2
= _mm256_add_ps(fiz2
,tz
);
1092 fjx1
= _mm256_add_ps(fjx1
,tx
);
1093 fjy1
= _mm256_add_ps(fjy1
,ty
);
1094 fjz1
= _mm256_add_ps(fjz1
,tz
);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 r22
= _mm256_mul_ps(rsq22
,rinv22
);
1101 r22
= _mm256_andnot_ps(dummy_mask
,r22
);
1103 /* EWALD ELECTROSTATICS */
1105 /* Analytical PME correction */
1106 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
1107 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
1108 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1109 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1110 felec
= _mm256_mul_ps(qq22
,felec
);
1111 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1112 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1113 velec
= _mm256_sub_ps(rinv22
,pmecorrV
);
1114 velec
= _mm256_mul_ps(qq22
,velec
);
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1118 velecsum
= _mm256_add_ps(velecsum
,velec
);
1122 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1124 /* Calculate temporary vectorial force */
1125 tx
= _mm256_mul_ps(fscal
,dx22
);
1126 ty
= _mm256_mul_ps(fscal
,dy22
);
1127 tz
= _mm256_mul_ps(fscal
,dz22
);
1129 /* Update vectorial force */
1130 fix2
= _mm256_add_ps(fix2
,tx
);
1131 fiy2
= _mm256_add_ps(fiy2
,ty
);
1132 fiz2
= _mm256_add_ps(fiz2
,tz
);
1134 fjx2
= _mm256_add_ps(fjx2
,tx
);
1135 fjy2
= _mm256_add_ps(fjy2
,ty
);
1136 fjz2
= _mm256_add_ps(fjz2
,tz
);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 r23
= _mm256_mul_ps(rsq23
,rinv23
);
1143 r23
= _mm256_andnot_ps(dummy_mask
,r23
);
1145 /* EWALD ELECTROSTATICS */
1147 /* Analytical PME correction */
1148 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
1149 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
1150 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1151 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1152 felec
= _mm256_mul_ps(qq23
,felec
);
1153 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1154 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1155 velec
= _mm256_sub_ps(rinv23
,pmecorrV
);
1156 velec
= _mm256_mul_ps(qq23
,velec
);
1158 /* Update potential sum for this i atom from the interaction with this j atom. */
1159 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1160 velecsum
= _mm256_add_ps(velecsum
,velec
);
1164 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1166 /* Calculate temporary vectorial force */
1167 tx
= _mm256_mul_ps(fscal
,dx23
);
1168 ty
= _mm256_mul_ps(fscal
,dy23
);
1169 tz
= _mm256_mul_ps(fscal
,dz23
);
1171 /* Update vectorial force */
1172 fix2
= _mm256_add_ps(fix2
,tx
);
1173 fiy2
= _mm256_add_ps(fiy2
,ty
);
1174 fiz2
= _mm256_add_ps(fiz2
,tz
);
1176 fjx3
= _mm256_add_ps(fjx3
,tx
);
1177 fjy3
= _mm256_add_ps(fjy3
,ty
);
1178 fjz3
= _mm256_add_ps(fjz3
,tz
);
1180 /**************************
1181 * CALCULATE INTERACTIONS *
1182 **************************/
1184 r31
= _mm256_mul_ps(rsq31
,rinv31
);
1185 r31
= _mm256_andnot_ps(dummy_mask
,r31
);
1187 /* EWALD ELECTROSTATICS */
1189 /* Analytical PME correction */
1190 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
1191 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
1192 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1193 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1194 felec
= _mm256_mul_ps(qq31
,felec
);
1195 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1196 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1197 velec
= _mm256_sub_ps(rinv31
,pmecorrV
);
1198 velec
= _mm256_mul_ps(qq31
,velec
);
1200 /* Update potential sum for this i atom from the interaction with this j atom. */
1201 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1202 velecsum
= _mm256_add_ps(velecsum
,velec
);
1206 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1208 /* Calculate temporary vectorial force */
1209 tx
= _mm256_mul_ps(fscal
,dx31
);
1210 ty
= _mm256_mul_ps(fscal
,dy31
);
1211 tz
= _mm256_mul_ps(fscal
,dz31
);
1213 /* Update vectorial force */
1214 fix3
= _mm256_add_ps(fix3
,tx
);
1215 fiy3
= _mm256_add_ps(fiy3
,ty
);
1216 fiz3
= _mm256_add_ps(fiz3
,tz
);
1218 fjx1
= _mm256_add_ps(fjx1
,tx
);
1219 fjy1
= _mm256_add_ps(fjy1
,ty
);
1220 fjz1
= _mm256_add_ps(fjz1
,tz
);
1222 /**************************
1223 * CALCULATE INTERACTIONS *
1224 **************************/
1226 r32
= _mm256_mul_ps(rsq32
,rinv32
);
1227 r32
= _mm256_andnot_ps(dummy_mask
,r32
);
1229 /* EWALD ELECTROSTATICS */
1231 /* Analytical PME correction */
1232 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
1233 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
1234 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1235 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1236 felec
= _mm256_mul_ps(qq32
,felec
);
1237 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1238 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1239 velec
= _mm256_sub_ps(rinv32
,pmecorrV
);
1240 velec
= _mm256_mul_ps(qq32
,velec
);
1242 /* Update potential sum for this i atom from the interaction with this j atom. */
1243 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1244 velecsum
= _mm256_add_ps(velecsum
,velec
);
1248 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1250 /* Calculate temporary vectorial force */
1251 tx
= _mm256_mul_ps(fscal
,dx32
);
1252 ty
= _mm256_mul_ps(fscal
,dy32
);
1253 tz
= _mm256_mul_ps(fscal
,dz32
);
1255 /* Update vectorial force */
1256 fix3
= _mm256_add_ps(fix3
,tx
);
1257 fiy3
= _mm256_add_ps(fiy3
,ty
);
1258 fiz3
= _mm256_add_ps(fiz3
,tz
);
1260 fjx2
= _mm256_add_ps(fjx2
,tx
);
1261 fjy2
= _mm256_add_ps(fjy2
,ty
);
1262 fjz2
= _mm256_add_ps(fjz2
,tz
);
1264 /**************************
1265 * CALCULATE INTERACTIONS *
1266 **************************/
1268 r33
= _mm256_mul_ps(rsq33
,rinv33
);
1269 r33
= _mm256_andnot_ps(dummy_mask
,r33
);
1271 /* EWALD ELECTROSTATICS */
1273 /* Analytical PME correction */
1274 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
1275 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
1276 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1277 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1278 felec
= _mm256_mul_ps(qq33
,felec
);
1279 pmecorrV
= gmx_mm256_pmecorrV_ps(zeta2
);
1280 pmecorrV
= _mm256_mul_ps(pmecorrV
,beta
);
1281 velec
= _mm256_sub_ps(rinv33
,pmecorrV
);
1282 velec
= _mm256_mul_ps(qq33
,velec
);
1284 /* Update potential sum for this i atom from the interaction with this j atom. */
1285 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1286 velecsum
= _mm256_add_ps(velecsum
,velec
);
1290 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1292 /* Calculate temporary vectorial force */
1293 tx
= _mm256_mul_ps(fscal
,dx33
);
1294 ty
= _mm256_mul_ps(fscal
,dy33
);
1295 tz
= _mm256_mul_ps(fscal
,dz33
);
1297 /* Update vectorial force */
1298 fix3
= _mm256_add_ps(fix3
,tx
);
1299 fiy3
= _mm256_add_ps(fiy3
,ty
);
1300 fiz3
= _mm256_add_ps(fiz3
,tz
);
1302 fjx3
= _mm256_add_ps(fjx3
,tx
);
1303 fjy3
= _mm256_add_ps(fjy3
,ty
);
1304 fjz3
= _mm256_add_ps(fjz3
,tz
);
1306 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1307 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1308 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1309 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1310 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
1311 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
1312 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
1313 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
1315 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
1316 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1317 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1319 /* Inner loop uses 820 flops */
1322 /* End of innermost loop */
1324 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1325 f
+i_coord_offset
,fshift
+i_shift_offset
);
1328 /* Update potential energies */
1329 gmx_mm256_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1330 gmx_mm256_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1332 /* Increment number of inner iterations */
1333 inneriter
+= j_index_end
- j_index_start
;
1335 /* Outer loop uses 26 flops */
1338 /* Increment number of outer iterations */
1341 /* Update outer/inner flops */
1343 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*820);
1346 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single
1347 * Electrostatics interaction: Ewald
1348 * VdW interaction: LJEwald
1349 * Geometry: Water4-Water4
1350 * Calculate force/pot: Force
1353 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_single
1354 (t_nblist
* gmx_restrict nlist
,
1355 rvec
* gmx_restrict xx
,
1356 rvec
* gmx_restrict ff
,
1357 t_forcerec
* gmx_restrict fr
,
1358 t_mdatoms
* gmx_restrict mdatoms
,
1359 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1360 t_nrnb
* gmx_restrict nrnb
)
1362 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1363 * just 0 for non-waters.
1364 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1365 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1367 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1368 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1369 int jnrA
,jnrB
,jnrC
,jnrD
;
1370 int jnrE
,jnrF
,jnrG
,jnrH
;
1371 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1372 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1373 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1374 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
1375 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1376 real rcutoff_scalar
;
1377 real
*shiftvec
,*fshift
,*x
,*f
;
1378 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
1379 real scratch
[4*DIM
];
1380 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1381 real
* vdwioffsetptr0
;
1382 real
* vdwgridioffsetptr0
;
1383 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1384 real
* vdwioffsetptr1
;
1385 real
* vdwgridioffsetptr1
;
1386 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1387 real
* vdwioffsetptr2
;
1388 real
* vdwgridioffsetptr2
;
1389 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1390 real
* vdwioffsetptr3
;
1391 real
* vdwgridioffsetptr3
;
1392 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1393 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
1394 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1395 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
,vdwjidx1E
,vdwjidx1F
,vdwjidx1G
,vdwjidx1H
;
1396 __m256 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1397 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
,vdwjidx2E
,vdwjidx2F
,vdwjidx2G
,vdwjidx2H
;
1398 __m256 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1399 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
,vdwjidx3E
,vdwjidx3F
,vdwjidx3G
,vdwjidx3H
;
1400 __m256 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1401 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1402 __m256 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1403 __m256 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1404 __m256 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1405 __m256 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1406 __m256 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1407 __m256 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1408 __m256 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1409 __m256 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1410 __m256 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1411 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1414 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1417 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
1418 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
1430 __m256 ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
1431 __m256 one_half
= _mm256_set1_ps(0.5);
1432 __m256 minus_one
= _mm256_set1_ps(-1.0);
1434 __m128i ewitab_lo
,ewitab_hi
;
1435 __m256 ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1436 __m256 beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1438 __m256 dummy_mask
,cutoff_mask
;
1439 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1440 __m256 one
= _mm256_set1_ps(1.0);
1441 __m256 two
= _mm256_set1_ps(2.0);
1447 jindex
= nlist
->jindex
;
1449 shiftidx
= nlist
->shift
;
1451 shiftvec
= fr
->shift_vec
[0];
1452 fshift
= fr
->fshift
[0];
1453 facel
= _mm256_set1_ps(fr
->epsfac
);
1454 charge
= mdatoms
->chargeA
;
1455 nvdwtype
= fr
->ntype
;
1456 vdwparam
= fr
->nbfp
;
1457 vdwtype
= mdatoms
->typeA
;
1458 vdwgridparam
= fr
->ljpme_c6grid
;
1459 sh_lj_ewald
= _mm256_set1_ps(fr
->ic
->sh_lj_ewald
);
1460 ewclj
= _mm256_set1_ps(fr
->ewaldcoeff_lj
);
1461 ewclj2
= _mm256_mul_ps(minus_one
,_mm256_mul_ps(ewclj
,ewclj
));
1463 sh_ewald
= _mm256_set1_ps(fr
->ic
->sh_ewald
);
1464 beta
= _mm256_set1_ps(fr
->ic
->ewaldcoeff_q
);
1465 beta2
= _mm256_mul_ps(beta
,beta
);
1466 beta3
= _mm256_mul_ps(beta
,beta2
);
1468 ewtab
= fr
->ic
->tabq_coul_F
;
1469 ewtabscale
= _mm256_set1_ps(fr
->ic
->tabq_scale
);
1470 ewtabhalfspace
= _mm256_set1_ps(0.5/fr
->ic
->tabq_scale
);
1472 /* Setup water-specific parameters */
1473 inr
= nlist
->iinr
[0];
1474 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
1475 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
1476 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
1477 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1478 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
1480 jq1
= _mm256_set1_ps(charge
[inr
+1]);
1481 jq2
= _mm256_set1_ps(charge
[inr
+2]);
1482 jq3
= _mm256_set1_ps(charge
[inr
+3]);
1483 vdwjidx0A
= 2*vdwtype
[inr
+0];
1484 c6_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
]);
1485 c12_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
+1]);
1486 c6grid_00
= _mm256_set1_ps(vdwgridioffsetptr0
[vdwjidx0A
]);
1487 qq11
= _mm256_mul_ps(iq1
,jq1
);
1488 qq12
= _mm256_mul_ps(iq1
,jq2
);
1489 qq13
= _mm256_mul_ps(iq1
,jq3
);
1490 qq21
= _mm256_mul_ps(iq2
,jq1
);
1491 qq22
= _mm256_mul_ps(iq2
,jq2
);
1492 qq23
= _mm256_mul_ps(iq2
,jq3
);
1493 qq31
= _mm256_mul_ps(iq3
,jq1
);
1494 qq32
= _mm256_mul_ps(iq3
,jq2
);
1495 qq33
= _mm256_mul_ps(iq3
,jq3
);
1497 /* Avoid stupid compiler warnings */
1498 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
1499 j_coord_offsetA
= 0;
1500 j_coord_offsetB
= 0;
1501 j_coord_offsetC
= 0;
1502 j_coord_offsetD
= 0;
1503 j_coord_offsetE
= 0;
1504 j_coord_offsetF
= 0;
1505 j_coord_offsetG
= 0;
1506 j_coord_offsetH
= 0;
1511 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1513 scratch
[iidx
] = 0.0;
1516 /* Start outer loop over neighborlists */
1517 for(iidx
=0; iidx
<nri
; iidx
++)
1519 /* Load shift vector for this list */
1520 i_shift_offset
= DIM
*shiftidx
[iidx
];
1522 /* Load limits for loop over neighbors */
1523 j_index_start
= jindex
[iidx
];
1524 j_index_end
= jindex
[iidx
+1];
1526 /* Get outer coordinate index */
1528 i_coord_offset
= DIM
*inr
;
1530 /* Load i particle coords and add shift vector */
1531 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1532 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1534 fix0
= _mm256_setzero_ps();
1535 fiy0
= _mm256_setzero_ps();
1536 fiz0
= _mm256_setzero_ps();
1537 fix1
= _mm256_setzero_ps();
1538 fiy1
= _mm256_setzero_ps();
1539 fiz1
= _mm256_setzero_ps();
1540 fix2
= _mm256_setzero_ps();
1541 fiy2
= _mm256_setzero_ps();
1542 fiz2
= _mm256_setzero_ps();
1543 fix3
= _mm256_setzero_ps();
1544 fiy3
= _mm256_setzero_ps();
1545 fiz3
= _mm256_setzero_ps();
1547 /* Start inner kernel loop */
1548 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
1551 /* Get j neighbor index, and coordinate index */
1553 jnrB
= jjnr
[jidx
+1];
1554 jnrC
= jjnr
[jidx
+2];
1555 jnrD
= jjnr
[jidx
+3];
1556 jnrE
= jjnr
[jidx
+4];
1557 jnrF
= jjnr
[jidx
+5];
1558 jnrG
= jjnr
[jidx
+6];
1559 jnrH
= jjnr
[jidx
+7];
1560 j_coord_offsetA
= DIM
*jnrA
;
1561 j_coord_offsetB
= DIM
*jnrB
;
1562 j_coord_offsetC
= DIM
*jnrC
;
1563 j_coord_offsetD
= DIM
*jnrD
;
1564 j_coord_offsetE
= DIM
*jnrE
;
1565 j_coord_offsetF
= DIM
*jnrF
;
1566 j_coord_offsetG
= DIM
*jnrG
;
1567 j_coord_offsetH
= DIM
*jnrH
;
1569 /* load j atom coordinates */
1570 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1571 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1572 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1573 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1574 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1575 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1577 /* Calculate displacement vector */
1578 dx00
= _mm256_sub_ps(ix0
,jx0
);
1579 dy00
= _mm256_sub_ps(iy0
,jy0
);
1580 dz00
= _mm256_sub_ps(iz0
,jz0
);
1581 dx11
= _mm256_sub_ps(ix1
,jx1
);
1582 dy11
= _mm256_sub_ps(iy1
,jy1
);
1583 dz11
= _mm256_sub_ps(iz1
,jz1
);
1584 dx12
= _mm256_sub_ps(ix1
,jx2
);
1585 dy12
= _mm256_sub_ps(iy1
,jy2
);
1586 dz12
= _mm256_sub_ps(iz1
,jz2
);
1587 dx13
= _mm256_sub_ps(ix1
,jx3
);
1588 dy13
= _mm256_sub_ps(iy1
,jy3
);
1589 dz13
= _mm256_sub_ps(iz1
,jz3
);
1590 dx21
= _mm256_sub_ps(ix2
,jx1
);
1591 dy21
= _mm256_sub_ps(iy2
,jy1
);
1592 dz21
= _mm256_sub_ps(iz2
,jz1
);
1593 dx22
= _mm256_sub_ps(ix2
,jx2
);
1594 dy22
= _mm256_sub_ps(iy2
,jy2
);
1595 dz22
= _mm256_sub_ps(iz2
,jz2
);
1596 dx23
= _mm256_sub_ps(ix2
,jx3
);
1597 dy23
= _mm256_sub_ps(iy2
,jy3
);
1598 dz23
= _mm256_sub_ps(iz2
,jz3
);
1599 dx31
= _mm256_sub_ps(ix3
,jx1
);
1600 dy31
= _mm256_sub_ps(iy3
,jy1
);
1601 dz31
= _mm256_sub_ps(iz3
,jz1
);
1602 dx32
= _mm256_sub_ps(ix3
,jx2
);
1603 dy32
= _mm256_sub_ps(iy3
,jy2
);
1604 dz32
= _mm256_sub_ps(iz3
,jz2
);
1605 dx33
= _mm256_sub_ps(ix3
,jx3
);
1606 dy33
= _mm256_sub_ps(iy3
,jy3
);
1607 dz33
= _mm256_sub_ps(iz3
,jz3
);
1609 /* Calculate squared distance and things based on it */
1610 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1611 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
1612 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
1613 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
1614 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
1615 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
1616 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
1617 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
1618 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
1619 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
1621 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
1622 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
1623 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
1624 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
1625 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
1626 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
1627 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
1628 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
1629 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
1630 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
1632 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
1633 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
1634 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
1635 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
1636 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
1637 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
1638 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
1639 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
1640 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
1641 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
1643 fjx0
= _mm256_setzero_ps();
1644 fjy0
= _mm256_setzero_ps();
1645 fjz0
= _mm256_setzero_ps();
1646 fjx1
= _mm256_setzero_ps();
1647 fjy1
= _mm256_setzero_ps();
1648 fjz1
= _mm256_setzero_ps();
1649 fjx2
= _mm256_setzero_ps();
1650 fjy2
= _mm256_setzero_ps();
1651 fjz2
= _mm256_setzero_ps();
1652 fjx3
= _mm256_setzero_ps();
1653 fjy3
= _mm256_setzero_ps();
1654 fjz3
= _mm256_setzero_ps();
1656 /**************************
1657 * CALCULATE INTERACTIONS *
1658 **************************/
1660 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1662 /* Analytical LJ-PME */
1663 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1664 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
1665 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
1666 exponent
= gmx_simd_exp_r(ewcljrsq
);
1667 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1668 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
1669 /* f6A = 6 * C6grid * (1 - poly) */
1670 f6A
= _mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
));
1671 /* f6B = C6grid * exponent * beta^6 */
1672 f6B
= _mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
));
1673 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1674 fvdw
= _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),_mm256_sub_ps(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1678 /* Calculate temporary vectorial force */
1679 tx
= _mm256_mul_ps(fscal
,dx00
);
1680 ty
= _mm256_mul_ps(fscal
,dy00
);
1681 tz
= _mm256_mul_ps(fscal
,dz00
);
1683 /* Update vectorial force */
1684 fix0
= _mm256_add_ps(fix0
,tx
);
1685 fiy0
= _mm256_add_ps(fiy0
,ty
);
1686 fiz0
= _mm256_add_ps(fiz0
,tz
);
1688 fjx0
= _mm256_add_ps(fjx0
,tx
);
1689 fjy0
= _mm256_add_ps(fjy0
,ty
);
1690 fjz0
= _mm256_add_ps(fjz0
,tz
);
1692 /**************************
1693 * CALCULATE INTERACTIONS *
1694 **************************/
1696 r11
= _mm256_mul_ps(rsq11
,rinv11
);
1698 /* EWALD ELECTROSTATICS */
1700 /* Analytical PME correction */
1701 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
1702 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
1703 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1704 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1705 felec
= _mm256_mul_ps(qq11
,felec
);
1709 /* Calculate temporary vectorial force */
1710 tx
= _mm256_mul_ps(fscal
,dx11
);
1711 ty
= _mm256_mul_ps(fscal
,dy11
);
1712 tz
= _mm256_mul_ps(fscal
,dz11
);
1714 /* Update vectorial force */
1715 fix1
= _mm256_add_ps(fix1
,tx
);
1716 fiy1
= _mm256_add_ps(fiy1
,ty
);
1717 fiz1
= _mm256_add_ps(fiz1
,tz
);
1719 fjx1
= _mm256_add_ps(fjx1
,tx
);
1720 fjy1
= _mm256_add_ps(fjy1
,ty
);
1721 fjz1
= _mm256_add_ps(fjz1
,tz
);
1723 /**************************
1724 * CALCULATE INTERACTIONS *
1725 **************************/
1727 r12
= _mm256_mul_ps(rsq12
,rinv12
);
1729 /* EWALD ELECTROSTATICS */
1731 /* Analytical PME correction */
1732 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
1733 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
1734 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1735 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1736 felec
= _mm256_mul_ps(qq12
,felec
);
1740 /* Calculate temporary vectorial force */
1741 tx
= _mm256_mul_ps(fscal
,dx12
);
1742 ty
= _mm256_mul_ps(fscal
,dy12
);
1743 tz
= _mm256_mul_ps(fscal
,dz12
);
1745 /* Update vectorial force */
1746 fix1
= _mm256_add_ps(fix1
,tx
);
1747 fiy1
= _mm256_add_ps(fiy1
,ty
);
1748 fiz1
= _mm256_add_ps(fiz1
,tz
);
1750 fjx2
= _mm256_add_ps(fjx2
,tx
);
1751 fjy2
= _mm256_add_ps(fjy2
,ty
);
1752 fjz2
= _mm256_add_ps(fjz2
,tz
);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 r13
= _mm256_mul_ps(rsq13
,rinv13
);
1760 /* EWALD ELECTROSTATICS */
1762 /* Analytical PME correction */
1763 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
1764 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
1765 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1766 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1767 felec
= _mm256_mul_ps(qq13
,felec
);
1771 /* Calculate temporary vectorial force */
1772 tx
= _mm256_mul_ps(fscal
,dx13
);
1773 ty
= _mm256_mul_ps(fscal
,dy13
);
1774 tz
= _mm256_mul_ps(fscal
,dz13
);
1776 /* Update vectorial force */
1777 fix1
= _mm256_add_ps(fix1
,tx
);
1778 fiy1
= _mm256_add_ps(fiy1
,ty
);
1779 fiz1
= _mm256_add_ps(fiz1
,tz
);
1781 fjx3
= _mm256_add_ps(fjx3
,tx
);
1782 fjy3
= _mm256_add_ps(fjy3
,ty
);
1783 fjz3
= _mm256_add_ps(fjz3
,tz
);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r21
= _mm256_mul_ps(rsq21
,rinv21
);
1791 /* EWALD ELECTROSTATICS */
1793 /* Analytical PME correction */
1794 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
1795 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
1796 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1797 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1798 felec
= _mm256_mul_ps(qq21
,felec
);
1802 /* Calculate temporary vectorial force */
1803 tx
= _mm256_mul_ps(fscal
,dx21
);
1804 ty
= _mm256_mul_ps(fscal
,dy21
);
1805 tz
= _mm256_mul_ps(fscal
,dz21
);
1807 /* Update vectorial force */
1808 fix2
= _mm256_add_ps(fix2
,tx
);
1809 fiy2
= _mm256_add_ps(fiy2
,ty
);
1810 fiz2
= _mm256_add_ps(fiz2
,tz
);
1812 fjx1
= _mm256_add_ps(fjx1
,tx
);
1813 fjy1
= _mm256_add_ps(fjy1
,ty
);
1814 fjz1
= _mm256_add_ps(fjz1
,tz
);
1816 /**************************
1817 * CALCULATE INTERACTIONS *
1818 **************************/
1820 r22
= _mm256_mul_ps(rsq22
,rinv22
);
1822 /* EWALD ELECTROSTATICS */
1824 /* Analytical PME correction */
1825 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
1826 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
1827 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1828 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1829 felec
= _mm256_mul_ps(qq22
,felec
);
1833 /* Calculate temporary vectorial force */
1834 tx
= _mm256_mul_ps(fscal
,dx22
);
1835 ty
= _mm256_mul_ps(fscal
,dy22
);
1836 tz
= _mm256_mul_ps(fscal
,dz22
);
1838 /* Update vectorial force */
1839 fix2
= _mm256_add_ps(fix2
,tx
);
1840 fiy2
= _mm256_add_ps(fiy2
,ty
);
1841 fiz2
= _mm256_add_ps(fiz2
,tz
);
1843 fjx2
= _mm256_add_ps(fjx2
,tx
);
1844 fjy2
= _mm256_add_ps(fjy2
,ty
);
1845 fjz2
= _mm256_add_ps(fjz2
,tz
);
1847 /**************************
1848 * CALCULATE INTERACTIONS *
1849 **************************/
1851 r23
= _mm256_mul_ps(rsq23
,rinv23
);
1853 /* EWALD ELECTROSTATICS */
1855 /* Analytical PME correction */
1856 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
1857 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
1858 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1859 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1860 felec
= _mm256_mul_ps(qq23
,felec
);
1864 /* Calculate temporary vectorial force */
1865 tx
= _mm256_mul_ps(fscal
,dx23
);
1866 ty
= _mm256_mul_ps(fscal
,dy23
);
1867 tz
= _mm256_mul_ps(fscal
,dz23
);
1869 /* Update vectorial force */
1870 fix2
= _mm256_add_ps(fix2
,tx
);
1871 fiy2
= _mm256_add_ps(fiy2
,ty
);
1872 fiz2
= _mm256_add_ps(fiz2
,tz
);
1874 fjx3
= _mm256_add_ps(fjx3
,tx
);
1875 fjy3
= _mm256_add_ps(fjy3
,ty
);
1876 fjz3
= _mm256_add_ps(fjz3
,tz
);
1878 /**************************
1879 * CALCULATE INTERACTIONS *
1880 **************************/
1882 r31
= _mm256_mul_ps(rsq31
,rinv31
);
1884 /* EWALD ELECTROSTATICS */
1886 /* Analytical PME correction */
1887 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
1888 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
1889 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1890 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1891 felec
= _mm256_mul_ps(qq31
,felec
);
1895 /* Calculate temporary vectorial force */
1896 tx
= _mm256_mul_ps(fscal
,dx31
);
1897 ty
= _mm256_mul_ps(fscal
,dy31
);
1898 tz
= _mm256_mul_ps(fscal
,dz31
);
1900 /* Update vectorial force */
1901 fix3
= _mm256_add_ps(fix3
,tx
);
1902 fiy3
= _mm256_add_ps(fiy3
,ty
);
1903 fiz3
= _mm256_add_ps(fiz3
,tz
);
1905 fjx1
= _mm256_add_ps(fjx1
,tx
);
1906 fjy1
= _mm256_add_ps(fjy1
,ty
);
1907 fjz1
= _mm256_add_ps(fjz1
,tz
);
1909 /**************************
1910 * CALCULATE INTERACTIONS *
1911 **************************/
1913 r32
= _mm256_mul_ps(rsq32
,rinv32
);
1915 /* EWALD ELECTROSTATICS */
1917 /* Analytical PME correction */
1918 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
1919 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
1920 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1921 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1922 felec
= _mm256_mul_ps(qq32
,felec
);
1926 /* Calculate temporary vectorial force */
1927 tx
= _mm256_mul_ps(fscal
,dx32
);
1928 ty
= _mm256_mul_ps(fscal
,dy32
);
1929 tz
= _mm256_mul_ps(fscal
,dz32
);
1931 /* Update vectorial force */
1932 fix3
= _mm256_add_ps(fix3
,tx
);
1933 fiy3
= _mm256_add_ps(fiy3
,ty
);
1934 fiz3
= _mm256_add_ps(fiz3
,tz
);
1936 fjx2
= _mm256_add_ps(fjx2
,tx
);
1937 fjy2
= _mm256_add_ps(fjy2
,ty
);
1938 fjz2
= _mm256_add_ps(fjz2
,tz
);
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 r33
= _mm256_mul_ps(rsq33
,rinv33
);
1946 /* EWALD ELECTROSTATICS */
1948 /* Analytical PME correction */
1949 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
1950 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
1951 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
1952 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
1953 felec
= _mm256_mul_ps(qq33
,felec
);
1957 /* Calculate temporary vectorial force */
1958 tx
= _mm256_mul_ps(fscal
,dx33
);
1959 ty
= _mm256_mul_ps(fscal
,dy33
);
1960 tz
= _mm256_mul_ps(fscal
,dz33
);
1962 /* Update vectorial force */
1963 fix3
= _mm256_add_ps(fix3
,tx
);
1964 fiy3
= _mm256_add_ps(fiy3
,ty
);
1965 fiz3
= _mm256_add_ps(fiz3
,tz
);
1967 fjx3
= _mm256_add_ps(fjx3
,tx
);
1968 fjy3
= _mm256_add_ps(fjy3
,ty
);
1969 fjz3
= _mm256_add_ps(fjz3
,tz
);
1971 fjptrA
= f
+j_coord_offsetA
;
1972 fjptrB
= f
+j_coord_offsetB
;
1973 fjptrC
= f
+j_coord_offsetC
;
1974 fjptrD
= f
+j_coord_offsetD
;
1975 fjptrE
= f
+j_coord_offsetE
;
1976 fjptrF
= f
+j_coord_offsetF
;
1977 fjptrG
= f
+j_coord_offsetG
;
1978 fjptrH
= f
+j_coord_offsetH
;
1980 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
1981 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1982 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1984 /* Inner loop uses 553 flops */
1987 if(jidx
<j_index_end
)
1990 /* Get j neighbor index, and coordinate index */
1991 jnrlistA
= jjnr
[jidx
];
1992 jnrlistB
= jjnr
[jidx
+1];
1993 jnrlistC
= jjnr
[jidx
+2];
1994 jnrlistD
= jjnr
[jidx
+3];
1995 jnrlistE
= jjnr
[jidx
+4];
1996 jnrlistF
= jjnr
[jidx
+5];
1997 jnrlistG
= jjnr
[jidx
+6];
1998 jnrlistH
= jjnr
[jidx
+7];
1999 /* Sign of each element will be negative for non-real atoms.
2000 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2001 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2003 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
2004 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
2006 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2007 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2008 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2009 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2010 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
2011 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
2012 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
2013 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
2014 j_coord_offsetA
= DIM
*jnrA
;
2015 j_coord_offsetB
= DIM
*jnrB
;
2016 j_coord_offsetC
= DIM
*jnrC
;
2017 j_coord_offsetD
= DIM
*jnrD
;
2018 j_coord_offsetE
= DIM
*jnrE
;
2019 j_coord_offsetF
= DIM
*jnrF
;
2020 j_coord_offsetG
= DIM
*jnrG
;
2021 j_coord_offsetH
= DIM
*jnrH
;
2023 /* load j atom coordinates */
2024 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2025 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2026 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
2027 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
2028 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2029 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2031 /* Calculate displacement vector */
2032 dx00
= _mm256_sub_ps(ix0
,jx0
);
2033 dy00
= _mm256_sub_ps(iy0
,jy0
);
2034 dz00
= _mm256_sub_ps(iz0
,jz0
);
2035 dx11
= _mm256_sub_ps(ix1
,jx1
);
2036 dy11
= _mm256_sub_ps(iy1
,jy1
);
2037 dz11
= _mm256_sub_ps(iz1
,jz1
);
2038 dx12
= _mm256_sub_ps(ix1
,jx2
);
2039 dy12
= _mm256_sub_ps(iy1
,jy2
);
2040 dz12
= _mm256_sub_ps(iz1
,jz2
);
2041 dx13
= _mm256_sub_ps(ix1
,jx3
);
2042 dy13
= _mm256_sub_ps(iy1
,jy3
);
2043 dz13
= _mm256_sub_ps(iz1
,jz3
);
2044 dx21
= _mm256_sub_ps(ix2
,jx1
);
2045 dy21
= _mm256_sub_ps(iy2
,jy1
);
2046 dz21
= _mm256_sub_ps(iz2
,jz1
);
2047 dx22
= _mm256_sub_ps(ix2
,jx2
);
2048 dy22
= _mm256_sub_ps(iy2
,jy2
);
2049 dz22
= _mm256_sub_ps(iz2
,jz2
);
2050 dx23
= _mm256_sub_ps(ix2
,jx3
);
2051 dy23
= _mm256_sub_ps(iy2
,jy3
);
2052 dz23
= _mm256_sub_ps(iz2
,jz3
);
2053 dx31
= _mm256_sub_ps(ix3
,jx1
);
2054 dy31
= _mm256_sub_ps(iy3
,jy1
);
2055 dz31
= _mm256_sub_ps(iz3
,jz1
);
2056 dx32
= _mm256_sub_ps(ix3
,jx2
);
2057 dy32
= _mm256_sub_ps(iy3
,jy2
);
2058 dz32
= _mm256_sub_ps(iz3
,jz2
);
2059 dx33
= _mm256_sub_ps(ix3
,jx3
);
2060 dy33
= _mm256_sub_ps(iy3
,jy3
);
2061 dz33
= _mm256_sub_ps(iz3
,jz3
);
2063 /* Calculate squared distance and things based on it */
2064 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
2065 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
2066 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
2067 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
2068 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
2069 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
2070 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
2071 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
2072 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
2073 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
2075 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
2076 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
2077 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
2078 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
2079 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
2080 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
2081 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
2082 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
2083 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
2084 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
2086 rinvsq00
= _mm256_mul_ps(rinv00
,rinv00
);
2087 rinvsq11
= _mm256_mul_ps(rinv11
,rinv11
);
2088 rinvsq12
= _mm256_mul_ps(rinv12
,rinv12
);
2089 rinvsq13
= _mm256_mul_ps(rinv13
,rinv13
);
2090 rinvsq21
= _mm256_mul_ps(rinv21
,rinv21
);
2091 rinvsq22
= _mm256_mul_ps(rinv22
,rinv22
);
2092 rinvsq23
= _mm256_mul_ps(rinv23
,rinv23
);
2093 rinvsq31
= _mm256_mul_ps(rinv31
,rinv31
);
2094 rinvsq32
= _mm256_mul_ps(rinv32
,rinv32
);
2095 rinvsq33
= _mm256_mul_ps(rinv33
,rinv33
);
2097 fjx0
= _mm256_setzero_ps();
2098 fjy0
= _mm256_setzero_ps();
2099 fjz0
= _mm256_setzero_ps();
2100 fjx1
= _mm256_setzero_ps();
2101 fjy1
= _mm256_setzero_ps();
2102 fjz1
= _mm256_setzero_ps();
2103 fjx2
= _mm256_setzero_ps();
2104 fjy2
= _mm256_setzero_ps();
2105 fjz2
= _mm256_setzero_ps();
2106 fjx3
= _mm256_setzero_ps();
2107 fjy3
= _mm256_setzero_ps();
2108 fjz3
= _mm256_setzero_ps();
2110 /**************************
2111 * CALCULATE INTERACTIONS *
2112 **************************/
2114 r00
= _mm256_mul_ps(rsq00
,rinv00
);
2115 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
2117 /* Analytical LJ-PME */
2118 rinvsix
= _mm256_mul_ps(_mm256_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
2119 ewcljrsq
= _mm256_mul_ps(ewclj2
,rsq00
);
2120 ewclj6
= _mm256_mul_ps(ewclj2
,_mm256_mul_ps(ewclj2
,ewclj2
));
2121 exponent
= gmx_simd_exp_r(ewcljrsq
);
2122 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2123 poly
= _mm256_mul_ps(exponent
,_mm256_add_ps(_mm256_sub_ps(one
,ewcljrsq
),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq
,ewcljrsq
),one_half
)));
2124 /* f6A = 6 * C6grid * (1 - poly) */
2125 f6A
= _mm256_mul_ps(c6grid_00
,_mm256_sub_ps(one
,poly
));
2126 /* f6B = C6grid * exponent * beta^6 */
2127 f6B
= _mm256_mul_ps(_mm256_mul_ps(c6grid_00
,one_sixth
),_mm256_mul_ps(exponent
,ewclj6
));
2128 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2129 fvdw
= _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00
,rinvsix
),_mm256_sub_ps(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
2133 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2135 /* Calculate temporary vectorial force */
2136 tx
= _mm256_mul_ps(fscal
,dx00
);
2137 ty
= _mm256_mul_ps(fscal
,dy00
);
2138 tz
= _mm256_mul_ps(fscal
,dz00
);
2140 /* Update vectorial force */
2141 fix0
= _mm256_add_ps(fix0
,tx
);
2142 fiy0
= _mm256_add_ps(fiy0
,ty
);
2143 fiz0
= _mm256_add_ps(fiz0
,tz
);
2145 fjx0
= _mm256_add_ps(fjx0
,tx
);
2146 fjy0
= _mm256_add_ps(fjy0
,ty
);
2147 fjz0
= _mm256_add_ps(fjz0
,tz
);
2149 /**************************
2150 * CALCULATE INTERACTIONS *
2151 **************************/
2153 r11
= _mm256_mul_ps(rsq11
,rinv11
);
2154 r11
= _mm256_andnot_ps(dummy_mask
,r11
);
2156 /* EWALD ELECTROSTATICS */
2158 /* Analytical PME correction */
2159 zeta2
= _mm256_mul_ps(beta2
,rsq11
);
2160 rinv3
= _mm256_mul_ps(rinvsq11
,rinv11
);
2161 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2162 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2163 felec
= _mm256_mul_ps(qq11
,felec
);
2167 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2169 /* Calculate temporary vectorial force */
2170 tx
= _mm256_mul_ps(fscal
,dx11
);
2171 ty
= _mm256_mul_ps(fscal
,dy11
);
2172 tz
= _mm256_mul_ps(fscal
,dz11
);
2174 /* Update vectorial force */
2175 fix1
= _mm256_add_ps(fix1
,tx
);
2176 fiy1
= _mm256_add_ps(fiy1
,ty
);
2177 fiz1
= _mm256_add_ps(fiz1
,tz
);
2179 fjx1
= _mm256_add_ps(fjx1
,tx
);
2180 fjy1
= _mm256_add_ps(fjy1
,ty
);
2181 fjz1
= _mm256_add_ps(fjz1
,tz
);
2183 /**************************
2184 * CALCULATE INTERACTIONS *
2185 **************************/
2187 r12
= _mm256_mul_ps(rsq12
,rinv12
);
2188 r12
= _mm256_andnot_ps(dummy_mask
,r12
);
2190 /* EWALD ELECTROSTATICS */
2192 /* Analytical PME correction */
2193 zeta2
= _mm256_mul_ps(beta2
,rsq12
);
2194 rinv3
= _mm256_mul_ps(rinvsq12
,rinv12
);
2195 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2196 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2197 felec
= _mm256_mul_ps(qq12
,felec
);
2201 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2203 /* Calculate temporary vectorial force */
2204 tx
= _mm256_mul_ps(fscal
,dx12
);
2205 ty
= _mm256_mul_ps(fscal
,dy12
);
2206 tz
= _mm256_mul_ps(fscal
,dz12
);
2208 /* Update vectorial force */
2209 fix1
= _mm256_add_ps(fix1
,tx
);
2210 fiy1
= _mm256_add_ps(fiy1
,ty
);
2211 fiz1
= _mm256_add_ps(fiz1
,tz
);
2213 fjx2
= _mm256_add_ps(fjx2
,tx
);
2214 fjy2
= _mm256_add_ps(fjy2
,ty
);
2215 fjz2
= _mm256_add_ps(fjz2
,tz
);
2217 /**************************
2218 * CALCULATE INTERACTIONS *
2219 **************************/
2221 r13
= _mm256_mul_ps(rsq13
,rinv13
);
2222 r13
= _mm256_andnot_ps(dummy_mask
,r13
);
2224 /* EWALD ELECTROSTATICS */
2226 /* Analytical PME correction */
2227 zeta2
= _mm256_mul_ps(beta2
,rsq13
);
2228 rinv3
= _mm256_mul_ps(rinvsq13
,rinv13
);
2229 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2230 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2231 felec
= _mm256_mul_ps(qq13
,felec
);
2235 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2237 /* Calculate temporary vectorial force */
2238 tx
= _mm256_mul_ps(fscal
,dx13
);
2239 ty
= _mm256_mul_ps(fscal
,dy13
);
2240 tz
= _mm256_mul_ps(fscal
,dz13
);
2242 /* Update vectorial force */
2243 fix1
= _mm256_add_ps(fix1
,tx
);
2244 fiy1
= _mm256_add_ps(fiy1
,ty
);
2245 fiz1
= _mm256_add_ps(fiz1
,tz
);
2247 fjx3
= _mm256_add_ps(fjx3
,tx
);
2248 fjy3
= _mm256_add_ps(fjy3
,ty
);
2249 fjz3
= _mm256_add_ps(fjz3
,tz
);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 r21
= _mm256_mul_ps(rsq21
,rinv21
);
2256 r21
= _mm256_andnot_ps(dummy_mask
,r21
);
2258 /* EWALD ELECTROSTATICS */
2260 /* Analytical PME correction */
2261 zeta2
= _mm256_mul_ps(beta2
,rsq21
);
2262 rinv3
= _mm256_mul_ps(rinvsq21
,rinv21
);
2263 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2264 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2265 felec
= _mm256_mul_ps(qq21
,felec
);
2269 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2271 /* Calculate temporary vectorial force */
2272 tx
= _mm256_mul_ps(fscal
,dx21
);
2273 ty
= _mm256_mul_ps(fscal
,dy21
);
2274 tz
= _mm256_mul_ps(fscal
,dz21
);
2276 /* Update vectorial force */
2277 fix2
= _mm256_add_ps(fix2
,tx
);
2278 fiy2
= _mm256_add_ps(fiy2
,ty
);
2279 fiz2
= _mm256_add_ps(fiz2
,tz
);
2281 fjx1
= _mm256_add_ps(fjx1
,tx
);
2282 fjy1
= _mm256_add_ps(fjy1
,ty
);
2283 fjz1
= _mm256_add_ps(fjz1
,tz
);
2285 /**************************
2286 * CALCULATE INTERACTIONS *
2287 **************************/
2289 r22
= _mm256_mul_ps(rsq22
,rinv22
);
2290 r22
= _mm256_andnot_ps(dummy_mask
,r22
);
2292 /* EWALD ELECTROSTATICS */
2294 /* Analytical PME correction */
2295 zeta2
= _mm256_mul_ps(beta2
,rsq22
);
2296 rinv3
= _mm256_mul_ps(rinvsq22
,rinv22
);
2297 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2298 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2299 felec
= _mm256_mul_ps(qq22
,felec
);
2303 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2305 /* Calculate temporary vectorial force */
2306 tx
= _mm256_mul_ps(fscal
,dx22
);
2307 ty
= _mm256_mul_ps(fscal
,dy22
);
2308 tz
= _mm256_mul_ps(fscal
,dz22
);
2310 /* Update vectorial force */
2311 fix2
= _mm256_add_ps(fix2
,tx
);
2312 fiy2
= _mm256_add_ps(fiy2
,ty
);
2313 fiz2
= _mm256_add_ps(fiz2
,tz
);
2315 fjx2
= _mm256_add_ps(fjx2
,tx
);
2316 fjy2
= _mm256_add_ps(fjy2
,ty
);
2317 fjz2
= _mm256_add_ps(fjz2
,tz
);
2319 /**************************
2320 * CALCULATE INTERACTIONS *
2321 **************************/
2323 r23
= _mm256_mul_ps(rsq23
,rinv23
);
2324 r23
= _mm256_andnot_ps(dummy_mask
,r23
);
2326 /* EWALD ELECTROSTATICS */
2328 /* Analytical PME correction */
2329 zeta2
= _mm256_mul_ps(beta2
,rsq23
);
2330 rinv3
= _mm256_mul_ps(rinvsq23
,rinv23
);
2331 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2332 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2333 felec
= _mm256_mul_ps(qq23
,felec
);
2337 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2339 /* Calculate temporary vectorial force */
2340 tx
= _mm256_mul_ps(fscal
,dx23
);
2341 ty
= _mm256_mul_ps(fscal
,dy23
);
2342 tz
= _mm256_mul_ps(fscal
,dz23
);
2344 /* Update vectorial force */
2345 fix2
= _mm256_add_ps(fix2
,tx
);
2346 fiy2
= _mm256_add_ps(fiy2
,ty
);
2347 fiz2
= _mm256_add_ps(fiz2
,tz
);
2349 fjx3
= _mm256_add_ps(fjx3
,tx
);
2350 fjy3
= _mm256_add_ps(fjy3
,ty
);
2351 fjz3
= _mm256_add_ps(fjz3
,tz
);
2353 /**************************
2354 * CALCULATE INTERACTIONS *
2355 **************************/
2357 r31
= _mm256_mul_ps(rsq31
,rinv31
);
2358 r31
= _mm256_andnot_ps(dummy_mask
,r31
);
2360 /* EWALD ELECTROSTATICS */
2362 /* Analytical PME correction */
2363 zeta2
= _mm256_mul_ps(beta2
,rsq31
);
2364 rinv3
= _mm256_mul_ps(rinvsq31
,rinv31
);
2365 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2366 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2367 felec
= _mm256_mul_ps(qq31
,felec
);
2371 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2373 /* Calculate temporary vectorial force */
2374 tx
= _mm256_mul_ps(fscal
,dx31
);
2375 ty
= _mm256_mul_ps(fscal
,dy31
);
2376 tz
= _mm256_mul_ps(fscal
,dz31
);
2378 /* Update vectorial force */
2379 fix3
= _mm256_add_ps(fix3
,tx
);
2380 fiy3
= _mm256_add_ps(fiy3
,ty
);
2381 fiz3
= _mm256_add_ps(fiz3
,tz
);
2383 fjx1
= _mm256_add_ps(fjx1
,tx
);
2384 fjy1
= _mm256_add_ps(fjy1
,ty
);
2385 fjz1
= _mm256_add_ps(fjz1
,tz
);
2387 /**************************
2388 * CALCULATE INTERACTIONS *
2389 **************************/
2391 r32
= _mm256_mul_ps(rsq32
,rinv32
);
2392 r32
= _mm256_andnot_ps(dummy_mask
,r32
);
2394 /* EWALD ELECTROSTATICS */
2396 /* Analytical PME correction */
2397 zeta2
= _mm256_mul_ps(beta2
,rsq32
);
2398 rinv3
= _mm256_mul_ps(rinvsq32
,rinv32
);
2399 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2400 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2401 felec
= _mm256_mul_ps(qq32
,felec
);
2405 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2407 /* Calculate temporary vectorial force */
2408 tx
= _mm256_mul_ps(fscal
,dx32
);
2409 ty
= _mm256_mul_ps(fscal
,dy32
);
2410 tz
= _mm256_mul_ps(fscal
,dz32
);
2412 /* Update vectorial force */
2413 fix3
= _mm256_add_ps(fix3
,tx
);
2414 fiy3
= _mm256_add_ps(fiy3
,ty
);
2415 fiz3
= _mm256_add_ps(fiz3
,tz
);
2417 fjx2
= _mm256_add_ps(fjx2
,tx
);
2418 fjy2
= _mm256_add_ps(fjy2
,ty
);
2419 fjz2
= _mm256_add_ps(fjz2
,tz
);
2421 /**************************
2422 * CALCULATE INTERACTIONS *
2423 **************************/
2425 r33
= _mm256_mul_ps(rsq33
,rinv33
);
2426 r33
= _mm256_andnot_ps(dummy_mask
,r33
);
2428 /* EWALD ELECTROSTATICS */
2430 /* Analytical PME correction */
2431 zeta2
= _mm256_mul_ps(beta2
,rsq33
);
2432 rinv3
= _mm256_mul_ps(rinvsq33
,rinv33
);
2433 pmecorrF
= gmx_mm256_pmecorrF_ps(zeta2
);
2434 felec
= _mm256_add_ps( _mm256_mul_ps(pmecorrF
,beta3
), rinv3
);
2435 felec
= _mm256_mul_ps(qq33
,felec
);
2439 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2441 /* Calculate temporary vectorial force */
2442 tx
= _mm256_mul_ps(fscal
,dx33
);
2443 ty
= _mm256_mul_ps(fscal
,dy33
);
2444 tz
= _mm256_mul_ps(fscal
,dz33
);
2446 /* Update vectorial force */
2447 fix3
= _mm256_add_ps(fix3
,tx
);
2448 fiy3
= _mm256_add_ps(fiy3
,ty
);
2449 fiz3
= _mm256_add_ps(fiz3
,tz
);
2451 fjx3
= _mm256_add_ps(fjx3
,tx
);
2452 fjy3
= _mm256_add_ps(fjy3
,ty
);
2453 fjz3
= _mm256_add_ps(fjz3
,tz
);
2455 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2456 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2457 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2458 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2459 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
2460 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
2461 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
2462 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
2464 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
2465 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2466 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2468 /* Inner loop uses 563 flops */
2471 /* End of innermost loop */
2473 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2474 f
+i_coord_offset
,fshift
+i_shift_offset
);
2476 /* Increment number of inner iterations */
2477 inneriter
+= j_index_end
- j_index_start
;
2479 /* Outer loop uses 24 flops */
2482 /* Increment number of outer iterations */
2485 /* Update outer/inner flops */
2487 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*563);