2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LJEwald
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_256_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 real
* vdwioffsetptr0
;
84 real
* vdwgridioffsetptr0
;
85 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
86 real
* vdwioffsetptr1
;
87 real
* vdwgridioffsetptr1
;
88 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 real
* vdwioffsetptr2
;
90 real
* vdwgridioffsetptr2
;
91 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
92 real
* vdwioffsetptr3
;
93 real
* vdwgridioffsetptr3
;
94 __m256d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
95 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
96 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
97 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
98 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
99 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
100 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
101 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
102 __m256d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
103 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
104 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
105 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
106 __m256d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
107 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
108 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
109 __m256d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
110 __m256d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
111 __m256d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
112 __m256d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
113 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
116 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
119 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
120 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
132 __m256d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
133 __m256d one_half
= _mm256_set1_pd(0.5);
134 __m256d minus_one
= _mm256_set1_pd(-1.0);
136 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
137 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
139 __m256d dummy_mask
,cutoff_mask
;
140 __m128 tmpmask0
,tmpmask1
;
141 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
142 __m256d one
= _mm256_set1_pd(1.0);
143 __m256d two
= _mm256_set1_pd(2.0);
149 jindex
= nlist
->jindex
;
151 shiftidx
= nlist
->shift
;
153 shiftvec
= fr
->shift_vec
[0];
154 fshift
= fr
->fshift
[0];
155 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
156 charge
= mdatoms
->chargeA
;
157 nvdwtype
= fr
->ntype
;
159 vdwtype
= mdatoms
->typeA
;
160 vdwgridparam
= fr
->ljpme_c6grid
;
161 sh_lj_ewald
= _mm256_set1_pd(fr
->ic
->sh_lj_ewald
);
162 ewclj
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_lj
);
163 ewclj2
= _mm256_mul_pd(minus_one
,_mm256_mul_pd(ewclj
,ewclj
));
165 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
166 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
167 beta2
= _mm256_mul_pd(beta
,beta
);
168 beta3
= _mm256_mul_pd(beta
,beta2
);
170 ewtab
= fr
->ic
->tabq_coul_FDV0
;
171 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
172 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
174 /* Setup water-specific parameters */
175 inr
= nlist
->iinr
[0];
176 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
177 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
178 iq3
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+3]));
179 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
180 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
182 jq1
= _mm256_set1_pd(charge
[inr
+1]);
183 jq2
= _mm256_set1_pd(charge
[inr
+2]);
184 jq3
= _mm256_set1_pd(charge
[inr
+3]);
185 vdwjidx0A
= 2*vdwtype
[inr
+0];
186 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
187 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
188 c6grid_00
= _mm256_set1_pd(vdwgridioffsetptr0
[vdwjidx0A
]);
189 qq11
= _mm256_mul_pd(iq1
,jq1
);
190 qq12
= _mm256_mul_pd(iq1
,jq2
);
191 qq13
= _mm256_mul_pd(iq1
,jq3
);
192 qq21
= _mm256_mul_pd(iq2
,jq1
);
193 qq22
= _mm256_mul_pd(iq2
,jq2
);
194 qq23
= _mm256_mul_pd(iq2
,jq3
);
195 qq31
= _mm256_mul_pd(iq3
,jq1
);
196 qq32
= _mm256_mul_pd(iq3
,jq2
);
197 qq33
= _mm256_mul_pd(iq3
,jq3
);
199 /* Avoid stupid compiler warnings */
200 jnrA
= jnrB
= jnrC
= jnrD
= 0;
209 for(iidx
=0;iidx
<4*DIM
;iidx
++)
214 /* Start outer loop over neighborlists */
215 for(iidx
=0; iidx
<nri
; iidx
++)
217 /* Load shift vector for this list */
218 i_shift_offset
= DIM
*shiftidx
[iidx
];
220 /* Load limits for loop over neighbors */
221 j_index_start
= jindex
[iidx
];
222 j_index_end
= jindex
[iidx
+1];
224 /* Get outer coordinate index */
226 i_coord_offset
= DIM
*inr
;
228 /* Load i particle coords and add shift vector */
229 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
230 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
232 fix0
= _mm256_setzero_pd();
233 fiy0
= _mm256_setzero_pd();
234 fiz0
= _mm256_setzero_pd();
235 fix1
= _mm256_setzero_pd();
236 fiy1
= _mm256_setzero_pd();
237 fiz1
= _mm256_setzero_pd();
238 fix2
= _mm256_setzero_pd();
239 fiy2
= _mm256_setzero_pd();
240 fiz2
= _mm256_setzero_pd();
241 fix3
= _mm256_setzero_pd();
242 fiy3
= _mm256_setzero_pd();
243 fiz3
= _mm256_setzero_pd();
245 /* Reset potential sums */
246 velecsum
= _mm256_setzero_pd();
247 vvdwsum
= _mm256_setzero_pd();
249 /* Start inner kernel loop */
250 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
253 /* Get j neighbor index, and coordinate index */
258 j_coord_offsetA
= DIM
*jnrA
;
259 j_coord_offsetB
= DIM
*jnrB
;
260 j_coord_offsetC
= DIM
*jnrC
;
261 j_coord_offsetD
= DIM
*jnrD
;
263 /* load j atom coordinates */
264 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
265 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
266 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
267 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
269 /* Calculate displacement vector */
270 dx00
= _mm256_sub_pd(ix0
,jx0
);
271 dy00
= _mm256_sub_pd(iy0
,jy0
);
272 dz00
= _mm256_sub_pd(iz0
,jz0
);
273 dx11
= _mm256_sub_pd(ix1
,jx1
);
274 dy11
= _mm256_sub_pd(iy1
,jy1
);
275 dz11
= _mm256_sub_pd(iz1
,jz1
);
276 dx12
= _mm256_sub_pd(ix1
,jx2
);
277 dy12
= _mm256_sub_pd(iy1
,jy2
);
278 dz12
= _mm256_sub_pd(iz1
,jz2
);
279 dx13
= _mm256_sub_pd(ix1
,jx3
);
280 dy13
= _mm256_sub_pd(iy1
,jy3
);
281 dz13
= _mm256_sub_pd(iz1
,jz3
);
282 dx21
= _mm256_sub_pd(ix2
,jx1
);
283 dy21
= _mm256_sub_pd(iy2
,jy1
);
284 dz21
= _mm256_sub_pd(iz2
,jz1
);
285 dx22
= _mm256_sub_pd(ix2
,jx2
);
286 dy22
= _mm256_sub_pd(iy2
,jy2
);
287 dz22
= _mm256_sub_pd(iz2
,jz2
);
288 dx23
= _mm256_sub_pd(ix2
,jx3
);
289 dy23
= _mm256_sub_pd(iy2
,jy3
);
290 dz23
= _mm256_sub_pd(iz2
,jz3
);
291 dx31
= _mm256_sub_pd(ix3
,jx1
);
292 dy31
= _mm256_sub_pd(iy3
,jy1
);
293 dz31
= _mm256_sub_pd(iz3
,jz1
);
294 dx32
= _mm256_sub_pd(ix3
,jx2
);
295 dy32
= _mm256_sub_pd(iy3
,jy2
);
296 dz32
= _mm256_sub_pd(iz3
,jz2
);
297 dx33
= _mm256_sub_pd(ix3
,jx3
);
298 dy33
= _mm256_sub_pd(iy3
,jy3
);
299 dz33
= _mm256_sub_pd(iz3
,jz3
);
301 /* Calculate squared distance and things based on it */
302 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
303 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
304 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
305 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
306 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
307 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
308 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
309 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
310 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
311 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
313 rinv00
= avx256_invsqrt_d(rsq00
);
314 rinv11
= avx256_invsqrt_d(rsq11
);
315 rinv12
= avx256_invsqrt_d(rsq12
);
316 rinv13
= avx256_invsqrt_d(rsq13
);
317 rinv21
= avx256_invsqrt_d(rsq21
);
318 rinv22
= avx256_invsqrt_d(rsq22
);
319 rinv23
= avx256_invsqrt_d(rsq23
);
320 rinv31
= avx256_invsqrt_d(rsq31
);
321 rinv32
= avx256_invsqrt_d(rsq32
);
322 rinv33
= avx256_invsqrt_d(rsq33
);
324 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
325 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
326 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
327 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
328 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
329 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
330 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
331 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
332 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
333 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
335 fjx0
= _mm256_setzero_pd();
336 fjy0
= _mm256_setzero_pd();
337 fjz0
= _mm256_setzero_pd();
338 fjx1
= _mm256_setzero_pd();
339 fjy1
= _mm256_setzero_pd();
340 fjz1
= _mm256_setzero_pd();
341 fjx2
= _mm256_setzero_pd();
342 fjy2
= _mm256_setzero_pd();
343 fjz2
= _mm256_setzero_pd();
344 fjx3
= _mm256_setzero_pd();
345 fjy3
= _mm256_setzero_pd();
346 fjz3
= _mm256_setzero_pd();
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 r00
= _mm256_mul_pd(rsq00
,rinv00
);
354 /* Analytical LJ-PME */
355 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
356 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
357 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
358 exponent
= avx256_exp_d(ewcljrsq
);
359 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
360 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
361 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
362 vvdw6
= _mm256_mul_pd(_mm256_sub_pd(c6_00
,_mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
))),rinvsix
);
363 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
364 vvdw
= _mm256_sub_pd(_mm256_mul_pd(vvdw12
,one_twelfth
),_mm256_mul_pd(vvdw6
,one_sixth
));
365 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
366 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,_mm256_sub_pd(vvdw6
,_mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
373 /* Calculate temporary vectorial force */
374 tx
= _mm256_mul_pd(fscal
,dx00
);
375 ty
= _mm256_mul_pd(fscal
,dy00
);
376 tz
= _mm256_mul_pd(fscal
,dz00
);
378 /* Update vectorial force */
379 fix0
= _mm256_add_pd(fix0
,tx
);
380 fiy0
= _mm256_add_pd(fiy0
,ty
);
381 fiz0
= _mm256_add_pd(fiz0
,tz
);
383 fjx0
= _mm256_add_pd(fjx0
,tx
);
384 fjy0
= _mm256_add_pd(fjy0
,ty
);
385 fjz0
= _mm256_add_pd(fjz0
,tz
);
387 /**************************
388 * CALCULATE INTERACTIONS *
389 **************************/
391 r11
= _mm256_mul_pd(rsq11
,rinv11
);
393 /* EWALD ELECTROSTATICS */
395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
397 ewitab
= _mm256_cvttpd_epi32(ewrt
);
398 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
399 ewitab
= _mm_slli_epi32(ewitab
,2);
400 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
401 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
402 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
403 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
404 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
405 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
406 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
407 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(rinv11
,velec
));
408 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
410 /* Update potential sum for this i atom from the interaction with this j atom. */
411 velecsum
= _mm256_add_pd(velecsum
,velec
);
415 /* Calculate temporary vectorial force */
416 tx
= _mm256_mul_pd(fscal
,dx11
);
417 ty
= _mm256_mul_pd(fscal
,dy11
);
418 tz
= _mm256_mul_pd(fscal
,dz11
);
420 /* Update vectorial force */
421 fix1
= _mm256_add_pd(fix1
,tx
);
422 fiy1
= _mm256_add_pd(fiy1
,ty
);
423 fiz1
= _mm256_add_pd(fiz1
,tz
);
425 fjx1
= _mm256_add_pd(fjx1
,tx
);
426 fjy1
= _mm256_add_pd(fjy1
,ty
);
427 fjz1
= _mm256_add_pd(fjz1
,tz
);
429 /**************************
430 * CALCULATE INTERACTIONS *
431 **************************/
433 r12
= _mm256_mul_pd(rsq12
,rinv12
);
435 /* EWALD ELECTROSTATICS */
437 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
438 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
439 ewitab
= _mm256_cvttpd_epi32(ewrt
);
440 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
441 ewitab
= _mm_slli_epi32(ewitab
,2);
442 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
443 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
444 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
445 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
446 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
447 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
448 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
449 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(rinv12
,velec
));
450 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
452 /* Update potential sum for this i atom from the interaction with this j atom. */
453 velecsum
= _mm256_add_pd(velecsum
,velec
);
457 /* Calculate temporary vectorial force */
458 tx
= _mm256_mul_pd(fscal
,dx12
);
459 ty
= _mm256_mul_pd(fscal
,dy12
);
460 tz
= _mm256_mul_pd(fscal
,dz12
);
462 /* Update vectorial force */
463 fix1
= _mm256_add_pd(fix1
,tx
);
464 fiy1
= _mm256_add_pd(fiy1
,ty
);
465 fiz1
= _mm256_add_pd(fiz1
,tz
);
467 fjx2
= _mm256_add_pd(fjx2
,tx
);
468 fjy2
= _mm256_add_pd(fjy2
,ty
);
469 fjz2
= _mm256_add_pd(fjz2
,tz
);
471 /**************************
472 * CALCULATE INTERACTIONS *
473 **************************/
475 r13
= _mm256_mul_pd(rsq13
,rinv13
);
477 /* EWALD ELECTROSTATICS */
479 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
480 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
481 ewitab
= _mm256_cvttpd_epi32(ewrt
);
482 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
483 ewitab
= _mm_slli_epi32(ewitab
,2);
484 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
485 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
486 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
487 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
488 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
489 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
490 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
491 velec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(rinv13
,velec
));
492 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velecsum
= _mm256_add_pd(velecsum
,velec
);
499 /* Calculate temporary vectorial force */
500 tx
= _mm256_mul_pd(fscal
,dx13
);
501 ty
= _mm256_mul_pd(fscal
,dy13
);
502 tz
= _mm256_mul_pd(fscal
,dz13
);
504 /* Update vectorial force */
505 fix1
= _mm256_add_pd(fix1
,tx
);
506 fiy1
= _mm256_add_pd(fiy1
,ty
);
507 fiz1
= _mm256_add_pd(fiz1
,tz
);
509 fjx3
= _mm256_add_pd(fjx3
,tx
);
510 fjy3
= _mm256_add_pd(fjy3
,ty
);
511 fjz3
= _mm256_add_pd(fjz3
,tz
);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 r21
= _mm256_mul_pd(rsq21
,rinv21
);
519 /* EWALD ELECTROSTATICS */
521 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
522 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
523 ewitab
= _mm256_cvttpd_epi32(ewrt
);
524 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
525 ewitab
= _mm_slli_epi32(ewitab
,2);
526 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
527 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
528 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
529 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
530 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
531 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
532 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
533 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(rinv21
,velec
));
534 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum
= _mm256_add_pd(velecsum
,velec
);
541 /* Calculate temporary vectorial force */
542 tx
= _mm256_mul_pd(fscal
,dx21
);
543 ty
= _mm256_mul_pd(fscal
,dy21
);
544 tz
= _mm256_mul_pd(fscal
,dz21
);
546 /* Update vectorial force */
547 fix2
= _mm256_add_pd(fix2
,tx
);
548 fiy2
= _mm256_add_pd(fiy2
,ty
);
549 fiz2
= _mm256_add_pd(fiz2
,tz
);
551 fjx1
= _mm256_add_pd(fjx1
,tx
);
552 fjy1
= _mm256_add_pd(fjy1
,ty
);
553 fjz1
= _mm256_add_pd(fjz1
,tz
);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r22
= _mm256_mul_pd(rsq22
,rinv22
);
561 /* EWALD ELECTROSTATICS */
563 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
564 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
565 ewitab
= _mm256_cvttpd_epi32(ewrt
);
566 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
567 ewitab
= _mm_slli_epi32(ewitab
,2);
568 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
569 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
570 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
571 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
572 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
573 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
574 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
575 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(rinv22
,velec
));
576 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
578 /* Update potential sum for this i atom from the interaction with this j atom. */
579 velecsum
= _mm256_add_pd(velecsum
,velec
);
583 /* Calculate temporary vectorial force */
584 tx
= _mm256_mul_pd(fscal
,dx22
);
585 ty
= _mm256_mul_pd(fscal
,dy22
);
586 tz
= _mm256_mul_pd(fscal
,dz22
);
588 /* Update vectorial force */
589 fix2
= _mm256_add_pd(fix2
,tx
);
590 fiy2
= _mm256_add_pd(fiy2
,ty
);
591 fiz2
= _mm256_add_pd(fiz2
,tz
);
593 fjx2
= _mm256_add_pd(fjx2
,tx
);
594 fjy2
= _mm256_add_pd(fjy2
,ty
);
595 fjz2
= _mm256_add_pd(fjz2
,tz
);
597 /**************************
598 * CALCULATE INTERACTIONS *
599 **************************/
601 r23
= _mm256_mul_pd(rsq23
,rinv23
);
603 /* EWALD ELECTROSTATICS */
605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
606 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
607 ewitab
= _mm256_cvttpd_epi32(ewrt
);
608 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
609 ewitab
= _mm_slli_epi32(ewitab
,2);
610 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
611 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
612 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
613 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
614 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
615 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
616 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
617 velec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(rinv23
,velec
));
618 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum
= _mm256_add_pd(velecsum
,velec
);
625 /* Calculate temporary vectorial force */
626 tx
= _mm256_mul_pd(fscal
,dx23
);
627 ty
= _mm256_mul_pd(fscal
,dy23
);
628 tz
= _mm256_mul_pd(fscal
,dz23
);
630 /* Update vectorial force */
631 fix2
= _mm256_add_pd(fix2
,tx
);
632 fiy2
= _mm256_add_pd(fiy2
,ty
);
633 fiz2
= _mm256_add_pd(fiz2
,tz
);
635 fjx3
= _mm256_add_pd(fjx3
,tx
);
636 fjy3
= _mm256_add_pd(fjy3
,ty
);
637 fjz3
= _mm256_add_pd(fjz3
,tz
);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 r31
= _mm256_mul_pd(rsq31
,rinv31
);
645 /* EWALD ELECTROSTATICS */
647 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
648 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
649 ewitab
= _mm256_cvttpd_epi32(ewrt
);
650 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
651 ewitab
= _mm_slli_epi32(ewitab
,2);
652 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
653 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
654 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
655 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
656 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
657 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
658 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
659 velec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(rinv31
,velec
));
660 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
662 /* Update potential sum for this i atom from the interaction with this j atom. */
663 velecsum
= _mm256_add_pd(velecsum
,velec
);
667 /* Calculate temporary vectorial force */
668 tx
= _mm256_mul_pd(fscal
,dx31
);
669 ty
= _mm256_mul_pd(fscal
,dy31
);
670 tz
= _mm256_mul_pd(fscal
,dz31
);
672 /* Update vectorial force */
673 fix3
= _mm256_add_pd(fix3
,tx
);
674 fiy3
= _mm256_add_pd(fiy3
,ty
);
675 fiz3
= _mm256_add_pd(fiz3
,tz
);
677 fjx1
= _mm256_add_pd(fjx1
,tx
);
678 fjy1
= _mm256_add_pd(fjy1
,ty
);
679 fjz1
= _mm256_add_pd(fjz1
,tz
);
681 /**************************
682 * CALCULATE INTERACTIONS *
683 **************************/
685 r32
= _mm256_mul_pd(rsq32
,rinv32
);
687 /* EWALD ELECTROSTATICS */
689 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
690 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
691 ewitab
= _mm256_cvttpd_epi32(ewrt
);
692 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
693 ewitab
= _mm_slli_epi32(ewitab
,2);
694 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
695 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
696 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
697 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
698 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
699 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
700 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
701 velec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(rinv32
,velec
));
702 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
704 /* Update potential sum for this i atom from the interaction with this j atom. */
705 velecsum
= _mm256_add_pd(velecsum
,velec
);
709 /* Calculate temporary vectorial force */
710 tx
= _mm256_mul_pd(fscal
,dx32
);
711 ty
= _mm256_mul_pd(fscal
,dy32
);
712 tz
= _mm256_mul_pd(fscal
,dz32
);
714 /* Update vectorial force */
715 fix3
= _mm256_add_pd(fix3
,tx
);
716 fiy3
= _mm256_add_pd(fiy3
,ty
);
717 fiz3
= _mm256_add_pd(fiz3
,tz
);
719 fjx2
= _mm256_add_pd(fjx2
,tx
);
720 fjy2
= _mm256_add_pd(fjy2
,ty
);
721 fjz2
= _mm256_add_pd(fjz2
,tz
);
723 /**************************
724 * CALCULATE INTERACTIONS *
725 **************************/
727 r33
= _mm256_mul_pd(rsq33
,rinv33
);
729 /* EWALD ELECTROSTATICS */
731 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
732 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
733 ewitab
= _mm256_cvttpd_epi32(ewrt
);
734 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
735 ewitab
= _mm_slli_epi32(ewitab
,2);
736 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
737 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
738 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
739 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
740 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
741 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
742 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
743 velec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(rinv33
,velec
));
744 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velecsum
= _mm256_add_pd(velecsum
,velec
);
751 /* Calculate temporary vectorial force */
752 tx
= _mm256_mul_pd(fscal
,dx33
);
753 ty
= _mm256_mul_pd(fscal
,dy33
);
754 tz
= _mm256_mul_pd(fscal
,dz33
);
756 /* Update vectorial force */
757 fix3
= _mm256_add_pd(fix3
,tx
);
758 fiy3
= _mm256_add_pd(fiy3
,ty
);
759 fiz3
= _mm256_add_pd(fiz3
,tz
);
761 fjx3
= _mm256_add_pd(fjx3
,tx
);
762 fjy3
= _mm256_add_pd(fjy3
,ty
);
763 fjz3
= _mm256_add_pd(fjz3
,tz
);
765 fjptrA
= f
+j_coord_offsetA
;
766 fjptrB
= f
+j_coord_offsetB
;
767 fjptrC
= f
+j_coord_offsetC
;
768 fjptrD
= f
+j_coord_offsetD
;
770 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
771 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
772 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
774 /* Inner loop uses 426 flops */
780 /* Get j neighbor index, and coordinate index */
781 jnrlistA
= jjnr
[jidx
];
782 jnrlistB
= jjnr
[jidx
+1];
783 jnrlistC
= jjnr
[jidx
+2];
784 jnrlistD
= jjnr
[jidx
+3];
785 /* Sign of each element will be negative for non-real atoms.
786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
787 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
789 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
791 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
792 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
793 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
795 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
796 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
797 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
798 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
799 j_coord_offsetA
= DIM
*jnrA
;
800 j_coord_offsetB
= DIM
*jnrB
;
801 j_coord_offsetC
= DIM
*jnrC
;
802 j_coord_offsetD
= DIM
*jnrD
;
804 /* load j atom coordinates */
805 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
806 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
807 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
808 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
810 /* Calculate displacement vector */
811 dx00
= _mm256_sub_pd(ix0
,jx0
);
812 dy00
= _mm256_sub_pd(iy0
,jy0
);
813 dz00
= _mm256_sub_pd(iz0
,jz0
);
814 dx11
= _mm256_sub_pd(ix1
,jx1
);
815 dy11
= _mm256_sub_pd(iy1
,jy1
);
816 dz11
= _mm256_sub_pd(iz1
,jz1
);
817 dx12
= _mm256_sub_pd(ix1
,jx2
);
818 dy12
= _mm256_sub_pd(iy1
,jy2
);
819 dz12
= _mm256_sub_pd(iz1
,jz2
);
820 dx13
= _mm256_sub_pd(ix1
,jx3
);
821 dy13
= _mm256_sub_pd(iy1
,jy3
);
822 dz13
= _mm256_sub_pd(iz1
,jz3
);
823 dx21
= _mm256_sub_pd(ix2
,jx1
);
824 dy21
= _mm256_sub_pd(iy2
,jy1
);
825 dz21
= _mm256_sub_pd(iz2
,jz1
);
826 dx22
= _mm256_sub_pd(ix2
,jx2
);
827 dy22
= _mm256_sub_pd(iy2
,jy2
);
828 dz22
= _mm256_sub_pd(iz2
,jz2
);
829 dx23
= _mm256_sub_pd(ix2
,jx3
);
830 dy23
= _mm256_sub_pd(iy2
,jy3
);
831 dz23
= _mm256_sub_pd(iz2
,jz3
);
832 dx31
= _mm256_sub_pd(ix3
,jx1
);
833 dy31
= _mm256_sub_pd(iy3
,jy1
);
834 dz31
= _mm256_sub_pd(iz3
,jz1
);
835 dx32
= _mm256_sub_pd(ix3
,jx2
);
836 dy32
= _mm256_sub_pd(iy3
,jy2
);
837 dz32
= _mm256_sub_pd(iz3
,jz2
);
838 dx33
= _mm256_sub_pd(ix3
,jx3
);
839 dy33
= _mm256_sub_pd(iy3
,jy3
);
840 dz33
= _mm256_sub_pd(iz3
,jz3
);
842 /* Calculate squared distance and things based on it */
843 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
844 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
845 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
846 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
847 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
848 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
849 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
850 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
851 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
852 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
854 rinv00
= avx256_invsqrt_d(rsq00
);
855 rinv11
= avx256_invsqrt_d(rsq11
);
856 rinv12
= avx256_invsqrt_d(rsq12
);
857 rinv13
= avx256_invsqrt_d(rsq13
);
858 rinv21
= avx256_invsqrt_d(rsq21
);
859 rinv22
= avx256_invsqrt_d(rsq22
);
860 rinv23
= avx256_invsqrt_d(rsq23
);
861 rinv31
= avx256_invsqrt_d(rsq31
);
862 rinv32
= avx256_invsqrt_d(rsq32
);
863 rinv33
= avx256_invsqrt_d(rsq33
);
865 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
866 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
867 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
868 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
869 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
870 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
871 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
872 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
873 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
874 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
876 fjx0
= _mm256_setzero_pd();
877 fjy0
= _mm256_setzero_pd();
878 fjz0
= _mm256_setzero_pd();
879 fjx1
= _mm256_setzero_pd();
880 fjy1
= _mm256_setzero_pd();
881 fjz1
= _mm256_setzero_pd();
882 fjx2
= _mm256_setzero_pd();
883 fjy2
= _mm256_setzero_pd();
884 fjz2
= _mm256_setzero_pd();
885 fjx3
= _mm256_setzero_pd();
886 fjy3
= _mm256_setzero_pd();
887 fjz3
= _mm256_setzero_pd();
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 r00
= _mm256_mul_pd(rsq00
,rinv00
);
894 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
896 /* Analytical LJ-PME */
897 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
898 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
899 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
900 exponent
= avx256_exp_d(ewcljrsq
);
901 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
902 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
903 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
904 vvdw6
= _mm256_mul_pd(_mm256_sub_pd(c6_00
,_mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
))),rinvsix
);
905 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
906 vvdw
= _mm256_sub_pd(_mm256_mul_pd(vvdw12
,one_twelfth
),_mm256_mul_pd(vvdw6
,one_sixth
));
907 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
908 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,_mm256_sub_pd(vvdw6
,_mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
)))),rinvsq00
);
910 /* Update potential sum for this i atom from the interaction with this j atom. */
911 vvdw
= _mm256_andnot_pd(dummy_mask
,vvdw
);
912 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
916 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
918 /* Calculate temporary vectorial force */
919 tx
= _mm256_mul_pd(fscal
,dx00
);
920 ty
= _mm256_mul_pd(fscal
,dy00
);
921 tz
= _mm256_mul_pd(fscal
,dz00
);
923 /* Update vectorial force */
924 fix0
= _mm256_add_pd(fix0
,tx
);
925 fiy0
= _mm256_add_pd(fiy0
,ty
);
926 fiz0
= _mm256_add_pd(fiz0
,tz
);
928 fjx0
= _mm256_add_pd(fjx0
,tx
);
929 fjy0
= _mm256_add_pd(fjy0
,ty
);
930 fjz0
= _mm256_add_pd(fjz0
,tz
);
932 /**************************
933 * CALCULATE INTERACTIONS *
934 **************************/
936 r11
= _mm256_mul_pd(rsq11
,rinv11
);
937 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
939 /* EWALD ELECTROSTATICS */
941 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
942 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
943 ewitab
= _mm256_cvttpd_epi32(ewrt
);
944 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
945 ewitab
= _mm_slli_epi32(ewitab
,2);
946 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
947 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
948 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
949 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
950 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
951 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
952 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
953 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(rinv11
,velec
));
954 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
956 /* Update potential sum for this i atom from the interaction with this j atom. */
957 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
958 velecsum
= _mm256_add_pd(velecsum
,velec
);
962 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
964 /* Calculate temporary vectorial force */
965 tx
= _mm256_mul_pd(fscal
,dx11
);
966 ty
= _mm256_mul_pd(fscal
,dy11
);
967 tz
= _mm256_mul_pd(fscal
,dz11
);
969 /* Update vectorial force */
970 fix1
= _mm256_add_pd(fix1
,tx
);
971 fiy1
= _mm256_add_pd(fiy1
,ty
);
972 fiz1
= _mm256_add_pd(fiz1
,tz
);
974 fjx1
= _mm256_add_pd(fjx1
,tx
);
975 fjy1
= _mm256_add_pd(fjy1
,ty
);
976 fjz1
= _mm256_add_pd(fjz1
,tz
);
978 /**************************
979 * CALCULATE INTERACTIONS *
980 **************************/
982 r12
= _mm256_mul_pd(rsq12
,rinv12
);
983 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
989 ewitab
= _mm256_cvttpd_epi32(ewrt
);
990 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
991 ewitab
= _mm_slli_epi32(ewitab
,2);
992 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
993 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
994 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
995 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
996 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
997 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
998 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
999 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(rinv12
,velec
));
1000 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1002 /* Update potential sum for this i atom from the interaction with this j atom. */
1003 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1004 velecsum
= _mm256_add_pd(velecsum
,velec
);
1008 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1010 /* Calculate temporary vectorial force */
1011 tx
= _mm256_mul_pd(fscal
,dx12
);
1012 ty
= _mm256_mul_pd(fscal
,dy12
);
1013 tz
= _mm256_mul_pd(fscal
,dz12
);
1015 /* Update vectorial force */
1016 fix1
= _mm256_add_pd(fix1
,tx
);
1017 fiy1
= _mm256_add_pd(fiy1
,ty
);
1018 fiz1
= _mm256_add_pd(fiz1
,tz
);
1020 fjx2
= _mm256_add_pd(fjx2
,tx
);
1021 fjy2
= _mm256_add_pd(fjy2
,ty
);
1022 fjz2
= _mm256_add_pd(fjz2
,tz
);
1024 /**************************
1025 * CALCULATE INTERACTIONS *
1026 **************************/
1028 r13
= _mm256_mul_pd(rsq13
,rinv13
);
1029 r13
= _mm256_andnot_pd(dummy_mask
,r13
);
1031 /* EWALD ELECTROSTATICS */
1033 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1034 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
1035 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1036 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1037 ewitab
= _mm_slli_epi32(ewitab
,2);
1038 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1039 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1040 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1041 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1042 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1043 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1044 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1045 velec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(rinv13
,velec
));
1046 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1050 velecsum
= _mm256_add_pd(velecsum
,velec
);
1054 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1056 /* Calculate temporary vectorial force */
1057 tx
= _mm256_mul_pd(fscal
,dx13
);
1058 ty
= _mm256_mul_pd(fscal
,dy13
);
1059 tz
= _mm256_mul_pd(fscal
,dz13
);
1061 /* Update vectorial force */
1062 fix1
= _mm256_add_pd(fix1
,tx
);
1063 fiy1
= _mm256_add_pd(fiy1
,ty
);
1064 fiz1
= _mm256_add_pd(fiz1
,tz
);
1066 fjx3
= _mm256_add_pd(fjx3
,tx
);
1067 fjy3
= _mm256_add_pd(fjy3
,ty
);
1068 fjz3
= _mm256_add_pd(fjz3
,tz
);
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1075 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
1077 /* EWALD ELECTROSTATICS */
1079 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1080 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1081 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1082 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1083 ewitab
= _mm_slli_epi32(ewitab
,2);
1084 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1085 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1086 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1087 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1088 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1089 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1090 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1091 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(rinv21
,velec
));
1092 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1094 /* Update potential sum for this i atom from the interaction with this j atom. */
1095 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1096 velecsum
= _mm256_add_pd(velecsum
,velec
);
1100 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1102 /* Calculate temporary vectorial force */
1103 tx
= _mm256_mul_pd(fscal
,dx21
);
1104 ty
= _mm256_mul_pd(fscal
,dy21
);
1105 tz
= _mm256_mul_pd(fscal
,dz21
);
1107 /* Update vectorial force */
1108 fix2
= _mm256_add_pd(fix2
,tx
);
1109 fiy2
= _mm256_add_pd(fiy2
,ty
);
1110 fiz2
= _mm256_add_pd(fiz2
,tz
);
1112 fjx1
= _mm256_add_pd(fjx1
,tx
);
1113 fjy1
= _mm256_add_pd(fjy1
,ty
);
1114 fjz1
= _mm256_add_pd(fjz1
,tz
);
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1120 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1121 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
1123 /* EWALD ELECTROSTATICS */
1125 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1126 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1127 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1128 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1129 ewitab
= _mm_slli_epi32(ewitab
,2);
1130 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1131 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1132 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1133 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1134 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1135 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1136 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1137 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(rinv22
,velec
));
1138 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1142 velecsum
= _mm256_add_pd(velecsum
,velec
);
1146 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1148 /* Calculate temporary vectorial force */
1149 tx
= _mm256_mul_pd(fscal
,dx22
);
1150 ty
= _mm256_mul_pd(fscal
,dy22
);
1151 tz
= _mm256_mul_pd(fscal
,dz22
);
1153 /* Update vectorial force */
1154 fix2
= _mm256_add_pd(fix2
,tx
);
1155 fiy2
= _mm256_add_pd(fiy2
,ty
);
1156 fiz2
= _mm256_add_pd(fiz2
,tz
);
1158 fjx2
= _mm256_add_pd(fjx2
,tx
);
1159 fjy2
= _mm256_add_pd(fjy2
,ty
);
1160 fjz2
= _mm256_add_pd(fjz2
,tz
);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 r23
= _mm256_mul_pd(rsq23
,rinv23
);
1167 r23
= _mm256_andnot_pd(dummy_mask
,r23
);
1169 /* EWALD ELECTROSTATICS */
1171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1172 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
1173 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1174 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1175 ewitab
= _mm_slli_epi32(ewitab
,2);
1176 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1177 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1178 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1179 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1180 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1181 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1182 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1183 velec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(rinv23
,velec
));
1184 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
1186 /* Update potential sum for this i atom from the interaction with this j atom. */
1187 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1188 velecsum
= _mm256_add_pd(velecsum
,velec
);
1192 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1194 /* Calculate temporary vectorial force */
1195 tx
= _mm256_mul_pd(fscal
,dx23
);
1196 ty
= _mm256_mul_pd(fscal
,dy23
);
1197 tz
= _mm256_mul_pd(fscal
,dz23
);
1199 /* Update vectorial force */
1200 fix2
= _mm256_add_pd(fix2
,tx
);
1201 fiy2
= _mm256_add_pd(fiy2
,ty
);
1202 fiz2
= _mm256_add_pd(fiz2
,tz
);
1204 fjx3
= _mm256_add_pd(fjx3
,tx
);
1205 fjy3
= _mm256_add_pd(fjy3
,ty
);
1206 fjz3
= _mm256_add_pd(fjz3
,tz
);
1208 /**************************
1209 * CALCULATE INTERACTIONS *
1210 **************************/
1212 r31
= _mm256_mul_pd(rsq31
,rinv31
);
1213 r31
= _mm256_andnot_pd(dummy_mask
,r31
);
1215 /* EWALD ELECTROSTATICS */
1217 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1218 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
1219 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1220 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1221 ewitab
= _mm_slli_epi32(ewitab
,2);
1222 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1223 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1224 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1225 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1226 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1227 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1228 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1229 velec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(rinv31
,velec
));
1230 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1234 velecsum
= _mm256_add_pd(velecsum
,velec
);
1238 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1240 /* Calculate temporary vectorial force */
1241 tx
= _mm256_mul_pd(fscal
,dx31
);
1242 ty
= _mm256_mul_pd(fscal
,dy31
);
1243 tz
= _mm256_mul_pd(fscal
,dz31
);
1245 /* Update vectorial force */
1246 fix3
= _mm256_add_pd(fix3
,tx
);
1247 fiy3
= _mm256_add_pd(fiy3
,ty
);
1248 fiz3
= _mm256_add_pd(fiz3
,tz
);
1250 fjx1
= _mm256_add_pd(fjx1
,tx
);
1251 fjy1
= _mm256_add_pd(fjy1
,ty
);
1252 fjz1
= _mm256_add_pd(fjz1
,tz
);
1254 /**************************
1255 * CALCULATE INTERACTIONS *
1256 **************************/
1258 r32
= _mm256_mul_pd(rsq32
,rinv32
);
1259 r32
= _mm256_andnot_pd(dummy_mask
,r32
);
1261 /* EWALD ELECTROSTATICS */
1263 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1264 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
1265 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1266 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1267 ewitab
= _mm_slli_epi32(ewitab
,2);
1268 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1269 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1270 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1271 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1272 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1273 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1274 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1275 velec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(rinv32
,velec
));
1276 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
1278 /* Update potential sum for this i atom from the interaction with this j atom. */
1279 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1280 velecsum
= _mm256_add_pd(velecsum
,velec
);
1284 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1286 /* Calculate temporary vectorial force */
1287 tx
= _mm256_mul_pd(fscal
,dx32
);
1288 ty
= _mm256_mul_pd(fscal
,dy32
);
1289 tz
= _mm256_mul_pd(fscal
,dz32
);
1291 /* Update vectorial force */
1292 fix3
= _mm256_add_pd(fix3
,tx
);
1293 fiy3
= _mm256_add_pd(fiy3
,ty
);
1294 fiz3
= _mm256_add_pd(fiz3
,tz
);
1296 fjx2
= _mm256_add_pd(fjx2
,tx
);
1297 fjy2
= _mm256_add_pd(fjy2
,ty
);
1298 fjz2
= _mm256_add_pd(fjz2
,tz
);
1300 /**************************
1301 * CALCULATE INTERACTIONS *
1302 **************************/
1304 r33
= _mm256_mul_pd(rsq33
,rinv33
);
1305 r33
= _mm256_andnot_pd(dummy_mask
,r33
);
1307 /* EWALD ELECTROSTATICS */
1309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1310 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
1311 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1312 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1313 ewitab
= _mm_slli_epi32(ewitab
,2);
1314 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1315 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1316 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1317 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1318 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1319 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1320 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1321 velec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(rinv33
,velec
));
1322 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
1324 /* Update potential sum for this i atom from the interaction with this j atom. */
1325 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1326 velecsum
= _mm256_add_pd(velecsum
,velec
);
1330 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1332 /* Calculate temporary vectorial force */
1333 tx
= _mm256_mul_pd(fscal
,dx33
);
1334 ty
= _mm256_mul_pd(fscal
,dy33
);
1335 tz
= _mm256_mul_pd(fscal
,dz33
);
1337 /* Update vectorial force */
1338 fix3
= _mm256_add_pd(fix3
,tx
);
1339 fiy3
= _mm256_add_pd(fiy3
,ty
);
1340 fiz3
= _mm256_add_pd(fiz3
,tz
);
1342 fjx3
= _mm256_add_pd(fjx3
,tx
);
1343 fjy3
= _mm256_add_pd(fjy3
,ty
);
1344 fjz3
= _mm256_add_pd(fjz3
,tz
);
1346 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1347 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1348 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1349 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1351 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1352 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1353 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1355 /* Inner loop uses 436 flops */
1358 /* End of innermost loop */
1360 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1361 f
+i_coord_offset
,fshift
+i_shift_offset
);
1364 /* Update potential energies */
1365 gmx_mm256_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1366 gmx_mm256_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1368 /* Increment number of inner iterations */
1369 inneriter
+= j_index_end
- j_index_start
;
1371 /* Outer loop uses 26 flops */
1374 /* Increment number of outer iterations */
1377 /* Update outer/inner flops */
1379 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*436);
1382 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1383 * Electrostatics interaction: Ewald
1384 * VdW interaction: LJEwald
1385 * Geometry: Water4-Water4
1386 * Calculate force/pot: Force
1389 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_256_double
1390 (t_nblist
* gmx_restrict nlist
,
1391 rvec
* gmx_restrict xx
,
1392 rvec
* gmx_restrict ff
,
1393 struct t_forcerec
* gmx_restrict fr
,
1394 t_mdatoms
* gmx_restrict mdatoms
,
1395 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1396 t_nrnb
* gmx_restrict nrnb
)
1398 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1399 * just 0 for non-waters.
1400 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1401 * jnr indices corresponding to data put in the four positions in the SIMD register.
1403 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1404 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1405 int jnrA
,jnrB
,jnrC
,jnrD
;
1406 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1407 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1408 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1409 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1410 real rcutoff_scalar
;
1411 real
*shiftvec
,*fshift
,*x
,*f
;
1412 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1413 real scratch
[4*DIM
];
1414 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1415 real
* vdwioffsetptr0
;
1416 real
* vdwgridioffsetptr0
;
1417 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1418 real
* vdwioffsetptr1
;
1419 real
* vdwgridioffsetptr1
;
1420 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1421 real
* vdwioffsetptr2
;
1422 real
* vdwgridioffsetptr2
;
1423 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1424 real
* vdwioffsetptr3
;
1425 real
* vdwgridioffsetptr3
;
1426 __m256d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1427 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1428 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1429 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1430 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1431 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1432 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1433 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1434 __m256d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1435 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1436 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1437 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1438 __m256d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1439 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1440 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1441 __m256d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1442 __m256d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1443 __m256d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1444 __m256d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1445 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1448 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1451 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
1452 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
1464 __m256d ewclj
,ewclj2
,ewclj6
,ewcljrsq
,poly
,exponent
,f6A
,f6B
,sh_lj_ewald
;
1465 __m256d one_half
= _mm256_set1_pd(0.5);
1466 __m256d minus_one
= _mm256_set1_pd(-1.0);
1468 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1469 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1471 __m256d dummy_mask
,cutoff_mask
;
1472 __m128 tmpmask0
,tmpmask1
;
1473 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1474 __m256d one
= _mm256_set1_pd(1.0);
1475 __m256d two
= _mm256_set1_pd(2.0);
1481 jindex
= nlist
->jindex
;
1483 shiftidx
= nlist
->shift
;
1485 shiftvec
= fr
->shift_vec
[0];
1486 fshift
= fr
->fshift
[0];
1487 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
1488 charge
= mdatoms
->chargeA
;
1489 nvdwtype
= fr
->ntype
;
1490 vdwparam
= fr
->nbfp
;
1491 vdwtype
= mdatoms
->typeA
;
1492 vdwgridparam
= fr
->ljpme_c6grid
;
1493 sh_lj_ewald
= _mm256_set1_pd(fr
->ic
->sh_lj_ewald
);
1494 ewclj
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_lj
);
1495 ewclj2
= _mm256_mul_pd(minus_one
,_mm256_mul_pd(ewclj
,ewclj
));
1497 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
1498 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
1499 beta2
= _mm256_mul_pd(beta
,beta
);
1500 beta3
= _mm256_mul_pd(beta
,beta2
);
1502 ewtab
= fr
->ic
->tabq_coul_F
;
1503 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
1504 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
1506 /* Setup water-specific parameters */
1507 inr
= nlist
->iinr
[0];
1508 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
1509 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
1510 iq3
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+3]));
1511 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1512 vdwgridioffsetptr0
= vdwgridparam
+2*nvdwtype
*vdwtype
[inr
+0];
1514 jq1
= _mm256_set1_pd(charge
[inr
+1]);
1515 jq2
= _mm256_set1_pd(charge
[inr
+2]);
1516 jq3
= _mm256_set1_pd(charge
[inr
+3]);
1517 vdwjidx0A
= 2*vdwtype
[inr
+0];
1518 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
1519 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
1520 c6grid_00
= _mm256_set1_pd(vdwgridioffsetptr0
[vdwjidx0A
]);
1521 qq11
= _mm256_mul_pd(iq1
,jq1
);
1522 qq12
= _mm256_mul_pd(iq1
,jq2
);
1523 qq13
= _mm256_mul_pd(iq1
,jq3
);
1524 qq21
= _mm256_mul_pd(iq2
,jq1
);
1525 qq22
= _mm256_mul_pd(iq2
,jq2
);
1526 qq23
= _mm256_mul_pd(iq2
,jq3
);
1527 qq31
= _mm256_mul_pd(iq3
,jq1
);
1528 qq32
= _mm256_mul_pd(iq3
,jq2
);
1529 qq33
= _mm256_mul_pd(iq3
,jq3
);
1531 /* Avoid stupid compiler warnings */
1532 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1533 j_coord_offsetA
= 0;
1534 j_coord_offsetB
= 0;
1535 j_coord_offsetC
= 0;
1536 j_coord_offsetD
= 0;
1541 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1543 scratch
[iidx
] = 0.0;
1546 /* Start outer loop over neighborlists */
1547 for(iidx
=0; iidx
<nri
; iidx
++)
1549 /* Load shift vector for this list */
1550 i_shift_offset
= DIM
*shiftidx
[iidx
];
1552 /* Load limits for loop over neighbors */
1553 j_index_start
= jindex
[iidx
];
1554 j_index_end
= jindex
[iidx
+1];
1556 /* Get outer coordinate index */
1558 i_coord_offset
= DIM
*inr
;
1560 /* Load i particle coords and add shift vector */
1561 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1562 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1564 fix0
= _mm256_setzero_pd();
1565 fiy0
= _mm256_setzero_pd();
1566 fiz0
= _mm256_setzero_pd();
1567 fix1
= _mm256_setzero_pd();
1568 fiy1
= _mm256_setzero_pd();
1569 fiz1
= _mm256_setzero_pd();
1570 fix2
= _mm256_setzero_pd();
1571 fiy2
= _mm256_setzero_pd();
1572 fiz2
= _mm256_setzero_pd();
1573 fix3
= _mm256_setzero_pd();
1574 fiy3
= _mm256_setzero_pd();
1575 fiz3
= _mm256_setzero_pd();
1577 /* Start inner kernel loop */
1578 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1581 /* Get j neighbor index, and coordinate index */
1583 jnrB
= jjnr
[jidx
+1];
1584 jnrC
= jjnr
[jidx
+2];
1585 jnrD
= jjnr
[jidx
+3];
1586 j_coord_offsetA
= DIM
*jnrA
;
1587 j_coord_offsetB
= DIM
*jnrB
;
1588 j_coord_offsetC
= DIM
*jnrC
;
1589 j_coord_offsetD
= DIM
*jnrD
;
1591 /* load j atom coordinates */
1592 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1593 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1594 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1595 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1597 /* Calculate displacement vector */
1598 dx00
= _mm256_sub_pd(ix0
,jx0
);
1599 dy00
= _mm256_sub_pd(iy0
,jy0
);
1600 dz00
= _mm256_sub_pd(iz0
,jz0
);
1601 dx11
= _mm256_sub_pd(ix1
,jx1
);
1602 dy11
= _mm256_sub_pd(iy1
,jy1
);
1603 dz11
= _mm256_sub_pd(iz1
,jz1
);
1604 dx12
= _mm256_sub_pd(ix1
,jx2
);
1605 dy12
= _mm256_sub_pd(iy1
,jy2
);
1606 dz12
= _mm256_sub_pd(iz1
,jz2
);
1607 dx13
= _mm256_sub_pd(ix1
,jx3
);
1608 dy13
= _mm256_sub_pd(iy1
,jy3
);
1609 dz13
= _mm256_sub_pd(iz1
,jz3
);
1610 dx21
= _mm256_sub_pd(ix2
,jx1
);
1611 dy21
= _mm256_sub_pd(iy2
,jy1
);
1612 dz21
= _mm256_sub_pd(iz2
,jz1
);
1613 dx22
= _mm256_sub_pd(ix2
,jx2
);
1614 dy22
= _mm256_sub_pd(iy2
,jy2
);
1615 dz22
= _mm256_sub_pd(iz2
,jz2
);
1616 dx23
= _mm256_sub_pd(ix2
,jx3
);
1617 dy23
= _mm256_sub_pd(iy2
,jy3
);
1618 dz23
= _mm256_sub_pd(iz2
,jz3
);
1619 dx31
= _mm256_sub_pd(ix3
,jx1
);
1620 dy31
= _mm256_sub_pd(iy3
,jy1
);
1621 dz31
= _mm256_sub_pd(iz3
,jz1
);
1622 dx32
= _mm256_sub_pd(ix3
,jx2
);
1623 dy32
= _mm256_sub_pd(iy3
,jy2
);
1624 dz32
= _mm256_sub_pd(iz3
,jz2
);
1625 dx33
= _mm256_sub_pd(ix3
,jx3
);
1626 dy33
= _mm256_sub_pd(iy3
,jy3
);
1627 dz33
= _mm256_sub_pd(iz3
,jz3
);
1629 /* Calculate squared distance and things based on it */
1630 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1631 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1632 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
1633 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
1634 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
1635 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
1636 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
1637 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
1638 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
1639 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
1641 rinv00
= avx256_invsqrt_d(rsq00
);
1642 rinv11
= avx256_invsqrt_d(rsq11
);
1643 rinv12
= avx256_invsqrt_d(rsq12
);
1644 rinv13
= avx256_invsqrt_d(rsq13
);
1645 rinv21
= avx256_invsqrt_d(rsq21
);
1646 rinv22
= avx256_invsqrt_d(rsq22
);
1647 rinv23
= avx256_invsqrt_d(rsq23
);
1648 rinv31
= avx256_invsqrt_d(rsq31
);
1649 rinv32
= avx256_invsqrt_d(rsq32
);
1650 rinv33
= avx256_invsqrt_d(rsq33
);
1652 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
1653 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
1654 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
1655 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
1656 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
1657 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
1658 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
1659 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
1660 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
1661 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
1663 fjx0
= _mm256_setzero_pd();
1664 fjy0
= _mm256_setzero_pd();
1665 fjz0
= _mm256_setzero_pd();
1666 fjx1
= _mm256_setzero_pd();
1667 fjy1
= _mm256_setzero_pd();
1668 fjz1
= _mm256_setzero_pd();
1669 fjx2
= _mm256_setzero_pd();
1670 fjy2
= _mm256_setzero_pd();
1671 fjz2
= _mm256_setzero_pd();
1672 fjx3
= _mm256_setzero_pd();
1673 fjy3
= _mm256_setzero_pd();
1674 fjz3
= _mm256_setzero_pd();
1676 /**************************
1677 * CALCULATE INTERACTIONS *
1678 **************************/
1680 r00
= _mm256_mul_pd(rsq00
,rinv00
);
1682 /* Analytical LJ-PME */
1683 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1684 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
1685 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
1686 exponent
= avx256_exp_d(ewcljrsq
);
1687 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1688 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
1689 /* f6A = 6 * C6grid * (1 - poly) */
1690 f6A
= _mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
));
1691 /* f6B = C6grid * exponent * beta^6 */
1692 f6B
= _mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
));
1693 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1694 fvdw
= _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),_mm256_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
1698 /* Calculate temporary vectorial force */
1699 tx
= _mm256_mul_pd(fscal
,dx00
);
1700 ty
= _mm256_mul_pd(fscal
,dy00
);
1701 tz
= _mm256_mul_pd(fscal
,dz00
);
1703 /* Update vectorial force */
1704 fix0
= _mm256_add_pd(fix0
,tx
);
1705 fiy0
= _mm256_add_pd(fiy0
,ty
);
1706 fiz0
= _mm256_add_pd(fiz0
,tz
);
1708 fjx0
= _mm256_add_pd(fjx0
,tx
);
1709 fjy0
= _mm256_add_pd(fjy0
,ty
);
1710 fjz0
= _mm256_add_pd(fjz0
,tz
);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 r11
= _mm256_mul_pd(rsq11
,rinv11
);
1718 /* EWALD ELECTROSTATICS */
1720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1721 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
1722 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1723 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1724 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1725 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1727 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1728 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
1732 /* Calculate temporary vectorial force */
1733 tx
= _mm256_mul_pd(fscal
,dx11
);
1734 ty
= _mm256_mul_pd(fscal
,dy11
);
1735 tz
= _mm256_mul_pd(fscal
,dz11
);
1737 /* Update vectorial force */
1738 fix1
= _mm256_add_pd(fix1
,tx
);
1739 fiy1
= _mm256_add_pd(fiy1
,ty
);
1740 fiz1
= _mm256_add_pd(fiz1
,tz
);
1742 fjx1
= _mm256_add_pd(fjx1
,tx
);
1743 fjy1
= _mm256_add_pd(fjy1
,ty
);
1744 fjz1
= _mm256_add_pd(fjz1
,tz
);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 r12
= _mm256_mul_pd(rsq12
,rinv12
);
1752 /* EWALD ELECTROSTATICS */
1754 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1755 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
1756 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1757 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1758 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1759 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1761 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1762 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1766 /* Calculate temporary vectorial force */
1767 tx
= _mm256_mul_pd(fscal
,dx12
);
1768 ty
= _mm256_mul_pd(fscal
,dy12
);
1769 tz
= _mm256_mul_pd(fscal
,dz12
);
1771 /* Update vectorial force */
1772 fix1
= _mm256_add_pd(fix1
,tx
);
1773 fiy1
= _mm256_add_pd(fiy1
,ty
);
1774 fiz1
= _mm256_add_pd(fiz1
,tz
);
1776 fjx2
= _mm256_add_pd(fjx2
,tx
);
1777 fjy2
= _mm256_add_pd(fjy2
,ty
);
1778 fjz2
= _mm256_add_pd(fjz2
,tz
);
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 r13
= _mm256_mul_pd(rsq13
,rinv13
);
1786 /* EWALD ELECTROSTATICS */
1788 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1789 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
1790 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1791 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1792 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1793 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1795 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1796 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
1800 /* Calculate temporary vectorial force */
1801 tx
= _mm256_mul_pd(fscal
,dx13
);
1802 ty
= _mm256_mul_pd(fscal
,dy13
);
1803 tz
= _mm256_mul_pd(fscal
,dz13
);
1805 /* Update vectorial force */
1806 fix1
= _mm256_add_pd(fix1
,tx
);
1807 fiy1
= _mm256_add_pd(fiy1
,ty
);
1808 fiz1
= _mm256_add_pd(fiz1
,tz
);
1810 fjx3
= _mm256_add_pd(fjx3
,tx
);
1811 fjy3
= _mm256_add_pd(fjy3
,ty
);
1812 fjz3
= _mm256_add_pd(fjz3
,tz
);
1814 /**************************
1815 * CALCULATE INTERACTIONS *
1816 **************************/
1818 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1820 /* EWALD ELECTROSTATICS */
1822 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1823 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1824 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1825 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1826 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1827 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1829 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1830 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1834 /* Calculate temporary vectorial force */
1835 tx
= _mm256_mul_pd(fscal
,dx21
);
1836 ty
= _mm256_mul_pd(fscal
,dy21
);
1837 tz
= _mm256_mul_pd(fscal
,dz21
);
1839 /* Update vectorial force */
1840 fix2
= _mm256_add_pd(fix2
,tx
);
1841 fiy2
= _mm256_add_pd(fiy2
,ty
);
1842 fiz2
= _mm256_add_pd(fiz2
,tz
);
1844 fjx1
= _mm256_add_pd(fjx1
,tx
);
1845 fjy1
= _mm256_add_pd(fjy1
,ty
);
1846 fjz1
= _mm256_add_pd(fjz1
,tz
);
1848 /**************************
1849 * CALCULATE INTERACTIONS *
1850 **************************/
1852 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1854 /* EWALD ELECTROSTATICS */
1856 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1857 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1858 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1859 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1860 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1861 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1863 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1864 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1868 /* Calculate temporary vectorial force */
1869 tx
= _mm256_mul_pd(fscal
,dx22
);
1870 ty
= _mm256_mul_pd(fscal
,dy22
);
1871 tz
= _mm256_mul_pd(fscal
,dz22
);
1873 /* Update vectorial force */
1874 fix2
= _mm256_add_pd(fix2
,tx
);
1875 fiy2
= _mm256_add_pd(fiy2
,ty
);
1876 fiz2
= _mm256_add_pd(fiz2
,tz
);
1878 fjx2
= _mm256_add_pd(fjx2
,tx
);
1879 fjy2
= _mm256_add_pd(fjy2
,ty
);
1880 fjz2
= _mm256_add_pd(fjz2
,tz
);
1882 /**************************
1883 * CALCULATE INTERACTIONS *
1884 **************************/
1886 r23
= _mm256_mul_pd(rsq23
,rinv23
);
1888 /* EWALD ELECTROSTATICS */
1890 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1891 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
1892 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1893 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1894 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1895 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1897 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1898 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
1902 /* Calculate temporary vectorial force */
1903 tx
= _mm256_mul_pd(fscal
,dx23
);
1904 ty
= _mm256_mul_pd(fscal
,dy23
);
1905 tz
= _mm256_mul_pd(fscal
,dz23
);
1907 /* Update vectorial force */
1908 fix2
= _mm256_add_pd(fix2
,tx
);
1909 fiy2
= _mm256_add_pd(fiy2
,ty
);
1910 fiz2
= _mm256_add_pd(fiz2
,tz
);
1912 fjx3
= _mm256_add_pd(fjx3
,tx
);
1913 fjy3
= _mm256_add_pd(fjy3
,ty
);
1914 fjz3
= _mm256_add_pd(fjz3
,tz
);
1916 /**************************
1917 * CALCULATE INTERACTIONS *
1918 **************************/
1920 r31
= _mm256_mul_pd(rsq31
,rinv31
);
1922 /* EWALD ELECTROSTATICS */
1924 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1925 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
1926 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1927 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1928 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1929 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1931 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1932 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
1936 /* Calculate temporary vectorial force */
1937 tx
= _mm256_mul_pd(fscal
,dx31
);
1938 ty
= _mm256_mul_pd(fscal
,dy31
);
1939 tz
= _mm256_mul_pd(fscal
,dz31
);
1941 /* Update vectorial force */
1942 fix3
= _mm256_add_pd(fix3
,tx
);
1943 fiy3
= _mm256_add_pd(fiy3
,ty
);
1944 fiz3
= _mm256_add_pd(fiz3
,tz
);
1946 fjx1
= _mm256_add_pd(fjx1
,tx
);
1947 fjy1
= _mm256_add_pd(fjy1
,ty
);
1948 fjz1
= _mm256_add_pd(fjz1
,tz
);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 r32
= _mm256_mul_pd(rsq32
,rinv32
);
1956 /* EWALD ELECTROSTATICS */
1958 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1959 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
1960 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1961 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1962 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1963 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1965 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1966 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
1970 /* Calculate temporary vectorial force */
1971 tx
= _mm256_mul_pd(fscal
,dx32
);
1972 ty
= _mm256_mul_pd(fscal
,dy32
);
1973 tz
= _mm256_mul_pd(fscal
,dz32
);
1975 /* Update vectorial force */
1976 fix3
= _mm256_add_pd(fix3
,tx
);
1977 fiy3
= _mm256_add_pd(fiy3
,ty
);
1978 fiz3
= _mm256_add_pd(fiz3
,tz
);
1980 fjx2
= _mm256_add_pd(fjx2
,tx
);
1981 fjy2
= _mm256_add_pd(fjy2
,ty
);
1982 fjz2
= _mm256_add_pd(fjz2
,tz
);
1984 /**************************
1985 * CALCULATE INTERACTIONS *
1986 **************************/
1988 r33
= _mm256_mul_pd(rsq33
,rinv33
);
1990 /* EWALD ELECTROSTATICS */
1992 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1993 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
1994 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1995 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1996 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1997 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1999 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2000 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
2004 /* Calculate temporary vectorial force */
2005 tx
= _mm256_mul_pd(fscal
,dx33
);
2006 ty
= _mm256_mul_pd(fscal
,dy33
);
2007 tz
= _mm256_mul_pd(fscal
,dz33
);
2009 /* Update vectorial force */
2010 fix3
= _mm256_add_pd(fix3
,tx
);
2011 fiy3
= _mm256_add_pd(fiy3
,ty
);
2012 fiz3
= _mm256_add_pd(fiz3
,tz
);
2014 fjx3
= _mm256_add_pd(fjx3
,tx
);
2015 fjy3
= _mm256_add_pd(fjy3
,ty
);
2016 fjz3
= _mm256_add_pd(fjz3
,tz
);
2018 fjptrA
= f
+j_coord_offsetA
;
2019 fjptrB
= f
+j_coord_offsetB
;
2020 fjptrC
= f
+j_coord_offsetC
;
2021 fjptrD
= f
+j_coord_offsetD
;
2023 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2024 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2025 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2027 /* Inner loop uses 373 flops */
2030 if(jidx
<j_index_end
)
2033 /* Get j neighbor index, and coordinate index */
2034 jnrlistA
= jjnr
[jidx
];
2035 jnrlistB
= jjnr
[jidx
+1];
2036 jnrlistC
= jjnr
[jidx
+2];
2037 jnrlistD
= jjnr
[jidx
+3];
2038 /* Sign of each element will be negative for non-real atoms.
2039 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2040 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
2042 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
2044 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
2045 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
2046 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
2048 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2049 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2050 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2051 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2052 j_coord_offsetA
= DIM
*jnrA
;
2053 j_coord_offsetB
= DIM
*jnrB
;
2054 j_coord_offsetC
= DIM
*jnrC
;
2055 j_coord_offsetD
= DIM
*jnrD
;
2057 /* load j atom coordinates */
2058 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2059 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2060 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2061 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2063 /* Calculate displacement vector */
2064 dx00
= _mm256_sub_pd(ix0
,jx0
);
2065 dy00
= _mm256_sub_pd(iy0
,jy0
);
2066 dz00
= _mm256_sub_pd(iz0
,jz0
);
2067 dx11
= _mm256_sub_pd(ix1
,jx1
);
2068 dy11
= _mm256_sub_pd(iy1
,jy1
);
2069 dz11
= _mm256_sub_pd(iz1
,jz1
);
2070 dx12
= _mm256_sub_pd(ix1
,jx2
);
2071 dy12
= _mm256_sub_pd(iy1
,jy2
);
2072 dz12
= _mm256_sub_pd(iz1
,jz2
);
2073 dx13
= _mm256_sub_pd(ix1
,jx3
);
2074 dy13
= _mm256_sub_pd(iy1
,jy3
);
2075 dz13
= _mm256_sub_pd(iz1
,jz3
);
2076 dx21
= _mm256_sub_pd(ix2
,jx1
);
2077 dy21
= _mm256_sub_pd(iy2
,jy1
);
2078 dz21
= _mm256_sub_pd(iz2
,jz1
);
2079 dx22
= _mm256_sub_pd(ix2
,jx2
);
2080 dy22
= _mm256_sub_pd(iy2
,jy2
);
2081 dz22
= _mm256_sub_pd(iz2
,jz2
);
2082 dx23
= _mm256_sub_pd(ix2
,jx3
);
2083 dy23
= _mm256_sub_pd(iy2
,jy3
);
2084 dz23
= _mm256_sub_pd(iz2
,jz3
);
2085 dx31
= _mm256_sub_pd(ix3
,jx1
);
2086 dy31
= _mm256_sub_pd(iy3
,jy1
);
2087 dz31
= _mm256_sub_pd(iz3
,jz1
);
2088 dx32
= _mm256_sub_pd(ix3
,jx2
);
2089 dy32
= _mm256_sub_pd(iy3
,jy2
);
2090 dz32
= _mm256_sub_pd(iz3
,jz2
);
2091 dx33
= _mm256_sub_pd(ix3
,jx3
);
2092 dy33
= _mm256_sub_pd(iy3
,jy3
);
2093 dz33
= _mm256_sub_pd(iz3
,jz3
);
2095 /* Calculate squared distance and things based on it */
2096 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
2097 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
2098 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
2099 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
2100 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
2101 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
2102 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
2103 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
2104 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
2105 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
2107 rinv00
= avx256_invsqrt_d(rsq00
);
2108 rinv11
= avx256_invsqrt_d(rsq11
);
2109 rinv12
= avx256_invsqrt_d(rsq12
);
2110 rinv13
= avx256_invsqrt_d(rsq13
);
2111 rinv21
= avx256_invsqrt_d(rsq21
);
2112 rinv22
= avx256_invsqrt_d(rsq22
);
2113 rinv23
= avx256_invsqrt_d(rsq23
);
2114 rinv31
= avx256_invsqrt_d(rsq31
);
2115 rinv32
= avx256_invsqrt_d(rsq32
);
2116 rinv33
= avx256_invsqrt_d(rsq33
);
2118 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
2119 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
2120 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
2121 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
2122 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
2123 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
2124 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
2125 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
2126 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
2127 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
2129 fjx0
= _mm256_setzero_pd();
2130 fjy0
= _mm256_setzero_pd();
2131 fjz0
= _mm256_setzero_pd();
2132 fjx1
= _mm256_setzero_pd();
2133 fjy1
= _mm256_setzero_pd();
2134 fjz1
= _mm256_setzero_pd();
2135 fjx2
= _mm256_setzero_pd();
2136 fjy2
= _mm256_setzero_pd();
2137 fjz2
= _mm256_setzero_pd();
2138 fjx3
= _mm256_setzero_pd();
2139 fjy3
= _mm256_setzero_pd();
2140 fjz3
= _mm256_setzero_pd();
2142 /**************************
2143 * CALCULATE INTERACTIONS *
2144 **************************/
2146 r00
= _mm256_mul_pd(rsq00
,rinv00
);
2147 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
2149 /* Analytical LJ-PME */
2150 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2151 ewcljrsq
= _mm256_mul_pd(ewclj2
,rsq00
);
2152 ewclj6
= _mm256_mul_pd(ewclj2
,_mm256_mul_pd(ewclj2
,ewclj2
));
2153 exponent
= avx256_exp_d(ewcljrsq
);
2154 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2155 poly
= _mm256_mul_pd(exponent
,_mm256_add_pd(_mm256_sub_pd(one
,ewcljrsq
),_mm256_mul_pd(_mm256_mul_pd(ewcljrsq
,ewcljrsq
),one_half
)));
2156 /* f6A = 6 * C6grid * (1 - poly) */
2157 f6A
= _mm256_mul_pd(c6grid_00
,_mm256_sub_pd(one
,poly
));
2158 /* f6B = C6grid * exponent * beta^6 */
2159 f6B
= _mm256_mul_pd(_mm256_mul_pd(c6grid_00
,one_sixth
),_mm256_mul_pd(exponent
,ewclj6
));
2160 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2161 fvdw
= _mm256_mul_pd(_mm256_add_pd(_mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),_mm256_sub_pd(c6_00
,f6A
)),rinvsix
),f6B
),rinvsq00
);
2165 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2167 /* Calculate temporary vectorial force */
2168 tx
= _mm256_mul_pd(fscal
,dx00
);
2169 ty
= _mm256_mul_pd(fscal
,dy00
);
2170 tz
= _mm256_mul_pd(fscal
,dz00
);
2172 /* Update vectorial force */
2173 fix0
= _mm256_add_pd(fix0
,tx
);
2174 fiy0
= _mm256_add_pd(fiy0
,ty
);
2175 fiz0
= _mm256_add_pd(fiz0
,tz
);
2177 fjx0
= _mm256_add_pd(fjx0
,tx
);
2178 fjy0
= _mm256_add_pd(fjy0
,ty
);
2179 fjz0
= _mm256_add_pd(fjz0
,tz
);
2181 /**************************
2182 * CALCULATE INTERACTIONS *
2183 **************************/
2185 r11
= _mm256_mul_pd(rsq11
,rinv11
);
2186 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
2188 /* EWALD ELECTROSTATICS */
2190 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2191 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
2192 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2193 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2194 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2195 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2197 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2198 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
2202 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2204 /* Calculate temporary vectorial force */
2205 tx
= _mm256_mul_pd(fscal
,dx11
);
2206 ty
= _mm256_mul_pd(fscal
,dy11
);
2207 tz
= _mm256_mul_pd(fscal
,dz11
);
2209 /* Update vectorial force */
2210 fix1
= _mm256_add_pd(fix1
,tx
);
2211 fiy1
= _mm256_add_pd(fiy1
,ty
);
2212 fiz1
= _mm256_add_pd(fiz1
,tz
);
2214 fjx1
= _mm256_add_pd(fjx1
,tx
);
2215 fjy1
= _mm256_add_pd(fjy1
,ty
);
2216 fjz1
= _mm256_add_pd(fjz1
,tz
);
2218 /**************************
2219 * CALCULATE INTERACTIONS *
2220 **************************/
2222 r12
= _mm256_mul_pd(rsq12
,rinv12
);
2223 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
2225 /* EWALD ELECTROSTATICS */
2227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2228 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
2229 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2230 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2231 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2232 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2234 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2235 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
2239 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2241 /* Calculate temporary vectorial force */
2242 tx
= _mm256_mul_pd(fscal
,dx12
);
2243 ty
= _mm256_mul_pd(fscal
,dy12
);
2244 tz
= _mm256_mul_pd(fscal
,dz12
);
2246 /* Update vectorial force */
2247 fix1
= _mm256_add_pd(fix1
,tx
);
2248 fiy1
= _mm256_add_pd(fiy1
,ty
);
2249 fiz1
= _mm256_add_pd(fiz1
,tz
);
2251 fjx2
= _mm256_add_pd(fjx2
,tx
);
2252 fjy2
= _mm256_add_pd(fjy2
,ty
);
2253 fjz2
= _mm256_add_pd(fjz2
,tz
);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 r13
= _mm256_mul_pd(rsq13
,rinv13
);
2260 r13
= _mm256_andnot_pd(dummy_mask
,r13
);
2262 /* EWALD ELECTROSTATICS */
2264 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2265 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
2266 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2267 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2268 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2269 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2271 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2272 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
2276 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2278 /* Calculate temporary vectorial force */
2279 tx
= _mm256_mul_pd(fscal
,dx13
);
2280 ty
= _mm256_mul_pd(fscal
,dy13
);
2281 tz
= _mm256_mul_pd(fscal
,dz13
);
2283 /* Update vectorial force */
2284 fix1
= _mm256_add_pd(fix1
,tx
);
2285 fiy1
= _mm256_add_pd(fiy1
,ty
);
2286 fiz1
= _mm256_add_pd(fiz1
,tz
);
2288 fjx3
= _mm256_add_pd(fjx3
,tx
);
2289 fjy3
= _mm256_add_pd(fjy3
,ty
);
2290 fjz3
= _mm256_add_pd(fjz3
,tz
);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 r21
= _mm256_mul_pd(rsq21
,rinv21
);
2297 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
2299 /* EWALD ELECTROSTATICS */
2301 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2302 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
2303 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2304 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2305 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2306 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2308 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2309 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
2313 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2315 /* Calculate temporary vectorial force */
2316 tx
= _mm256_mul_pd(fscal
,dx21
);
2317 ty
= _mm256_mul_pd(fscal
,dy21
);
2318 tz
= _mm256_mul_pd(fscal
,dz21
);
2320 /* Update vectorial force */
2321 fix2
= _mm256_add_pd(fix2
,tx
);
2322 fiy2
= _mm256_add_pd(fiy2
,ty
);
2323 fiz2
= _mm256_add_pd(fiz2
,tz
);
2325 fjx1
= _mm256_add_pd(fjx1
,tx
);
2326 fjy1
= _mm256_add_pd(fjy1
,ty
);
2327 fjz1
= _mm256_add_pd(fjz1
,tz
);
2329 /**************************
2330 * CALCULATE INTERACTIONS *
2331 **************************/
2333 r22
= _mm256_mul_pd(rsq22
,rinv22
);
2334 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
2336 /* EWALD ELECTROSTATICS */
2338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2339 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
2340 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2341 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2342 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2343 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2345 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2346 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
2350 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2352 /* Calculate temporary vectorial force */
2353 tx
= _mm256_mul_pd(fscal
,dx22
);
2354 ty
= _mm256_mul_pd(fscal
,dy22
);
2355 tz
= _mm256_mul_pd(fscal
,dz22
);
2357 /* Update vectorial force */
2358 fix2
= _mm256_add_pd(fix2
,tx
);
2359 fiy2
= _mm256_add_pd(fiy2
,ty
);
2360 fiz2
= _mm256_add_pd(fiz2
,tz
);
2362 fjx2
= _mm256_add_pd(fjx2
,tx
);
2363 fjy2
= _mm256_add_pd(fjy2
,ty
);
2364 fjz2
= _mm256_add_pd(fjz2
,tz
);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 r23
= _mm256_mul_pd(rsq23
,rinv23
);
2371 r23
= _mm256_andnot_pd(dummy_mask
,r23
);
2373 /* EWALD ELECTROSTATICS */
2375 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2376 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
2377 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2378 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2379 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2380 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2382 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2383 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
2387 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2389 /* Calculate temporary vectorial force */
2390 tx
= _mm256_mul_pd(fscal
,dx23
);
2391 ty
= _mm256_mul_pd(fscal
,dy23
);
2392 tz
= _mm256_mul_pd(fscal
,dz23
);
2394 /* Update vectorial force */
2395 fix2
= _mm256_add_pd(fix2
,tx
);
2396 fiy2
= _mm256_add_pd(fiy2
,ty
);
2397 fiz2
= _mm256_add_pd(fiz2
,tz
);
2399 fjx3
= _mm256_add_pd(fjx3
,tx
);
2400 fjy3
= _mm256_add_pd(fjy3
,ty
);
2401 fjz3
= _mm256_add_pd(fjz3
,tz
);
2403 /**************************
2404 * CALCULATE INTERACTIONS *
2405 **************************/
2407 r31
= _mm256_mul_pd(rsq31
,rinv31
);
2408 r31
= _mm256_andnot_pd(dummy_mask
,r31
);
2410 /* EWALD ELECTROSTATICS */
2412 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2413 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
2414 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2415 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2416 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2417 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2419 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2420 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
2424 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2426 /* Calculate temporary vectorial force */
2427 tx
= _mm256_mul_pd(fscal
,dx31
);
2428 ty
= _mm256_mul_pd(fscal
,dy31
);
2429 tz
= _mm256_mul_pd(fscal
,dz31
);
2431 /* Update vectorial force */
2432 fix3
= _mm256_add_pd(fix3
,tx
);
2433 fiy3
= _mm256_add_pd(fiy3
,ty
);
2434 fiz3
= _mm256_add_pd(fiz3
,tz
);
2436 fjx1
= _mm256_add_pd(fjx1
,tx
);
2437 fjy1
= _mm256_add_pd(fjy1
,ty
);
2438 fjz1
= _mm256_add_pd(fjz1
,tz
);
2440 /**************************
2441 * CALCULATE INTERACTIONS *
2442 **************************/
2444 r32
= _mm256_mul_pd(rsq32
,rinv32
);
2445 r32
= _mm256_andnot_pd(dummy_mask
,r32
);
2447 /* EWALD ELECTROSTATICS */
2449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2450 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
2451 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2452 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2453 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2454 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2456 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2457 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
2461 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2463 /* Calculate temporary vectorial force */
2464 tx
= _mm256_mul_pd(fscal
,dx32
);
2465 ty
= _mm256_mul_pd(fscal
,dy32
);
2466 tz
= _mm256_mul_pd(fscal
,dz32
);
2468 /* Update vectorial force */
2469 fix3
= _mm256_add_pd(fix3
,tx
);
2470 fiy3
= _mm256_add_pd(fiy3
,ty
);
2471 fiz3
= _mm256_add_pd(fiz3
,tz
);
2473 fjx2
= _mm256_add_pd(fjx2
,tx
);
2474 fjy2
= _mm256_add_pd(fjy2
,ty
);
2475 fjz2
= _mm256_add_pd(fjz2
,tz
);
2477 /**************************
2478 * CALCULATE INTERACTIONS *
2479 **************************/
2481 r33
= _mm256_mul_pd(rsq33
,rinv33
);
2482 r33
= _mm256_andnot_pd(dummy_mask
,r33
);
2484 /* EWALD ELECTROSTATICS */
2486 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2487 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
2488 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2489 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2490 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2491 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2493 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2494 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
2498 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2500 /* Calculate temporary vectorial force */
2501 tx
= _mm256_mul_pd(fscal
,dx33
);
2502 ty
= _mm256_mul_pd(fscal
,dy33
);
2503 tz
= _mm256_mul_pd(fscal
,dz33
);
2505 /* Update vectorial force */
2506 fix3
= _mm256_add_pd(fix3
,tx
);
2507 fiy3
= _mm256_add_pd(fiy3
,ty
);
2508 fiz3
= _mm256_add_pd(fiz3
,tz
);
2510 fjx3
= _mm256_add_pd(fjx3
,tx
);
2511 fjy3
= _mm256_add_pd(fjy3
,ty
);
2512 fjz3
= _mm256_add_pd(fjz3
,tz
);
2514 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2515 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2516 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2517 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2519 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2520 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2521 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2523 /* Inner loop uses 383 flops */
2526 /* End of innermost loop */
2528 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2529 f
+i_coord_offset
,fshift
+i_shift_offset
);
2531 /* Increment number of inner iterations */
2532 inneriter
+= j_index_end
- j_index_start
;
2534 /* Outer loop uses 24 flops */
2537 /* Increment number of outer iterations */
2540 /* Update outer/inner flops */
2542 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*383);