2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_256_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 real
* vdwioffsetptr0
;
84 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 real
* vdwioffsetptr1
;
86 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 real
* vdwioffsetptr2
;
88 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
90 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
92 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
94 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
96 __m256d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
97 __m256d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
98 __m256d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
99 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m256d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
102 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
107 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
110 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
111 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
113 __m128i ifour
= _mm_set1_epi32(4);
114 __m256d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
117 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
118 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
120 __m256d dummy_mask
,cutoff_mask
;
121 __m128 tmpmask0
,tmpmask1
;
122 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
123 __m256d one
= _mm256_set1_pd(1.0);
124 __m256d two
= _mm256_set1_pd(2.0);
130 jindex
= nlist
->jindex
;
132 shiftidx
= nlist
->shift
;
134 shiftvec
= fr
->shift_vec
[0];
135 fshift
= fr
->fshift
[0];
136 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
137 charge
= mdatoms
->chargeA
;
138 nvdwtype
= fr
->ntype
;
140 vdwtype
= mdatoms
->typeA
;
142 vftab
= kernel_data
->table_vdw
->data
;
143 vftabscale
= _mm256_set1_pd(kernel_data
->table_vdw
->scale
);
145 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
146 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
147 beta2
= _mm256_mul_pd(beta
,beta
);
148 beta3
= _mm256_mul_pd(beta
,beta2
);
150 ewtab
= fr
->ic
->tabq_coul_FDV0
;
151 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
152 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
154 /* Setup water-specific parameters */
155 inr
= nlist
->iinr
[0];
156 iq0
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+0]));
157 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
158 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
159 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
161 jq0
= _mm256_set1_pd(charge
[inr
+0]);
162 jq1
= _mm256_set1_pd(charge
[inr
+1]);
163 jq2
= _mm256_set1_pd(charge
[inr
+2]);
164 vdwjidx0A
= 2*vdwtype
[inr
+0];
165 qq00
= _mm256_mul_pd(iq0
,jq0
);
166 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
167 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
168 qq01
= _mm256_mul_pd(iq0
,jq1
);
169 qq02
= _mm256_mul_pd(iq0
,jq2
);
170 qq10
= _mm256_mul_pd(iq1
,jq0
);
171 qq11
= _mm256_mul_pd(iq1
,jq1
);
172 qq12
= _mm256_mul_pd(iq1
,jq2
);
173 qq20
= _mm256_mul_pd(iq2
,jq0
);
174 qq21
= _mm256_mul_pd(iq2
,jq1
);
175 qq22
= _mm256_mul_pd(iq2
,jq2
);
177 /* Avoid stupid compiler warnings */
178 jnrA
= jnrB
= jnrC
= jnrD
= 0;
187 for(iidx
=0;iidx
<4*DIM
;iidx
++)
192 /* Start outer loop over neighborlists */
193 for(iidx
=0; iidx
<nri
; iidx
++)
195 /* Load shift vector for this list */
196 i_shift_offset
= DIM
*shiftidx
[iidx
];
198 /* Load limits for loop over neighbors */
199 j_index_start
= jindex
[iidx
];
200 j_index_end
= jindex
[iidx
+1];
202 /* Get outer coordinate index */
204 i_coord_offset
= DIM
*inr
;
206 /* Load i particle coords and add shift vector */
207 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
208 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
210 fix0
= _mm256_setzero_pd();
211 fiy0
= _mm256_setzero_pd();
212 fiz0
= _mm256_setzero_pd();
213 fix1
= _mm256_setzero_pd();
214 fiy1
= _mm256_setzero_pd();
215 fiz1
= _mm256_setzero_pd();
216 fix2
= _mm256_setzero_pd();
217 fiy2
= _mm256_setzero_pd();
218 fiz2
= _mm256_setzero_pd();
220 /* Reset potential sums */
221 velecsum
= _mm256_setzero_pd();
222 vvdwsum
= _mm256_setzero_pd();
224 /* Start inner kernel loop */
225 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
228 /* Get j neighbor index, and coordinate index */
233 j_coord_offsetA
= DIM
*jnrA
;
234 j_coord_offsetB
= DIM
*jnrB
;
235 j_coord_offsetC
= DIM
*jnrC
;
236 j_coord_offsetD
= DIM
*jnrD
;
238 /* load j atom coordinates */
239 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
240 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
241 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
243 /* Calculate displacement vector */
244 dx00
= _mm256_sub_pd(ix0
,jx0
);
245 dy00
= _mm256_sub_pd(iy0
,jy0
);
246 dz00
= _mm256_sub_pd(iz0
,jz0
);
247 dx01
= _mm256_sub_pd(ix0
,jx1
);
248 dy01
= _mm256_sub_pd(iy0
,jy1
);
249 dz01
= _mm256_sub_pd(iz0
,jz1
);
250 dx02
= _mm256_sub_pd(ix0
,jx2
);
251 dy02
= _mm256_sub_pd(iy0
,jy2
);
252 dz02
= _mm256_sub_pd(iz0
,jz2
);
253 dx10
= _mm256_sub_pd(ix1
,jx0
);
254 dy10
= _mm256_sub_pd(iy1
,jy0
);
255 dz10
= _mm256_sub_pd(iz1
,jz0
);
256 dx11
= _mm256_sub_pd(ix1
,jx1
);
257 dy11
= _mm256_sub_pd(iy1
,jy1
);
258 dz11
= _mm256_sub_pd(iz1
,jz1
);
259 dx12
= _mm256_sub_pd(ix1
,jx2
);
260 dy12
= _mm256_sub_pd(iy1
,jy2
);
261 dz12
= _mm256_sub_pd(iz1
,jz2
);
262 dx20
= _mm256_sub_pd(ix2
,jx0
);
263 dy20
= _mm256_sub_pd(iy2
,jy0
);
264 dz20
= _mm256_sub_pd(iz2
,jz0
);
265 dx21
= _mm256_sub_pd(ix2
,jx1
);
266 dy21
= _mm256_sub_pd(iy2
,jy1
);
267 dz21
= _mm256_sub_pd(iz2
,jz1
);
268 dx22
= _mm256_sub_pd(ix2
,jx2
);
269 dy22
= _mm256_sub_pd(iy2
,jy2
);
270 dz22
= _mm256_sub_pd(iz2
,jz2
);
272 /* Calculate squared distance and things based on it */
273 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
274 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
275 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
276 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
277 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
278 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
279 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
280 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
281 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
283 rinv00
= avx256_invsqrt_d(rsq00
);
284 rinv01
= avx256_invsqrt_d(rsq01
);
285 rinv02
= avx256_invsqrt_d(rsq02
);
286 rinv10
= avx256_invsqrt_d(rsq10
);
287 rinv11
= avx256_invsqrt_d(rsq11
);
288 rinv12
= avx256_invsqrt_d(rsq12
);
289 rinv20
= avx256_invsqrt_d(rsq20
);
290 rinv21
= avx256_invsqrt_d(rsq21
);
291 rinv22
= avx256_invsqrt_d(rsq22
);
293 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
294 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
295 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
296 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
297 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
298 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
299 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
300 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
301 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
303 fjx0
= _mm256_setzero_pd();
304 fjy0
= _mm256_setzero_pd();
305 fjz0
= _mm256_setzero_pd();
306 fjx1
= _mm256_setzero_pd();
307 fjy1
= _mm256_setzero_pd();
308 fjz1
= _mm256_setzero_pd();
309 fjx2
= _mm256_setzero_pd();
310 fjy2
= _mm256_setzero_pd();
311 fjz2
= _mm256_setzero_pd();
313 /**************************
314 * CALCULATE INTERACTIONS *
315 **************************/
317 r00
= _mm256_mul_pd(rsq00
,rinv00
);
319 /* Calculate table index by multiplying r with table scale and truncate to integer */
320 rt
= _mm256_mul_pd(r00
,vftabscale
);
321 vfitab
= _mm256_cvttpd_epi32(rt
);
322 vfeps
= _mm256_sub_pd(rt
,_mm256_round_pd(rt
, _MM_FROUND_FLOOR
));
323 vfitab
= _mm_slli_epi32(vfitab
,3);
325 /* EWALD ELECTROSTATICS */
327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
329 ewitab
= _mm256_cvttpd_epi32(ewrt
);
330 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
331 ewitab
= _mm_slli_epi32(ewitab
,2);
332 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
333 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
334 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
335 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
336 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
337 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
338 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
339 velec
= _mm256_mul_pd(qq00
,_mm256_sub_pd(rinv00
,velec
));
340 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
342 /* CUBIC SPLINE TABLE DISPERSION */
343 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
344 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
345 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
346 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
347 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
348 Heps
= _mm256_mul_pd(vfeps
,H
);
349 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
350 VV
= _mm256_add_pd(Y
,_mm256_mul_pd(vfeps
,Fp
));
351 vvdw6
= _mm256_mul_pd(c6_00
,VV
);
352 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
353 fvdw6
= _mm256_mul_pd(c6_00
,FF
);
355 /* CUBIC SPLINE TABLE REPULSION */
356 vfitab
= _mm_add_epi32(vfitab
,ifour
);
357 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
358 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
359 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
360 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
361 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
362 Heps
= _mm256_mul_pd(vfeps
,H
);
363 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
364 VV
= _mm256_add_pd(Y
,_mm256_mul_pd(vfeps
,Fp
));
365 vvdw12
= _mm256_mul_pd(c12_00
,VV
);
366 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
367 fvdw12
= _mm256_mul_pd(c12_00
,FF
);
368 vvdw
= _mm256_add_pd(vvdw12
,vvdw6
);
369 fvdw
= _mm256_xor_pd(signbit
,_mm256_mul_pd(_mm256_add_pd(fvdw6
,fvdw12
),_mm256_mul_pd(vftabscale
,rinv00
)));
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 velecsum
= _mm256_add_pd(velecsum
,velec
);
373 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
375 fscal
= _mm256_add_pd(felec
,fvdw
);
377 /* Calculate temporary vectorial force */
378 tx
= _mm256_mul_pd(fscal
,dx00
);
379 ty
= _mm256_mul_pd(fscal
,dy00
);
380 tz
= _mm256_mul_pd(fscal
,dz00
);
382 /* Update vectorial force */
383 fix0
= _mm256_add_pd(fix0
,tx
);
384 fiy0
= _mm256_add_pd(fiy0
,ty
);
385 fiz0
= _mm256_add_pd(fiz0
,tz
);
387 fjx0
= _mm256_add_pd(fjx0
,tx
);
388 fjy0
= _mm256_add_pd(fjy0
,ty
);
389 fjz0
= _mm256_add_pd(fjz0
,tz
);
391 /**************************
392 * CALCULATE INTERACTIONS *
393 **************************/
395 r01
= _mm256_mul_pd(rsq01
,rinv01
);
397 /* EWALD ELECTROSTATICS */
399 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
400 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
401 ewitab
= _mm256_cvttpd_epi32(ewrt
);
402 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
403 ewitab
= _mm_slli_epi32(ewitab
,2);
404 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
405 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
406 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
407 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
408 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
409 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
410 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
411 velec
= _mm256_mul_pd(qq01
,_mm256_sub_pd(rinv01
,velec
));
412 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
414 /* Update potential sum for this i atom from the interaction with this j atom. */
415 velecsum
= _mm256_add_pd(velecsum
,velec
);
419 /* Calculate temporary vectorial force */
420 tx
= _mm256_mul_pd(fscal
,dx01
);
421 ty
= _mm256_mul_pd(fscal
,dy01
);
422 tz
= _mm256_mul_pd(fscal
,dz01
);
424 /* Update vectorial force */
425 fix0
= _mm256_add_pd(fix0
,tx
);
426 fiy0
= _mm256_add_pd(fiy0
,ty
);
427 fiz0
= _mm256_add_pd(fiz0
,tz
);
429 fjx1
= _mm256_add_pd(fjx1
,tx
);
430 fjy1
= _mm256_add_pd(fjy1
,ty
);
431 fjz1
= _mm256_add_pd(fjz1
,tz
);
433 /**************************
434 * CALCULATE INTERACTIONS *
435 **************************/
437 r02
= _mm256_mul_pd(rsq02
,rinv02
);
439 /* EWALD ELECTROSTATICS */
441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
442 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
443 ewitab
= _mm256_cvttpd_epi32(ewrt
);
444 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
445 ewitab
= _mm_slli_epi32(ewitab
,2);
446 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
447 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
448 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
449 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
450 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
451 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
452 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
453 velec
= _mm256_mul_pd(qq02
,_mm256_sub_pd(rinv02
,velec
));
454 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velecsum
= _mm256_add_pd(velecsum
,velec
);
461 /* Calculate temporary vectorial force */
462 tx
= _mm256_mul_pd(fscal
,dx02
);
463 ty
= _mm256_mul_pd(fscal
,dy02
);
464 tz
= _mm256_mul_pd(fscal
,dz02
);
466 /* Update vectorial force */
467 fix0
= _mm256_add_pd(fix0
,tx
);
468 fiy0
= _mm256_add_pd(fiy0
,ty
);
469 fiz0
= _mm256_add_pd(fiz0
,tz
);
471 fjx2
= _mm256_add_pd(fjx2
,tx
);
472 fjy2
= _mm256_add_pd(fjy2
,ty
);
473 fjz2
= _mm256_add_pd(fjz2
,tz
);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 r10
= _mm256_mul_pd(rsq10
,rinv10
);
481 /* EWALD ELECTROSTATICS */
483 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
484 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
485 ewitab
= _mm256_cvttpd_epi32(ewrt
);
486 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
487 ewitab
= _mm_slli_epi32(ewitab
,2);
488 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
489 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
490 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
491 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
492 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
493 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
494 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
495 velec
= _mm256_mul_pd(qq10
,_mm256_sub_pd(rinv10
,velec
));
496 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velecsum
= _mm256_add_pd(velecsum
,velec
);
503 /* Calculate temporary vectorial force */
504 tx
= _mm256_mul_pd(fscal
,dx10
);
505 ty
= _mm256_mul_pd(fscal
,dy10
);
506 tz
= _mm256_mul_pd(fscal
,dz10
);
508 /* Update vectorial force */
509 fix1
= _mm256_add_pd(fix1
,tx
);
510 fiy1
= _mm256_add_pd(fiy1
,ty
);
511 fiz1
= _mm256_add_pd(fiz1
,tz
);
513 fjx0
= _mm256_add_pd(fjx0
,tx
);
514 fjy0
= _mm256_add_pd(fjy0
,ty
);
515 fjz0
= _mm256_add_pd(fjz0
,tz
);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 r11
= _mm256_mul_pd(rsq11
,rinv11
);
523 /* EWALD ELECTROSTATICS */
525 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
526 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
527 ewitab
= _mm256_cvttpd_epi32(ewrt
);
528 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
529 ewitab
= _mm_slli_epi32(ewitab
,2);
530 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
531 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
532 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
533 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
534 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
535 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
536 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
537 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(rinv11
,velec
));
538 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
540 /* Update potential sum for this i atom from the interaction with this j atom. */
541 velecsum
= _mm256_add_pd(velecsum
,velec
);
545 /* Calculate temporary vectorial force */
546 tx
= _mm256_mul_pd(fscal
,dx11
);
547 ty
= _mm256_mul_pd(fscal
,dy11
);
548 tz
= _mm256_mul_pd(fscal
,dz11
);
550 /* Update vectorial force */
551 fix1
= _mm256_add_pd(fix1
,tx
);
552 fiy1
= _mm256_add_pd(fiy1
,ty
);
553 fiz1
= _mm256_add_pd(fiz1
,tz
);
555 fjx1
= _mm256_add_pd(fjx1
,tx
);
556 fjy1
= _mm256_add_pd(fjy1
,ty
);
557 fjz1
= _mm256_add_pd(fjz1
,tz
);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 r12
= _mm256_mul_pd(rsq12
,rinv12
);
565 /* EWALD ELECTROSTATICS */
567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
568 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
569 ewitab
= _mm256_cvttpd_epi32(ewrt
);
570 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
571 ewitab
= _mm_slli_epi32(ewitab
,2);
572 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
573 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
574 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
575 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
576 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
577 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
578 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
579 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(rinv12
,velec
));
580 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum
= _mm256_add_pd(velecsum
,velec
);
587 /* Calculate temporary vectorial force */
588 tx
= _mm256_mul_pd(fscal
,dx12
);
589 ty
= _mm256_mul_pd(fscal
,dy12
);
590 tz
= _mm256_mul_pd(fscal
,dz12
);
592 /* Update vectorial force */
593 fix1
= _mm256_add_pd(fix1
,tx
);
594 fiy1
= _mm256_add_pd(fiy1
,ty
);
595 fiz1
= _mm256_add_pd(fiz1
,tz
);
597 fjx2
= _mm256_add_pd(fjx2
,tx
);
598 fjy2
= _mm256_add_pd(fjy2
,ty
);
599 fjz2
= _mm256_add_pd(fjz2
,tz
);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r20
= _mm256_mul_pd(rsq20
,rinv20
);
607 /* EWALD ELECTROSTATICS */
609 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
610 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
611 ewitab
= _mm256_cvttpd_epi32(ewrt
);
612 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
613 ewitab
= _mm_slli_epi32(ewitab
,2);
614 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
615 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
616 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
617 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
618 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
619 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
620 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
621 velec
= _mm256_mul_pd(qq20
,_mm256_sub_pd(rinv20
,velec
));
622 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
624 /* Update potential sum for this i atom from the interaction with this j atom. */
625 velecsum
= _mm256_add_pd(velecsum
,velec
);
629 /* Calculate temporary vectorial force */
630 tx
= _mm256_mul_pd(fscal
,dx20
);
631 ty
= _mm256_mul_pd(fscal
,dy20
);
632 tz
= _mm256_mul_pd(fscal
,dz20
);
634 /* Update vectorial force */
635 fix2
= _mm256_add_pd(fix2
,tx
);
636 fiy2
= _mm256_add_pd(fiy2
,ty
);
637 fiz2
= _mm256_add_pd(fiz2
,tz
);
639 fjx0
= _mm256_add_pd(fjx0
,tx
);
640 fjy0
= _mm256_add_pd(fjy0
,ty
);
641 fjz0
= _mm256_add_pd(fjz0
,tz
);
643 /**************************
644 * CALCULATE INTERACTIONS *
645 **************************/
647 r21
= _mm256_mul_pd(rsq21
,rinv21
);
649 /* EWALD ELECTROSTATICS */
651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
652 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
653 ewitab
= _mm256_cvttpd_epi32(ewrt
);
654 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
655 ewitab
= _mm_slli_epi32(ewitab
,2);
656 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
657 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
658 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
659 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
660 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
661 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
662 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
663 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(rinv21
,velec
));
664 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velecsum
= _mm256_add_pd(velecsum
,velec
);
671 /* Calculate temporary vectorial force */
672 tx
= _mm256_mul_pd(fscal
,dx21
);
673 ty
= _mm256_mul_pd(fscal
,dy21
);
674 tz
= _mm256_mul_pd(fscal
,dz21
);
676 /* Update vectorial force */
677 fix2
= _mm256_add_pd(fix2
,tx
);
678 fiy2
= _mm256_add_pd(fiy2
,ty
);
679 fiz2
= _mm256_add_pd(fiz2
,tz
);
681 fjx1
= _mm256_add_pd(fjx1
,tx
);
682 fjy1
= _mm256_add_pd(fjy1
,ty
);
683 fjz1
= _mm256_add_pd(fjz1
,tz
);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 r22
= _mm256_mul_pd(rsq22
,rinv22
);
691 /* EWALD ELECTROSTATICS */
693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
694 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
695 ewitab
= _mm256_cvttpd_epi32(ewrt
);
696 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
697 ewitab
= _mm_slli_epi32(ewitab
,2);
698 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
699 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
700 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
701 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
702 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
703 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
704 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
705 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(rinv22
,velec
));
706 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
708 /* Update potential sum for this i atom from the interaction with this j atom. */
709 velecsum
= _mm256_add_pd(velecsum
,velec
);
713 /* Calculate temporary vectorial force */
714 tx
= _mm256_mul_pd(fscal
,dx22
);
715 ty
= _mm256_mul_pd(fscal
,dy22
);
716 tz
= _mm256_mul_pd(fscal
,dz22
);
718 /* Update vectorial force */
719 fix2
= _mm256_add_pd(fix2
,tx
);
720 fiy2
= _mm256_add_pd(fiy2
,ty
);
721 fiz2
= _mm256_add_pd(fiz2
,tz
);
723 fjx2
= _mm256_add_pd(fjx2
,tx
);
724 fjy2
= _mm256_add_pd(fjy2
,ty
);
725 fjz2
= _mm256_add_pd(fjz2
,tz
);
727 fjptrA
= f
+j_coord_offsetA
;
728 fjptrB
= f
+j_coord_offsetB
;
729 fjptrC
= f
+j_coord_offsetC
;
730 fjptrD
= f
+j_coord_offsetD
;
732 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
733 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
735 /* Inner loop uses 403 flops */
741 /* Get j neighbor index, and coordinate index */
742 jnrlistA
= jjnr
[jidx
];
743 jnrlistB
= jjnr
[jidx
+1];
744 jnrlistC
= jjnr
[jidx
+2];
745 jnrlistD
= jjnr
[jidx
+3];
746 /* Sign of each element will be negative for non-real atoms.
747 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
748 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
750 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
752 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
753 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
754 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
756 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
757 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
758 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
759 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
760 j_coord_offsetA
= DIM
*jnrA
;
761 j_coord_offsetB
= DIM
*jnrB
;
762 j_coord_offsetC
= DIM
*jnrC
;
763 j_coord_offsetD
= DIM
*jnrD
;
765 /* load j atom coordinates */
766 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
767 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
768 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
770 /* Calculate displacement vector */
771 dx00
= _mm256_sub_pd(ix0
,jx0
);
772 dy00
= _mm256_sub_pd(iy0
,jy0
);
773 dz00
= _mm256_sub_pd(iz0
,jz0
);
774 dx01
= _mm256_sub_pd(ix0
,jx1
);
775 dy01
= _mm256_sub_pd(iy0
,jy1
);
776 dz01
= _mm256_sub_pd(iz0
,jz1
);
777 dx02
= _mm256_sub_pd(ix0
,jx2
);
778 dy02
= _mm256_sub_pd(iy0
,jy2
);
779 dz02
= _mm256_sub_pd(iz0
,jz2
);
780 dx10
= _mm256_sub_pd(ix1
,jx0
);
781 dy10
= _mm256_sub_pd(iy1
,jy0
);
782 dz10
= _mm256_sub_pd(iz1
,jz0
);
783 dx11
= _mm256_sub_pd(ix1
,jx1
);
784 dy11
= _mm256_sub_pd(iy1
,jy1
);
785 dz11
= _mm256_sub_pd(iz1
,jz1
);
786 dx12
= _mm256_sub_pd(ix1
,jx2
);
787 dy12
= _mm256_sub_pd(iy1
,jy2
);
788 dz12
= _mm256_sub_pd(iz1
,jz2
);
789 dx20
= _mm256_sub_pd(ix2
,jx0
);
790 dy20
= _mm256_sub_pd(iy2
,jy0
);
791 dz20
= _mm256_sub_pd(iz2
,jz0
);
792 dx21
= _mm256_sub_pd(ix2
,jx1
);
793 dy21
= _mm256_sub_pd(iy2
,jy1
);
794 dz21
= _mm256_sub_pd(iz2
,jz1
);
795 dx22
= _mm256_sub_pd(ix2
,jx2
);
796 dy22
= _mm256_sub_pd(iy2
,jy2
);
797 dz22
= _mm256_sub_pd(iz2
,jz2
);
799 /* Calculate squared distance and things based on it */
800 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
801 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
802 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
803 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
804 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
805 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
806 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
807 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
808 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
810 rinv00
= avx256_invsqrt_d(rsq00
);
811 rinv01
= avx256_invsqrt_d(rsq01
);
812 rinv02
= avx256_invsqrt_d(rsq02
);
813 rinv10
= avx256_invsqrt_d(rsq10
);
814 rinv11
= avx256_invsqrt_d(rsq11
);
815 rinv12
= avx256_invsqrt_d(rsq12
);
816 rinv20
= avx256_invsqrt_d(rsq20
);
817 rinv21
= avx256_invsqrt_d(rsq21
);
818 rinv22
= avx256_invsqrt_d(rsq22
);
820 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
821 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
822 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
823 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
824 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
825 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
826 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
827 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
828 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
830 fjx0
= _mm256_setzero_pd();
831 fjy0
= _mm256_setzero_pd();
832 fjz0
= _mm256_setzero_pd();
833 fjx1
= _mm256_setzero_pd();
834 fjy1
= _mm256_setzero_pd();
835 fjz1
= _mm256_setzero_pd();
836 fjx2
= _mm256_setzero_pd();
837 fjy2
= _mm256_setzero_pd();
838 fjz2
= _mm256_setzero_pd();
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 r00
= _mm256_mul_pd(rsq00
,rinv00
);
845 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
847 /* Calculate table index by multiplying r with table scale and truncate to integer */
848 rt
= _mm256_mul_pd(r00
,vftabscale
);
849 vfitab
= _mm256_cvttpd_epi32(rt
);
850 vfeps
= _mm256_sub_pd(rt
,_mm256_round_pd(rt
, _MM_FROUND_FLOOR
));
851 vfitab
= _mm_slli_epi32(vfitab
,3);
853 /* EWALD ELECTROSTATICS */
855 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
856 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
857 ewitab
= _mm256_cvttpd_epi32(ewrt
);
858 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
859 ewitab
= _mm_slli_epi32(ewitab
,2);
860 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
861 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
862 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
863 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
864 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
865 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
866 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
867 velec
= _mm256_mul_pd(qq00
,_mm256_sub_pd(rinv00
,velec
));
868 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
870 /* CUBIC SPLINE TABLE DISPERSION */
871 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
872 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
873 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
874 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
875 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
876 Heps
= _mm256_mul_pd(vfeps
,H
);
877 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
878 VV
= _mm256_add_pd(Y
,_mm256_mul_pd(vfeps
,Fp
));
879 vvdw6
= _mm256_mul_pd(c6_00
,VV
);
880 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
881 fvdw6
= _mm256_mul_pd(c6_00
,FF
);
883 /* CUBIC SPLINE TABLE REPULSION */
884 vfitab
= _mm_add_epi32(vfitab
,ifour
);
885 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
886 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
887 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
888 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
889 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
890 Heps
= _mm256_mul_pd(vfeps
,H
);
891 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
892 VV
= _mm256_add_pd(Y
,_mm256_mul_pd(vfeps
,Fp
));
893 vvdw12
= _mm256_mul_pd(c12_00
,VV
);
894 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
895 fvdw12
= _mm256_mul_pd(c12_00
,FF
);
896 vvdw
= _mm256_add_pd(vvdw12
,vvdw6
);
897 fvdw
= _mm256_xor_pd(signbit
,_mm256_mul_pd(_mm256_add_pd(fvdw6
,fvdw12
),_mm256_mul_pd(vftabscale
,rinv00
)));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
901 velecsum
= _mm256_add_pd(velecsum
,velec
);
902 vvdw
= _mm256_andnot_pd(dummy_mask
,vvdw
);
903 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
905 fscal
= _mm256_add_pd(felec
,fvdw
);
907 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
909 /* Calculate temporary vectorial force */
910 tx
= _mm256_mul_pd(fscal
,dx00
);
911 ty
= _mm256_mul_pd(fscal
,dy00
);
912 tz
= _mm256_mul_pd(fscal
,dz00
);
914 /* Update vectorial force */
915 fix0
= _mm256_add_pd(fix0
,tx
);
916 fiy0
= _mm256_add_pd(fiy0
,ty
);
917 fiz0
= _mm256_add_pd(fiz0
,tz
);
919 fjx0
= _mm256_add_pd(fjx0
,tx
);
920 fjy0
= _mm256_add_pd(fjy0
,ty
);
921 fjz0
= _mm256_add_pd(fjz0
,tz
);
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 r01
= _mm256_mul_pd(rsq01
,rinv01
);
928 r01
= _mm256_andnot_pd(dummy_mask
,r01
);
930 /* EWALD ELECTROSTATICS */
932 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
933 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
934 ewitab
= _mm256_cvttpd_epi32(ewrt
);
935 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
936 ewitab
= _mm_slli_epi32(ewitab
,2);
937 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
938 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
939 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
940 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
941 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
942 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
943 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
944 velec
= _mm256_mul_pd(qq01
,_mm256_sub_pd(rinv01
,velec
));
945 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
947 /* Update potential sum for this i atom from the interaction with this j atom. */
948 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
949 velecsum
= _mm256_add_pd(velecsum
,velec
);
953 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
955 /* Calculate temporary vectorial force */
956 tx
= _mm256_mul_pd(fscal
,dx01
);
957 ty
= _mm256_mul_pd(fscal
,dy01
);
958 tz
= _mm256_mul_pd(fscal
,dz01
);
960 /* Update vectorial force */
961 fix0
= _mm256_add_pd(fix0
,tx
);
962 fiy0
= _mm256_add_pd(fiy0
,ty
);
963 fiz0
= _mm256_add_pd(fiz0
,tz
);
965 fjx1
= _mm256_add_pd(fjx1
,tx
);
966 fjy1
= _mm256_add_pd(fjy1
,ty
);
967 fjz1
= _mm256_add_pd(fjz1
,tz
);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 r02
= _mm256_mul_pd(rsq02
,rinv02
);
974 r02
= _mm256_andnot_pd(dummy_mask
,r02
);
976 /* EWALD ELECTROSTATICS */
978 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
979 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
980 ewitab
= _mm256_cvttpd_epi32(ewrt
);
981 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
982 ewitab
= _mm_slli_epi32(ewitab
,2);
983 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
984 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
985 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
986 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
987 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
988 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
989 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
990 velec
= _mm256_mul_pd(qq02
,_mm256_sub_pd(rinv02
,velec
));
991 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
993 /* Update potential sum for this i atom from the interaction with this j atom. */
994 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
995 velecsum
= _mm256_add_pd(velecsum
,velec
);
999 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1001 /* Calculate temporary vectorial force */
1002 tx
= _mm256_mul_pd(fscal
,dx02
);
1003 ty
= _mm256_mul_pd(fscal
,dy02
);
1004 tz
= _mm256_mul_pd(fscal
,dz02
);
1006 /* Update vectorial force */
1007 fix0
= _mm256_add_pd(fix0
,tx
);
1008 fiy0
= _mm256_add_pd(fiy0
,ty
);
1009 fiz0
= _mm256_add_pd(fiz0
,tz
);
1011 fjx2
= _mm256_add_pd(fjx2
,tx
);
1012 fjy2
= _mm256_add_pd(fjy2
,ty
);
1013 fjz2
= _mm256_add_pd(fjz2
,tz
);
1015 /**************************
1016 * CALCULATE INTERACTIONS *
1017 **************************/
1019 r10
= _mm256_mul_pd(rsq10
,rinv10
);
1020 r10
= _mm256_andnot_pd(dummy_mask
,r10
);
1022 /* EWALD ELECTROSTATICS */
1024 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1025 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
1026 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1027 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1028 ewitab
= _mm_slli_epi32(ewitab
,2);
1029 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1030 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1031 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1032 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1033 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1034 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1035 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1036 velec
= _mm256_mul_pd(qq10
,_mm256_sub_pd(rinv10
,velec
));
1037 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
1039 /* Update potential sum for this i atom from the interaction with this j atom. */
1040 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1041 velecsum
= _mm256_add_pd(velecsum
,velec
);
1045 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1047 /* Calculate temporary vectorial force */
1048 tx
= _mm256_mul_pd(fscal
,dx10
);
1049 ty
= _mm256_mul_pd(fscal
,dy10
);
1050 tz
= _mm256_mul_pd(fscal
,dz10
);
1052 /* Update vectorial force */
1053 fix1
= _mm256_add_pd(fix1
,tx
);
1054 fiy1
= _mm256_add_pd(fiy1
,ty
);
1055 fiz1
= _mm256_add_pd(fiz1
,tz
);
1057 fjx0
= _mm256_add_pd(fjx0
,tx
);
1058 fjy0
= _mm256_add_pd(fjy0
,ty
);
1059 fjz0
= _mm256_add_pd(fjz0
,tz
);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 r11
= _mm256_mul_pd(rsq11
,rinv11
);
1066 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
1068 /* EWALD ELECTROSTATICS */
1070 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1071 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
1072 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1073 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1074 ewitab
= _mm_slli_epi32(ewitab
,2);
1075 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1076 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1077 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1078 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1079 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1080 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1081 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1082 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(rinv11
,velec
));
1083 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
1085 /* Update potential sum for this i atom from the interaction with this j atom. */
1086 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1087 velecsum
= _mm256_add_pd(velecsum
,velec
);
1091 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1093 /* Calculate temporary vectorial force */
1094 tx
= _mm256_mul_pd(fscal
,dx11
);
1095 ty
= _mm256_mul_pd(fscal
,dy11
);
1096 tz
= _mm256_mul_pd(fscal
,dz11
);
1098 /* Update vectorial force */
1099 fix1
= _mm256_add_pd(fix1
,tx
);
1100 fiy1
= _mm256_add_pd(fiy1
,ty
);
1101 fiz1
= _mm256_add_pd(fiz1
,tz
);
1103 fjx1
= _mm256_add_pd(fjx1
,tx
);
1104 fjy1
= _mm256_add_pd(fjy1
,ty
);
1105 fjz1
= _mm256_add_pd(fjz1
,tz
);
1107 /**************************
1108 * CALCULATE INTERACTIONS *
1109 **************************/
1111 r12
= _mm256_mul_pd(rsq12
,rinv12
);
1112 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
1114 /* EWALD ELECTROSTATICS */
1116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1117 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
1118 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1119 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1120 ewitab
= _mm_slli_epi32(ewitab
,2);
1121 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1122 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1123 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1124 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1125 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1126 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1127 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1128 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(rinv12
,velec
));
1129 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1131 /* Update potential sum for this i atom from the interaction with this j atom. */
1132 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1133 velecsum
= _mm256_add_pd(velecsum
,velec
);
1137 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1139 /* Calculate temporary vectorial force */
1140 tx
= _mm256_mul_pd(fscal
,dx12
);
1141 ty
= _mm256_mul_pd(fscal
,dy12
);
1142 tz
= _mm256_mul_pd(fscal
,dz12
);
1144 /* Update vectorial force */
1145 fix1
= _mm256_add_pd(fix1
,tx
);
1146 fiy1
= _mm256_add_pd(fiy1
,ty
);
1147 fiz1
= _mm256_add_pd(fiz1
,tz
);
1149 fjx2
= _mm256_add_pd(fjx2
,tx
);
1150 fjy2
= _mm256_add_pd(fjy2
,ty
);
1151 fjz2
= _mm256_add_pd(fjz2
,tz
);
1153 /**************************
1154 * CALCULATE INTERACTIONS *
1155 **************************/
1157 r20
= _mm256_mul_pd(rsq20
,rinv20
);
1158 r20
= _mm256_andnot_pd(dummy_mask
,r20
);
1160 /* EWALD ELECTROSTATICS */
1162 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1163 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
1164 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1165 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1166 ewitab
= _mm_slli_epi32(ewitab
,2);
1167 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1168 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1169 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1170 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1171 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1172 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1173 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1174 velec
= _mm256_mul_pd(qq20
,_mm256_sub_pd(rinv20
,velec
));
1175 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1179 velecsum
= _mm256_add_pd(velecsum
,velec
);
1183 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1185 /* Calculate temporary vectorial force */
1186 tx
= _mm256_mul_pd(fscal
,dx20
);
1187 ty
= _mm256_mul_pd(fscal
,dy20
);
1188 tz
= _mm256_mul_pd(fscal
,dz20
);
1190 /* Update vectorial force */
1191 fix2
= _mm256_add_pd(fix2
,tx
);
1192 fiy2
= _mm256_add_pd(fiy2
,ty
);
1193 fiz2
= _mm256_add_pd(fiz2
,tz
);
1195 fjx0
= _mm256_add_pd(fjx0
,tx
);
1196 fjy0
= _mm256_add_pd(fjy0
,ty
);
1197 fjz0
= _mm256_add_pd(fjz0
,tz
);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1204 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
1206 /* EWALD ELECTROSTATICS */
1208 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1209 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1210 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1211 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1212 ewitab
= _mm_slli_epi32(ewitab
,2);
1213 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1214 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1215 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1216 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1217 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1218 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1219 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1220 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(rinv21
,velec
));
1221 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1223 /* Update potential sum for this i atom from the interaction with this j atom. */
1224 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1225 velecsum
= _mm256_add_pd(velecsum
,velec
);
1229 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1231 /* Calculate temporary vectorial force */
1232 tx
= _mm256_mul_pd(fscal
,dx21
);
1233 ty
= _mm256_mul_pd(fscal
,dy21
);
1234 tz
= _mm256_mul_pd(fscal
,dz21
);
1236 /* Update vectorial force */
1237 fix2
= _mm256_add_pd(fix2
,tx
);
1238 fiy2
= _mm256_add_pd(fiy2
,ty
);
1239 fiz2
= _mm256_add_pd(fiz2
,tz
);
1241 fjx1
= _mm256_add_pd(fjx1
,tx
);
1242 fjy1
= _mm256_add_pd(fjy1
,ty
);
1243 fjz1
= _mm256_add_pd(fjz1
,tz
);
1245 /**************************
1246 * CALCULATE INTERACTIONS *
1247 **************************/
1249 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1250 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
1252 /* EWALD ELECTROSTATICS */
1254 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1255 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1256 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1257 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1258 ewitab
= _mm_slli_epi32(ewitab
,2);
1259 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1260 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1261 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1262 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1263 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1264 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1265 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1266 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(rinv22
,velec
));
1267 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1269 /* Update potential sum for this i atom from the interaction with this j atom. */
1270 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1271 velecsum
= _mm256_add_pd(velecsum
,velec
);
1275 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1277 /* Calculate temporary vectorial force */
1278 tx
= _mm256_mul_pd(fscal
,dx22
);
1279 ty
= _mm256_mul_pd(fscal
,dy22
);
1280 tz
= _mm256_mul_pd(fscal
,dz22
);
1282 /* Update vectorial force */
1283 fix2
= _mm256_add_pd(fix2
,tx
);
1284 fiy2
= _mm256_add_pd(fiy2
,ty
);
1285 fiz2
= _mm256_add_pd(fiz2
,tz
);
1287 fjx2
= _mm256_add_pd(fjx2
,tx
);
1288 fjy2
= _mm256_add_pd(fjy2
,ty
);
1289 fjz2
= _mm256_add_pd(fjz2
,tz
);
1291 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1292 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1293 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1294 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1296 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1297 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1299 /* Inner loop uses 412 flops */
1302 /* End of innermost loop */
1304 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1305 f
+i_coord_offset
,fshift
+i_shift_offset
);
1308 /* Update potential energies */
1309 gmx_mm256_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1310 gmx_mm256_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1312 /* Increment number of inner iterations */
1313 inneriter
+= j_index_end
- j_index_start
;
1315 /* Outer loop uses 20 flops */
1318 /* Increment number of outer iterations */
1321 /* Update outer/inner flops */
1323 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*412);
1326 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1327 * Electrostatics interaction: Ewald
1328 * VdW interaction: CubicSplineTable
1329 * Geometry: Water3-Water3
1330 * Calculate force/pot: Force
1333 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_256_double
1334 (t_nblist
* gmx_restrict nlist
,
1335 rvec
* gmx_restrict xx
,
1336 rvec
* gmx_restrict ff
,
1337 struct t_forcerec
* gmx_restrict fr
,
1338 t_mdatoms
* gmx_restrict mdatoms
,
1339 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1340 t_nrnb
* gmx_restrict nrnb
)
1342 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1343 * just 0 for non-waters.
1344 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1345 * jnr indices corresponding to data put in the four positions in the SIMD register.
1347 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1348 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1349 int jnrA
,jnrB
,jnrC
,jnrD
;
1350 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1351 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1352 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1353 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1354 real rcutoff_scalar
;
1355 real
*shiftvec
,*fshift
,*x
,*f
;
1356 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1357 real scratch
[4*DIM
];
1358 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1359 real
* vdwioffsetptr0
;
1360 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1361 real
* vdwioffsetptr1
;
1362 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1363 real
* vdwioffsetptr2
;
1364 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1365 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1366 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1367 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1368 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1369 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1370 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1371 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1372 __m256d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1373 __m256d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1374 __m256d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1375 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1376 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1377 __m256d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1378 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1379 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1380 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1383 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1386 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
1387 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
1389 __m128i ifour
= _mm_set1_epi32(4);
1390 __m256d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1393 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1394 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1396 __m256d dummy_mask
,cutoff_mask
;
1397 __m128 tmpmask0
,tmpmask1
;
1398 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1399 __m256d one
= _mm256_set1_pd(1.0);
1400 __m256d two
= _mm256_set1_pd(2.0);
1406 jindex
= nlist
->jindex
;
1408 shiftidx
= nlist
->shift
;
1410 shiftvec
= fr
->shift_vec
[0];
1411 fshift
= fr
->fshift
[0];
1412 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
1413 charge
= mdatoms
->chargeA
;
1414 nvdwtype
= fr
->ntype
;
1415 vdwparam
= fr
->nbfp
;
1416 vdwtype
= mdatoms
->typeA
;
1418 vftab
= kernel_data
->table_vdw
->data
;
1419 vftabscale
= _mm256_set1_pd(kernel_data
->table_vdw
->scale
);
1421 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
1422 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
1423 beta2
= _mm256_mul_pd(beta
,beta
);
1424 beta3
= _mm256_mul_pd(beta
,beta2
);
1426 ewtab
= fr
->ic
->tabq_coul_F
;
1427 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
1428 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
1430 /* Setup water-specific parameters */
1431 inr
= nlist
->iinr
[0];
1432 iq0
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+0]));
1433 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
1434 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
1435 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1437 jq0
= _mm256_set1_pd(charge
[inr
+0]);
1438 jq1
= _mm256_set1_pd(charge
[inr
+1]);
1439 jq2
= _mm256_set1_pd(charge
[inr
+2]);
1440 vdwjidx0A
= 2*vdwtype
[inr
+0];
1441 qq00
= _mm256_mul_pd(iq0
,jq0
);
1442 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
1443 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
1444 qq01
= _mm256_mul_pd(iq0
,jq1
);
1445 qq02
= _mm256_mul_pd(iq0
,jq2
);
1446 qq10
= _mm256_mul_pd(iq1
,jq0
);
1447 qq11
= _mm256_mul_pd(iq1
,jq1
);
1448 qq12
= _mm256_mul_pd(iq1
,jq2
);
1449 qq20
= _mm256_mul_pd(iq2
,jq0
);
1450 qq21
= _mm256_mul_pd(iq2
,jq1
);
1451 qq22
= _mm256_mul_pd(iq2
,jq2
);
1453 /* Avoid stupid compiler warnings */
1454 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1455 j_coord_offsetA
= 0;
1456 j_coord_offsetB
= 0;
1457 j_coord_offsetC
= 0;
1458 j_coord_offsetD
= 0;
1463 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1465 scratch
[iidx
] = 0.0;
1468 /* Start outer loop over neighborlists */
1469 for(iidx
=0; iidx
<nri
; iidx
++)
1471 /* Load shift vector for this list */
1472 i_shift_offset
= DIM
*shiftidx
[iidx
];
1474 /* Load limits for loop over neighbors */
1475 j_index_start
= jindex
[iidx
];
1476 j_index_end
= jindex
[iidx
+1];
1478 /* Get outer coordinate index */
1480 i_coord_offset
= DIM
*inr
;
1482 /* Load i particle coords and add shift vector */
1483 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1484 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1486 fix0
= _mm256_setzero_pd();
1487 fiy0
= _mm256_setzero_pd();
1488 fiz0
= _mm256_setzero_pd();
1489 fix1
= _mm256_setzero_pd();
1490 fiy1
= _mm256_setzero_pd();
1491 fiz1
= _mm256_setzero_pd();
1492 fix2
= _mm256_setzero_pd();
1493 fiy2
= _mm256_setzero_pd();
1494 fiz2
= _mm256_setzero_pd();
1496 /* Start inner kernel loop */
1497 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1500 /* Get j neighbor index, and coordinate index */
1502 jnrB
= jjnr
[jidx
+1];
1503 jnrC
= jjnr
[jidx
+2];
1504 jnrD
= jjnr
[jidx
+3];
1505 j_coord_offsetA
= DIM
*jnrA
;
1506 j_coord_offsetB
= DIM
*jnrB
;
1507 j_coord_offsetC
= DIM
*jnrC
;
1508 j_coord_offsetD
= DIM
*jnrD
;
1510 /* load j atom coordinates */
1511 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1512 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1513 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1515 /* Calculate displacement vector */
1516 dx00
= _mm256_sub_pd(ix0
,jx0
);
1517 dy00
= _mm256_sub_pd(iy0
,jy0
);
1518 dz00
= _mm256_sub_pd(iz0
,jz0
);
1519 dx01
= _mm256_sub_pd(ix0
,jx1
);
1520 dy01
= _mm256_sub_pd(iy0
,jy1
);
1521 dz01
= _mm256_sub_pd(iz0
,jz1
);
1522 dx02
= _mm256_sub_pd(ix0
,jx2
);
1523 dy02
= _mm256_sub_pd(iy0
,jy2
);
1524 dz02
= _mm256_sub_pd(iz0
,jz2
);
1525 dx10
= _mm256_sub_pd(ix1
,jx0
);
1526 dy10
= _mm256_sub_pd(iy1
,jy0
);
1527 dz10
= _mm256_sub_pd(iz1
,jz0
);
1528 dx11
= _mm256_sub_pd(ix1
,jx1
);
1529 dy11
= _mm256_sub_pd(iy1
,jy1
);
1530 dz11
= _mm256_sub_pd(iz1
,jz1
);
1531 dx12
= _mm256_sub_pd(ix1
,jx2
);
1532 dy12
= _mm256_sub_pd(iy1
,jy2
);
1533 dz12
= _mm256_sub_pd(iz1
,jz2
);
1534 dx20
= _mm256_sub_pd(ix2
,jx0
);
1535 dy20
= _mm256_sub_pd(iy2
,jy0
);
1536 dz20
= _mm256_sub_pd(iz2
,jz0
);
1537 dx21
= _mm256_sub_pd(ix2
,jx1
);
1538 dy21
= _mm256_sub_pd(iy2
,jy1
);
1539 dz21
= _mm256_sub_pd(iz2
,jz1
);
1540 dx22
= _mm256_sub_pd(ix2
,jx2
);
1541 dy22
= _mm256_sub_pd(iy2
,jy2
);
1542 dz22
= _mm256_sub_pd(iz2
,jz2
);
1544 /* Calculate squared distance and things based on it */
1545 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1546 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
1547 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
1548 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
1549 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1550 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
1551 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
1552 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
1553 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
1555 rinv00
= avx256_invsqrt_d(rsq00
);
1556 rinv01
= avx256_invsqrt_d(rsq01
);
1557 rinv02
= avx256_invsqrt_d(rsq02
);
1558 rinv10
= avx256_invsqrt_d(rsq10
);
1559 rinv11
= avx256_invsqrt_d(rsq11
);
1560 rinv12
= avx256_invsqrt_d(rsq12
);
1561 rinv20
= avx256_invsqrt_d(rsq20
);
1562 rinv21
= avx256_invsqrt_d(rsq21
);
1563 rinv22
= avx256_invsqrt_d(rsq22
);
1565 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
1566 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
1567 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
1568 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
1569 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
1570 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
1571 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
1572 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
1573 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
1575 fjx0
= _mm256_setzero_pd();
1576 fjy0
= _mm256_setzero_pd();
1577 fjz0
= _mm256_setzero_pd();
1578 fjx1
= _mm256_setzero_pd();
1579 fjy1
= _mm256_setzero_pd();
1580 fjz1
= _mm256_setzero_pd();
1581 fjx2
= _mm256_setzero_pd();
1582 fjy2
= _mm256_setzero_pd();
1583 fjz2
= _mm256_setzero_pd();
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 r00
= _mm256_mul_pd(rsq00
,rinv00
);
1591 /* Calculate table index by multiplying r with table scale and truncate to integer */
1592 rt
= _mm256_mul_pd(r00
,vftabscale
);
1593 vfitab
= _mm256_cvttpd_epi32(rt
);
1594 vfeps
= _mm256_sub_pd(rt
,_mm256_round_pd(rt
, _MM_FROUND_FLOOR
));
1595 vfitab
= _mm_slli_epi32(vfitab
,3);
1597 /* EWALD ELECTROSTATICS */
1599 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1600 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
1601 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1602 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1603 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1604 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1606 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1607 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
1609 /* CUBIC SPLINE TABLE DISPERSION */
1610 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1611 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
1612 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
1613 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
1614 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
1615 Heps
= _mm256_mul_pd(vfeps
,H
);
1616 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
1617 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
1618 fvdw6
= _mm256_mul_pd(c6_00
,FF
);
1620 /* CUBIC SPLINE TABLE REPULSION */
1621 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1622 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
1623 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
1624 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
1625 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
1626 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
1627 Heps
= _mm256_mul_pd(vfeps
,H
);
1628 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
1629 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
1630 fvdw12
= _mm256_mul_pd(c12_00
,FF
);
1631 fvdw
= _mm256_xor_pd(signbit
,_mm256_mul_pd(_mm256_add_pd(fvdw6
,fvdw12
),_mm256_mul_pd(vftabscale
,rinv00
)));
1633 fscal
= _mm256_add_pd(felec
,fvdw
);
1635 /* Calculate temporary vectorial force */
1636 tx
= _mm256_mul_pd(fscal
,dx00
);
1637 ty
= _mm256_mul_pd(fscal
,dy00
);
1638 tz
= _mm256_mul_pd(fscal
,dz00
);
1640 /* Update vectorial force */
1641 fix0
= _mm256_add_pd(fix0
,tx
);
1642 fiy0
= _mm256_add_pd(fiy0
,ty
);
1643 fiz0
= _mm256_add_pd(fiz0
,tz
);
1645 fjx0
= _mm256_add_pd(fjx0
,tx
);
1646 fjy0
= _mm256_add_pd(fjy0
,ty
);
1647 fjz0
= _mm256_add_pd(fjz0
,tz
);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r01
= _mm256_mul_pd(rsq01
,rinv01
);
1655 /* EWALD ELECTROSTATICS */
1657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1658 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
1659 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1660 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1661 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1662 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1664 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1665 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
1669 /* Calculate temporary vectorial force */
1670 tx
= _mm256_mul_pd(fscal
,dx01
);
1671 ty
= _mm256_mul_pd(fscal
,dy01
);
1672 tz
= _mm256_mul_pd(fscal
,dz01
);
1674 /* Update vectorial force */
1675 fix0
= _mm256_add_pd(fix0
,tx
);
1676 fiy0
= _mm256_add_pd(fiy0
,ty
);
1677 fiz0
= _mm256_add_pd(fiz0
,tz
);
1679 fjx1
= _mm256_add_pd(fjx1
,tx
);
1680 fjy1
= _mm256_add_pd(fjy1
,ty
);
1681 fjz1
= _mm256_add_pd(fjz1
,tz
);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 r02
= _mm256_mul_pd(rsq02
,rinv02
);
1689 /* EWALD ELECTROSTATICS */
1691 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1692 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
1693 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1694 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1695 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1696 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1698 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1699 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
1703 /* Calculate temporary vectorial force */
1704 tx
= _mm256_mul_pd(fscal
,dx02
);
1705 ty
= _mm256_mul_pd(fscal
,dy02
);
1706 tz
= _mm256_mul_pd(fscal
,dz02
);
1708 /* Update vectorial force */
1709 fix0
= _mm256_add_pd(fix0
,tx
);
1710 fiy0
= _mm256_add_pd(fiy0
,ty
);
1711 fiz0
= _mm256_add_pd(fiz0
,tz
);
1713 fjx2
= _mm256_add_pd(fjx2
,tx
);
1714 fjy2
= _mm256_add_pd(fjy2
,ty
);
1715 fjz2
= _mm256_add_pd(fjz2
,tz
);
1717 /**************************
1718 * CALCULATE INTERACTIONS *
1719 **************************/
1721 r10
= _mm256_mul_pd(rsq10
,rinv10
);
1723 /* EWALD ELECTROSTATICS */
1725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1726 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
1727 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1728 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1729 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1730 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1732 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1733 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
1737 /* Calculate temporary vectorial force */
1738 tx
= _mm256_mul_pd(fscal
,dx10
);
1739 ty
= _mm256_mul_pd(fscal
,dy10
);
1740 tz
= _mm256_mul_pd(fscal
,dz10
);
1742 /* Update vectorial force */
1743 fix1
= _mm256_add_pd(fix1
,tx
);
1744 fiy1
= _mm256_add_pd(fiy1
,ty
);
1745 fiz1
= _mm256_add_pd(fiz1
,tz
);
1747 fjx0
= _mm256_add_pd(fjx0
,tx
);
1748 fjy0
= _mm256_add_pd(fjy0
,ty
);
1749 fjz0
= _mm256_add_pd(fjz0
,tz
);
1751 /**************************
1752 * CALCULATE INTERACTIONS *
1753 **************************/
1755 r11
= _mm256_mul_pd(rsq11
,rinv11
);
1757 /* EWALD ELECTROSTATICS */
1759 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1760 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
1761 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1762 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1763 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1764 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1766 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1767 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
1771 /* Calculate temporary vectorial force */
1772 tx
= _mm256_mul_pd(fscal
,dx11
);
1773 ty
= _mm256_mul_pd(fscal
,dy11
);
1774 tz
= _mm256_mul_pd(fscal
,dz11
);
1776 /* Update vectorial force */
1777 fix1
= _mm256_add_pd(fix1
,tx
);
1778 fiy1
= _mm256_add_pd(fiy1
,ty
);
1779 fiz1
= _mm256_add_pd(fiz1
,tz
);
1781 fjx1
= _mm256_add_pd(fjx1
,tx
);
1782 fjy1
= _mm256_add_pd(fjy1
,ty
);
1783 fjz1
= _mm256_add_pd(fjz1
,tz
);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r12
= _mm256_mul_pd(rsq12
,rinv12
);
1791 /* EWALD ELECTROSTATICS */
1793 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1794 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
1795 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1796 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1797 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1798 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1800 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1801 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1805 /* Calculate temporary vectorial force */
1806 tx
= _mm256_mul_pd(fscal
,dx12
);
1807 ty
= _mm256_mul_pd(fscal
,dy12
);
1808 tz
= _mm256_mul_pd(fscal
,dz12
);
1810 /* Update vectorial force */
1811 fix1
= _mm256_add_pd(fix1
,tx
);
1812 fiy1
= _mm256_add_pd(fiy1
,ty
);
1813 fiz1
= _mm256_add_pd(fiz1
,tz
);
1815 fjx2
= _mm256_add_pd(fjx2
,tx
);
1816 fjy2
= _mm256_add_pd(fjy2
,ty
);
1817 fjz2
= _mm256_add_pd(fjz2
,tz
);
1819 /**************************
1820 * CALCULATE INTERACTIONS *
1821 **************************/
1823 r20
= _mm256_mul_pd(rsq20
,rinv20
);
1825 /* EWALD ELECTROSTATICS */
1827 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1828 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
1829 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1830 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1831 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1832 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1834 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1835 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
1839 /* Calculate temporary vectorial force */
1840 tx
= _mm256_mul_pd(fscal
,dx20
);
1841 ty
= _mm256_mul_pd(fscal
,dy20
);
1842 tz
= _mm256_mul_pd(fscal
,dz20
);
1844 /* Update vectorial force */
1845 fix2
= _mm256_add_pd(fix2
,tx
);
1846 fiy2
= _mm256_add_pd(fiy2
,ty
);
1847 fiz2
= _mm256_add_pd(fiz2
,tz
);
1849 fjx0
= _mm256_add_pd(fjx0
,tx
);
1850 fjy0
= _mm256_add_pd(fjy0
,ty
);
1851 fjz0
= _mm256_add_pd(fjz0
,tz
);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1859 /* EWALD ELECTROSTATICS */
1861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1863 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1864 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1865 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1866 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1868 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1869 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1873 /* Calculate temporary vectorial force */
1874 tx
= _mm256_mul_pd(fscal
,dx21
);
1875 ty
= _mm256_mul_pd(fscal
,dy21
);
1876 tz
= _mm256_mul_pd(fscal
,dz21
);
1878 /* Update vectorial force */
1879 fix2
= _mm256_add_pd(fix2
,tx
);
1880 fiy2
= _mm256_add_pd(fiy2
,ty
);
1881 fiz2
= _mm256_add_pd(fiz2
,tz
);
1883 fjx1
= _mm256_add_pd(fjx1
,tx
);
1884 fjy1
= _mm256_add_pd(fjy1
,ty
);
1885 fjz1
= _mm256_add_pd(fjz1
,tz
);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1893 /* EWALD ELECTROSTATICS */
1895 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1896 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1897 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1898 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1899 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1900 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1902 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1903 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1907 /* Calculate temporary vectorial force */
1908 tx
= _mm256_mul_pd(fscal
,dx22
);
1909 ty
= _mm256_mul_pd(fscal
,dy22
);
1910 tz
= _mm256_mul_pd(fscal
,dz22
);
1912 /* Update vectorial force */
1913 fix2
= _mm256_add_pd(fix2
,tx
);
1914 fiy2
= _mm256_add_pd(fiy2
,ty
);
1915 fiz2
= _mm256_add_pd(fiz2
,tz
);
1917 fjx2
= _mm256_add_pd(fjx2
,tx
);
1918 fjy2
= _mm256_add_pd(fjy2
,ty
);
1919 fjz2
= _mm256_add_pd(fjz2
,tz
);
1921 fjptrA
= f
+j_coord_offsetA
;
1922 fjptrB
= f
+j_coord_offsetB
;
1923 fjptrC
= f
+j_coord_offsetC
;
1924 fjptrD
= f
+j_coord_offsetD
;
1926 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1927 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1929 /* Inner loop uses 350 flops */
1932 if(jidx
<j_index_end
)
1935 /* Get j neighbor index, and coordinate index */
1936 jnrlistA
= jjnr
[jidx
];
1937 jnrlistB
= jjnr
[jidx
+1];
1938 jnrlistC
= jjnr
[jidx
+2];
1939 jnrlistD
= jjnr
[jidx
+3];
1940 /* Sign of each element will be negative for non-real atoms.
1941 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1942 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1944 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1946 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
1947 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
1948 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
1950 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1951 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1952 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1953 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1954 j_coord_offsetA
= DIM
*jnrA
;
1955 j_coord_offsetB
= DIM
*jnrB
;
1956 j_coord_offsetC
= DIM
*jnrC
;
1957 j_coord_offsetD
= DIM
*jnrD
;
1959 /* load j atom coordinates */
1960 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1961 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1962 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1964 /* Calculate displacement vector */
1965 dx00
= _mm256_sub_pd(ix0
,jx0
);
1966 dy00
= _mm256_sub_pd(iy0
,jy0
);
1967 dz00
= _mm256_sub_pd(iz0
,jz0
);
1968 dx01
= _mm256_sub_pd(ix0
,jx1
);
1969 dy01
= _mm256_sub_pd(iy0
,jy1
);
1970 dz01
= _mm256_sub_pd(iz0
,jz1
);
1971 dx02
= _mm256_sub_pd(ix0
,jx2
);
1972 dy02
= _mm256_sub_pd(iy0
,jy2
);
1973 dz02
= _mm256_sub_pd(iz0
,jz2
);
1974 dx10
= _mm256_sub_pd(ix1
,jx0
);
1975 dy10
= _mm256_sub_pd(iy1
,jy0
);
1976 dz10
= _mm256_sub_pd(iz1
,jz0
);
1977 dx11
= _mm256_sub_pd(ix1
,jx1
);
1978 dy11
= _mm256_sub_pd(iy1
,jy1
);
1979 dz11
= _mm256_sub_pd(iz1
,jz1
);
1980 dx12
= _mm256_sub_pd(ix1
,jx2
);
1981 dy12
= _mm256_sub_pd(iy1
,jy2
);
1982 dz12
= _mm256_sub_pd(iz1
,jz2
);
1983 dx20
= _mm256_sub_pd(ix2
,jx0
);
1984 dy20
= _mm256_sub_pd(iy2
,jy0
);
1985 dz20
= _mm256_sub_pd(iz2
,jz0
);
1986 dx21
= _mm256_sub_pd(ix2
,jx1
);
1987 dy21
= _mm256_sub_pd(iy2
,jy1
);
1988 dz21
= _mm256_sub_pd(iz2
,jz1
);
1989 dx22
= _mm256_sub_pd(ix2
,jx2
);
1990 dy22
= _mm256_sub_pd(iy2
,jy2
);
1991 dz22
= _mm256_sub_pd(iz2
,jz2
);
1993 /* Calculate squared distance and things based on it */
1994 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1995 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
1996 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
1997 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
1998 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1999 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
2000 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
2001 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
2002 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
2004 rinv00
= avx256_invsqrt_d(rsq00
);
2005 rinv01
= avx256_invsqrt_d(rsq01
);
2006 rinv02
= avx256_invsqrt_d(rsq02
);
2007 rinv10
= avx256_invsqrt_d(rsq10
);
2008 rinv11
= avx256_invsqrt_d(rsq11
);
2009 rinv12
= avx256_invsqrt_d(rsq12
);
2010 rinv20
= avx256_invsqrt_d(rsq20
);
2011 rinv21
= avx256_invsqrt_d(rsq21
);
2012 rinv22
= avx256_invsqrt_d(rsq22
);
2014 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
2015 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
2016 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
2017 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
2018 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
2019 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
2020 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
2021 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
2022 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
2024 fjx0
= _mm256_setzero_pd();
2025 fjy0
= _mm256_setzero_pd();
2026 fjz0
= _mm256_setzero_pd();
2027 fjx1
= _mm256_setzero_pd();
2028 fjy1
= _mm256_setzero_pd();
2029 fjz1
= _mm256_setzero_pd();
2030 fjx2
= _mm256_setzero_pd();
2031 fjy2
= _mm256_setzero_pd();
2032 fjz2
= _mm256_setzero_pd();
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r00
= _mm256_mul_pd(rsq00
,rinv00
);
2039 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
2041 /* Calculate table index by multiplying r with table scale and truncate to integer */
2042 rt
= _mm256_mul_pd(r00
,vftabscale
);
2043 vfitab
= _mm256_cvttpd_epi32(rt
);
2044 vfeps
= _mm256_sub_pd(rt
,_mm256_round_pd(rt
, _MM_FROUND_FLOOR
));
2045 vfitab
= _mm_slli_epi32(vfitab
,3);
2047 /* EWALD ELECTROSTATICS */
2049 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2050 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
2051 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2052 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2053 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2054 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2056 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2057 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
2059 /* CUBIC SPLINE TABLE DISPERSION */
2060 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
2061 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
2062 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
2063 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
2064 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
2065 Heps
= _mm256_mul_pd(vfeps
,H
);
2066 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
2067 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
2068 fvdw6
= _mm256_mul_pd(c6_00
,FF
);
2070 /* CUBIC SPLINE TABLE REPULSION */
2071 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2072 Y
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,0) );
2073 F
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,1) );
2074 G
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,2) );
2075 H
= _mm256_load_pd( vftab
+ _mm_extract_epi32(vfitab
,3) );
2076 GMX_MM256_FULLTRANSPOSE4_PD(Y
,F
,G
,H
);
2077 Heps
= _mm256_mul_pd(vfeps
,H
);
2078 Fp
= _mm256_add_pd(F
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,Heps
)));
2079 FF
= _mm256_add_pd(Fp
,_mm256_mul_pd(vfeps
,_mm256_add_pd(G
,_mm256_add_pd(Heps
,Heps
))));
2080 fvdw12
= _mm256_mul_pd(c12_00
,FF
);
2081 fvdw
= _mm256_xor_pd(signbit
,_mm256_mul_pd(_mm256_add_pd(fvdw6
,fvdw12
),_mm256_mul_pd(vftabscale
,rinv00
)));
2083 fscal
= _mm256_add_pd(felec
,fvdw
);
2085 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2087 /* Calculate temporary vectorial force */
2088 tx
= _mm256_mul_pd(fscal
,dx00
);
2089 ty
= _mm256_mul_pd(fscal
,dy00
);
2090 tz
= _mm256_mul_pd(fscal
,dz00
);
2092 /* Update vectorial force */
2093 fix0
= _mm256_add_pd(fix0
,tx
);
2094 fiy0
= _mm256_add_pd(fiy0
,ty
);
2095 fiz0
= _mm256_add_pd(fiz0
,tz
);
2097 fjx0
= _mm256_add_pd(fjx0
,tx
);
2098 fjy0
= _mm256_add_pd(fjy0
,ty
);
2099 fjz0
= _mm256_add_pd(fjz0
,tz
);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 r01
= _mm256_mul_pd(rsq01
,rinv01
);
2106 r01
= _mm256_andnot_pd(dummy_mask
,r01
);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
2112 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2113 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2114 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2115 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2117 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2118 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
2122 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2124 /* Calculate temporary vectorial force */
2125 tx
= _mm256_mul_pd(fscal
,dx01
);
2126 ty
= _mm256_mul_pd(fscal
,dy01
);
2127 tz
= _mm256_mul_pd(fscal
,dz01
);
2129 /* Update vectorial force */
2130 fix0
= _mm256_add_pd(fix0
,tx
);
2131 fiy0
= _mm256_add_pd(fiy0
,ty
);
2132 fiz0
= _mm256_add_pd(fiz0
,tz
);
2134 fjx1
= _mm256_add_pd(fjx1
,tx
);
2135 fjy1
= _mm256_add_pd(fjy1
,ty
);
2136 fjz1
= _mm256_add_pd(fjz1
,tz
);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r02
= _mm256_mul_pd(rsq02
,rinv02
);
2143 r02
= _mm256_andnot_pd(dummy_mask
,r02
);
2145 /* EWALD ELECTROSTATICS */
2147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
2149 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2150 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2151 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2152 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2154 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2155 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
2159 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2161 /* Calculate temporary vectorial force */
2162 tx
= _mm256_mul_pd(fscal
,dx02
);
2163 ty
= _mm256_mul_pd(fscal
,dy02
);
2164 tz
= _mm256_mul_pd(fscal
,dz02
);
2166 /* Update vectorial force */
2167 fix0
= _mm256_add_pd(fix0
,tx
);
2168 fiy0
= _mm256_add_pd(fiy0
,ty
);
2169 fiz0
= _mm256_add_pd(fiz0
,tz
);
2171 fjx2
= _mm256_add_pd(fjx2
,tx
);
2172 fjy2
= _mm256_add_pd(fjy2
,ty
);
2173 fjz2
= _mm256_add_pd(fjz2
,tz
);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r10
= _mm256_mul_pd(rsq10
,rinv10
);
2180 r10
= _mm256_andnot_pd(dummy_mask
,r10
);
2182 /* EWALD ELECTROSTATICS */
2184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2185 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
2186 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2187 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2188 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2189 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2191 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2192 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
2196 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2198 /* Calculate temporary vectorial force */
2199 tx
= _mm256_mul_pd(fscal
,dx10
);
2200 ty
= _mm256_mul_pd(fscal
,dy10
);
2201 tz
= _mm256_mul_pd(fscal
,dz10
);
2203 /* Update vectorial force */
2204 fix1
= _mm256_add_pd(fix1
,tx
);
2205 fiy1
= _mm256_add_pd(fiy1
,ty
);
2206 fiz1
= _mm256_add_pd(fiz1
,tz
);
2208 fjx0
= _mm256_add_pd(fjx0
,tx
);
2209 fjy0
= _mm256_add_pd(fjy0
,ty
);
2210 fjz0
= _mm256_add_pd(fjz0
,tz
);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 r11
= _mm256_mul_pd(rsq11
,rinv11
);
2217 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
2219 /* EWALD ELECTROSTATICS */
2221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
2223 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2224 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2225 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2226 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2228 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2229 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
2233 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2235 /* Calculate temporary vectorial force */
2236 tx
= _mm256_mul_pd(fscal
,dx11
);
2237 ty
= _mm256_mul_pd(fscal
,dy11
);
2238 tz
= _mm256_mul_pd(fscal
,dz11
);
2240 /* Update vectorial force */
2241 fix1
= _mm256_add_pd(fix1
,tx
);
2242 fiy1
= _mm256_add_pd(fiy1
,ty
);
2243 fiz1
= _mm256_add_pd(fiz1
,tz
);
2245 fjx1
= _mm256_add_pd(fjx1
,tx
);
2246 fjy1
= _mm256_add_pd(fjy1
,ty
);
2247 fjz1
= _mm256_add_pd(fjz1
,tz
);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 r12
= _mm256_mul_pd(rsq12
,rinv12
);
2254 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
2256 /* EWALD ELECTROSTATICS */
2258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2259 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
2260 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2261 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2262 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2263 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2265 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2266 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
2270 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2272 /* Calculate temporary vectorial force */
2273 tx
= _mm256_mul_pd(fscal
,dx12
);
2274 ty
= _mm256_mul_pd(fscal
,dy12
);
2275 tz
= _mm256_mul_pd(fscal
,dz12
);
2277 /* Update vectorial force */
2278 fix1
= _mm256_add_pd(fix1
,tx
);
2279 fiy1
= _mm256_add_pd(fiy1
,ty
);
2280 fiz1
= _mm256_add_pd(fiz1
,tz
);
2282 fjx2
= _mm256_add_pd(fjx2
,tx
);
2283 fjy2
= _mm256_add_pd(fjy2
,ty
);
2284 fjz2
= _mm256_add_pd(fjz2
,tz
);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r20
= _mm256_mul_pd(rsq20
,rinv20
);
2291 r20
= _mm256_andnot_pd(dummy_mask
,r20
);
2293 /* EWALD ELECTROSTATICS */
2295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
2297 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2298 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2299 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2300 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2302 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2303 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
2307 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2309 /* Calculate temporary vectorial force */
2310 tx
= _mm256_mul_pd(fscal
,dx20
);
2311 ty
= _mm256_mul_pd(fscal
,dy20
);
2312 tz
= _mm256_mul_pd(fscal
,dz20
);
2314 /* Update vectorial force */
2315 fix2
= _mm256_add_pd(fix2
,tx
);
2316 fiy2
= _mm256_add_pd(fiy2
,ty
);
2317 fiz2
= _mm256_add_pd(fiz2
,tz
);
2319 fjx0
= _mm256_add_pd(fjx0
,tx
);
2320 fjy0
= _mm256_add_pd(fjy0
,ty
);
2321 fjz0
= _mm256_add_pd(fjz0
,tz
);
2323 /**************************
2324 * CALCULATE INTERACTIONS *
2325 **************************/
2327 r21
= _mm256_mul_pd(rsq21
,rinv21
);
2328 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
2330 /* EWALD ELECTROSTATICS */
2332 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2333 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
2334 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2335 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2336 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2337 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2339 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2340 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
2344 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2346 /* Calculate temporary vectorial force */
2347 tx
= _mm256_mul_pd(fscal
,dx21
);
2348 ty
= _mm256_mul_pd(fscal
,dy21
);
2349 tz
= _mm256_mul_pd(fscal
,dz21
);
2351 /* Update vectorial force */
2352 fix2
= _mm256_add_pd(fix2
,tx
);
2353 fiy2
= _mm256_add_pd(fiy2
,ty
);
2354 fiz2
= _mm256_add_pd(fiz2
,tz
);
2356 fjx1
= _mm256_add_pd(fjx1
,tx
);
2357 fjy1
= _mm256_add_pd(fjy1
,ty
);
2358 fjz1
= _mm256_add_pd(fjz1
,tz
);
2360 /**************************
2361 * CALCULATE INTERACTIONS *
2362 **************************/
2364 r22
= _mm256_mul_pd(rsq22
,rinv22
);
2365 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
2367 /* EWALD ELECTROSTATICS */
2369 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2370 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
2371 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2372 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2373 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2374 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2376 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2377 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
2381 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2383 /* Calculate temporary vectorial force */
2384 tx
= _mm256_mul_pd(fscal
,dx22
);
2385 ty
= _mm256_mul_pd(fscal
,dy22
);
2386 tz
= _mm256_mul_pd(fscal
,dz22
);
2388 /* Update vectorial force */
2389 fix2
= _mm256_add_pd(fix2
,tx
);
2390 fiy2
= _mm256_add_pd(fiy2
,ty
);
2391 fiz2
= _mm256_add_pd(fiz2
,tz
);
2393 fjx2
= _mm256_add_pd(fjx2
,tx
);
2394 fjy2
= _mm256_add_pd(fjy2
,ty
);
2395 fjz2
= _mm256_add_pd(fjz2
,tz
);
2397 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2398 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2399 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2400 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2402 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2403 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2405 /* Inner loop uses 359 flops */
2408 /* End of innermost loop */
2410 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2411 f
+i_coord_offset
,fshift
+i_shift_offset
);
2413 /* Increment number of inner iterations */
2414 inneriter
+= j_index_end
- j_index_start
;
2416 /* Outer loop uses 18 flops */
2419 /* Increment number of outer iterations */
2422 /* Update outer/inner flops */
2424 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*359);