2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
76 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
82 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 real
* vdwioffsetptr0
;
84 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 real
* vdwioffsetptr1
;
86 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 real
* vdwioffsetptr2
;
88 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 real
* vdwioffsetptr3
;
90 __m256d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
91 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
92 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
93 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
94 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
95 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
96 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
97 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
98 __m256d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
99 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
100 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m256d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
103 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m256d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
106 __m256d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
107 __m256d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
108 __m256d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
109 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
112 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
115 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
116 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
118 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
119 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
121 __m256d dummy_mask
,cutoff_mask
;
122 __m128 tmpmask0
,tmpmask1
;
123 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
124 __m256d one
= _mm256_set1_pd(1.0);
125 __m256d two
= _mm256_set1_pd(2.0);
131 jindex
= nlist
->jindex
;
133 shiftidx
= nlist
->shift
;
135 shiftvec
= fr
->shift_vec
[0];
136 fshift
= fr
->fshift
[0];
137 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
138 charge
= mdatoms
->chargeA
;
139 nvdwtype
= fr
->ntype
;
141 vdwtype
= mdatoms
->typeA
;
143 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
144 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
145 beta2
= _mm256_mul_pd(beta
,beta
);
146 beta3
= _mm256_mul_pd(beta
,beta2
);
148 ewtab
= fr
->ic
->tabq_coul_FDV0
;
149 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
150 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
152 /* Setup water-specific parameters */
153 inr
= nlist
->iinr
[0];
154 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
155 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
156 iq3
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+3]));
157 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
159 jq1
= _mm256_set1_pd(charge
[inr
+1]);
160 jq2
= _mm256_set1_pd(charge
[inr
+2]);
161 jq3
= _mm256_set1_pd(charge
[inr
+3]);
162 vdwjidx0A
= 2*vdwtype
[inr
+0];
163 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
164 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
165 qq11
= _mm256_mul_pd(iq1
,jq1
);
166 qq12
= _mm256_mul_pd(iq1
,jq2
);
167 qq13
= _mm256_mul_pd(iq1
,jq3
);
168 qq21
= _mm256_mul_pd(iq2
,jq1
);
169 qq22
= _mm256_mul_pd(iq2
,jq2
);
170 qq23
= _mm256_mul_pd(iq2
,jq3
);
171 qq31
= _mm256_mul_pd(iq3
,jq1
);
172 qq32
= _mm256_mul_pd(iq3
,jq2
);
173 qq33
= _mm256_mul_pd(iq3
,jq3
);
175 /* Avoid stupid compiler warnings */
176 jnrA
= jnrB
= jnrC
= jnrD
= 0;
185 for(iidx
=0;iidx
<4*DIM
;iidx
++)
190 /* Start outer loop over neighborlists */
191 for(iidx
=0; iidx
<nri
; iidx
++)
193 /* Load shift vector for this list */
194 i_shift_offset
= DIM
*shiftidx
[iidx
];
196 /* Load limits for loop over neighbors */
197 j_index_start
= jindex
[iidx
];
198 j_index_end
= jindex
[iidx
+1];
200 /* Get outer coordinate index */
202 i_coord_offset
= DIM
*inr
;
204 /* Load i particle coords and add shift vector */
205 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
206 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
208 fix0
= _mm256_setzero_pd();
209 fiy0
= _mm256_setzero_pd();
210 fiz0
= _mm256_setzero_pd();
211 fix1
= _mm256_setzero_pd();
212 fiy1
= _mm256_setzero_pd();
213 fiz1
= _mm256_setzero_pd();
214 fix2
= _mm256_setzero_pd();
215 fiy2
= _mm256_setzero_pd();
216 fiz2
= _mm256_setzero_pd();
217 fix3
= _mm256_setzero_pd();
218 fiy3
= _mm256_setzero_pd();
219 fiz3
= _mm256_setzero_pd();
221 /* Reset potential sums */
222 velecsum
= _mm256_setzero_pd();
223 vvdwsum
= _mm256_setzero_pd();
225 /* Start inner kernel loop */
226 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
229 /* Get j neighbor index, and coordinate index */
234 j_coord_offsetA
= DIM
*jnrA
;
235 j_coord_offsetB
= DIM
*jnrB
;
236 j_coord_offsetC
= DIM
*jnrC
;
237 j_coord_offsetD
= DIM
*jnrD
;
239 /* load j atom coordinates */
240 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
241 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
242 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
243 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
245 /* Calculate displacement vector */
246 dx00
= _mm256_sub_pd(ix0
,jx0
);
247 dy00
= _mm256_sub_pd(iy0
,jy0
);
248 dz00
= _mm256_sub_pd(iz0
,jz0
);
249 dx11
= _mm256_sub_pd(ix1
,jx1
);
250 dy11
= _mm256_sub_pd(iy1
,jy1
);
251 dz11
= _mm256_sub_pd(iz1
,jz1
);
252 dx12
= _mm256_sub_pd(ix1
,jx2
);
253 dy12
= _mm256_sub_pd(iy1
,jy2
);
254 dz12
= _mm256_sub_pd(iz1
,jz2
);
255 dx13
= _mm256_sub_pd(ix1
,jx3
);
256 dy13
= _mm256_sub_pd(iy1
,jy3
);
257 dz13
= _mm256_sub_pd(iz1
,jz3
);
258 dx21
= _mm256_sub_pd(ix2
,jx1
);
259 dy21
= _mm256_sub_pd(iy2
,jy1
);
260 dz21
= _mm256_sub_pd(iz2
,jz1
);
261 dx22
= _mm256_sub_pd(ix2
,jx2
);
262 dy22
= _mm256_sub_pd(iy2
,jy2
);
263 dz22
= _mm256_sub_pd(iz2
,jz2
);
264 dx23
= _mm256_sub_pd(ix2
,jx3
);
265 dy23
= _mm256_sub_pd(iy2
,jy3
);
266 dz23
= _mm256_sub_pd(iz2
,jz3
);
267 dx31
= _mm256_sub_pd(ix3
,jx1
);
268 dy31
= _mm256_sub_pd(iy3
,jy1
);
269 dz31
= _mm256_sub_pd(iz3
,jz1
);
270 dx32
= _mm256_sub_pd(ix3
,jx2
);
271 dy32
= _mm256_sub_pd(iy3
,jy2
);
272 dz32
= _mm256_sub_pd(iz3
,jz2
);
273 dx33
= _mm256_sub_pd(ix3
,jx3
);
274 dy33
= _mm256_sub_pd(iy3
,jy3
);
275 dz33
= _mm256_sub_pd(iz3
,jz3
);
277 /* Calculate squared distance and things based on it */
278 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
279 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
280 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
281 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
282 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
283 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
284 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
285 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
286 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
287 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
289 rinv11
= avx256_invsqrt_d(rsq11
);
290 rinv12
= avx256_invsqrt_d(rsq12
);
291 rinv13
= avx256_invsqrt_d(rsq13
);
292 rinv21
= avx256_invsqrt_d(rsq21
);
293 rinv22
= avx256_invsqrt_d(rsq22
);
294 rinv23
= avx256_invsqrt_d(rsq23
);
295 rinv31
= avx256_invsqrt_d(rsq31
);
296 rinv32
= avx256_invsqrt_d(rsq32
);
297 rinv33
= avx256_invsqrt_d(rsq33
);
299 rinvsq00
= avx256_inv_d(rsq00
);
300 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
301 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
302 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
303 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
304 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
305 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
306 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
307 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
308 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
310 fjx0
= _mm256_setzero_pd();
311 fjy0
= _mm256_setzero_pd();
312 fjz0
= _mm256_setzero_pd();
313 fjx1
= _mm256_setzero_pd();
314 fjy1
= _mm256_setzero_pd();
315 fjz1
= _mm256_setzero_pd();
316 fjx2
= _mm256_setzero_pd();
317 fjy2
= _mm256_setzero_pd();
318 fjz2
= _mm256_setzero_pd();
319 fjx3
= _mm256_setzero_pd();
320 fjy3
= _mm256_setzero_pd();
321 fjz3
= _mm256_setzero_pd();
323 /**************************
324 * CALCULATE INTERACTIONS *
325 **************************/
327 /* LENNARD-JONES DISPERSION/REPULSION */
329 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
330 vvdw6
= _mm256_mul_pd(c6_00
,rinvsix
);
331 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
332 vvdw
= _mm256_sub_pd( _mm256_mul_pd(vvdw12
,one_twelfth
) , _mm256_mul_pd(vvdw6
,one_sixth
) );
333 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
340 /* Calculate temporary vectorial force */
341 tx
= _mm256_mul_pd(fscal
,dx00
);
342 ty
= _mm256_mul_pd(fscal
,dy00
);
343 tz
= _mm256_mul_pd(fscal
,dz00
);
345 /* Update vectorial force */
346 fix0
= _mm256_add_pd(fix0
,tx
);
347 fiy0
= _mm256_add_pd(fiy0
,ty
);
348 fiz0
= _mm256_add_pd(fiz0
,tz
);
350 fjx0
= _mm256_add_pd(fjx0
,tx
);
351 fjy0
= _mm256_add_pd(fjy0
,ty
);
352 fjz0
= _mm256_add_pd(fjz0
,tz
);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 r11
= _mm256_mul_pd(rsq11
,rinv11
);
360 /* EWALD ELECTROSTATICS */
362 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
363 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
364 ewitab
= _mm256_cvttpd_epi32(ewrt
);
365 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
366 ewitab
= _mm_slli_epi32(ewitab
,2);
367 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
368 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
369 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
370 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
371 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
372 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
373 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
374 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(rinv11
,velec
));
375 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velecsum
= _mm256_add_pd(velecsum
,velec
);
382 /* Calculate temporary vectorial force */
383 tx
= _mm256_mul_pd(fscal
,dx11
);
384 ty
= _mm256_mul_pd(fscal
,dy11
);
385 tz
= _mm256_mul_pd(fscal
,dz11
);
387 /* Update vectorial force */
388 fix1
= _mm256_add_pd(fix1
,tx
);
389 fiy1
= _mm256_add_pd(fiy1
,ty
);
390 fiz1
= _mm256_add_pd(fiz1
,tz
);
392 fjx1
= _mm256_add_pd(fjx1
,tx
);
393 fjy1
= _mm256_add_pd(fjy1
,ty
);
394 fjz1
= _mm256_add_pd(fjz1
,tz
);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 r12
= _mm256_mul_pd(rsq12
,rinv12
);
402 /* EWALD ELECTROSTATICS */
404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
406 ewitab
= _mm256_cvttpd_epi32(ewrt
);
407 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
408 ewitab
= _mm_slli_epi32(ewitab
,2);
409 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
410 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
411 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
412 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
413 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
414 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
415 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
416 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(rinv12
,velec
));
417 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum
= _mm256_add_pd(velecsum
,velec
);
424 /* Calculate temporary vectorial force */
425 tx
= _mm256_mul_pd(fscal
,dx12
);
426 ty
= _mm256_mul_pd(fscal
,dy12
);
427 tz
= _mm256_mul_pd(fscal
,dz12
);
429 /* Update vectorial force */
430 fix1
= _mm256_add_pd(fix1
,tx
);
431 fiy1
= _mm256_add_pd(fiy1
,ty
);
432 fiz1
= _mm256_add_pd(fiz1
,tz
);
434 fjx2
= _mm256_add_pd(fjx2
,tx
);
435 fjy2
= _mm256_add_pd(fjy2
,ty
);
436 fjz2
= _mm256_add_pd(fjz2
,tz
);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 r13
= _mm256_mul_pd(rsq13
,rinv13
);
444 /* EWALD ELECTROSTATICS */
446 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
447 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
448 ewitab
= _mm256_cvttpd_epi32(ewrt
);
449 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
450 ewitab
= _mm_slli_epi32(ewitab
,2);
451 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
452 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
453 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
454 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
455 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
456 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
457 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
458 velec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(rinv13
,velec
));
459 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum
= _mm256_add_pd(velecsum
,velec
);
466 /* Calculate temporary vectorial force */
467 tx
= _mm256_mul_pd(fscal
,dx13
);
468 ty
= _mm256_mul_pd(fscal
,dy13
);
469 tz
= _mm256_mul_pd(fscal
,dz13
);
471 /* Update vectorial force */
472 fix1
= _mm256_add_pd(fix1
,tx
);
473 fiy1
= _mm256_add_pd(fiy1
,ty
);
474 fiz1
= _mm256_add_pd(fiz1
,tz
);
476 fjx3
= _mm256_add_pd(fjx3
,tx
);
477 fjy3
= _mm256_add_pd(fjy3
,ty
);
478 fjz3
= _mm256_add_pd(fjz3
,tz
);
480 /**************************
481 * CALCULATE INTERACTIONS *
482 **************************/
484 r21
= _mm256_mul_pd(rsq21
,rinv21
);
486 /* EWALD ELECTROSTATICS */
488 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
489 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
490 ewitab
= _mm256_cvttpd_epi32(ewrt
);
491 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
492 ewitab
= _mm_slli_epi32(ewitab
,2);
493 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
494 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
495 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
496 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
497 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
498 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
499 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
500 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(rinv21
,velec
));
501 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
503 /* Update potential sum for this i atom from the interaction with this j atom. */
504 velecsum
= _mm256_add_pd(velecsum
,velec
);
508 /* Calculate temporary vectorial force */
509 tx
= _mm256_mul_pd(fscal
,dx21
);
510 ty
= _mm256_mul_pd(fscal
,dy21
);
511 tz
= _mm256_mul_pd(fscal
,dz21
);
513 /* Update vectorial force */
514 fix2
= _mm256_add_pd(fix2
,tx
);
515 fiy2
= _mm256_add_pd(fiy2
,ty
);
516 fiz2
= _mm256_add_pd(fiz2
,tz
);
518 fjx1
= _mm256_add_pd(fjx1
,tx
);
519 fjy1
= _mm256_add_pd(fjy1
,ty
);
520 fjz1
= _mm256_add_pd(fjz1
,tz
);
522 /**************************
523 * CALCULATE INTERACTIONS *
524 **************************/
526 r22
= _mm256_mul_pd(rsq22
,rinv22
);
528 /* EWALD ELECTROSTATICS */
530 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
532 ewitab
= _mm256_cvttpd_epi32(ewrt
);
533 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
534 ewitab
= _mm_slli_epi32(ewitab
,2);
535 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
536 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
537 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
538 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
539 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
540 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
541 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
542 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(rinv22
,velec
));
543 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velecsum
= _mm256_add_pd(velecsum
,velec
);
550 /* Calculate temporary vectorial force */
551 tx
= _mm256_mul_pd(fscal
,dx22
);
552 ty
= _mm256_mul_pd(fscal
,dy22
);
553 tz
= _mm256_mul_pd(fscal
,dz22
);
555 /* Update vectorial force */
556 fix2
= _mm256_add_pd(fix2
,tx
);
557 fiy2
= _mm256_add_pd(fiy2
,ty
);
558 fiz2
= _mm256_add_pd(fiz2
,tz
);
560 fjx2
= _mm256_add_pd(fjx2
,tx
);
561 fjy2
= _mm256_add_pd(fjy2
,ty
);
562 fjz2
= _mm256_add_pd(fjz2
,tz
);
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
568 r23
= _mm256_mul_pd(rsq23
,rinv23
);
570 /* EWALD ELECTROSTATICS */
572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
573 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
574 ewitab
= _mm256_cvttpd_epi32(ewrt
);
575 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
576 ewitab
= _mm_slli_epi32(ewitab
,2);
577 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
578 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
579 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
580 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
581 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
582 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
583 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
584 velec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(rinv23
,velec
));
585 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
587 /* Update potential sum for this i atom from the interaction with this j atom. */
588 velecsum
= _mm256_add_pd(velecsum
,velec
);
592 /* Calculate temporary vectorial force */
593 tx
= _mm256_mul_pd(fscal
,dx23
);
594 ty
= _mm256_mul_pd(fscal
,dy23
);
595 tz
= _mm256_mul_pd(fscal
,dz23
);
597 /* Update vectorial force */
598 fix2
= _mm256_add_pd(fix2
,tx
);
599 fiy2
= _mm256_add_pd(fiy2
,ty
);
600 fiz2
= _mm256_add_pd(fiz2
,tz
);
602 fjx3
= _mm256_add_pd(fjx3
,tx
);
603 fjy3
= _mm256_add_pd(fjy3
,ty
);
604 fjz3
= _mm256_add_pd(fjz3
,tz
);
606 /**************************
607 * CALCULATE INTERACTIONS *
608 **************************/
610 r31
= _mm256_mul_pd(rsq31
,rinv31
);
612 /* EWALD ELECTROSTATICS */
614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
615 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
616 ewitab
= _mm256_cvttpd_epi32(ewrt
);
617 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
618 ewitab
= _mm_slli_epi32(ewitab
,2);
619 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
620 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
621 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
622 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
623 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
624 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
625 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
626 velec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(rinv31
,velec
));
627 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velecsum
= _mm256_add_pd(velecsum
,velec
);
634 /* Calculate temporary vectorial force */
635 tx
= _mm256_mul_pd(fscal
,dx31
);
636 ty
= _mm256_mul_pd(fscal
,dy31
);
637 tz
= _mm256_mul_pd(fscal
,dz31
);
639 /* Update vectorial force */
640 fix3
= _mm256_add_pd(fix3
,tx
);
641 fiy3
= _mm256_add_pd(fiy3
,ty
);
642 fiz3
= _mm256_add_pd(fiz3
,tz
);
644 fjx1
= _mm256_add_pd(fjx1
,tx
);
645 fjy1
= _mm256_add_pd(fjy1
,ty
);
646 fjz1
= _mm256_add_pd(fjz1
,tz
);
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 r32
= _mm256_mul_pd(rsq32
,rinv32
);
654 /* EWALD ELECTROSTATICS */
656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
658 ewitab
= _mm256_cvttpd_epi32(ewrt
);
659 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
660 ewitab
= _mm_slli_epi32(ewitab
,2);
661 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
662 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
663 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
664 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
665 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
666 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
667 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
668 velec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(rinv32
,velec
));
669 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
671 /* Update potential sum for this i atom from the interaction with this j atom. */
672 velecsum
= _mm256_add_pd(velecsum
,velec
);
676 /* Calculate temporary vectorial force */
677 tx
= _mm256_mul_pd(fscal
,dx32
);
678 ty
= _mm256_mul_pd(fscal
,dy32
);
679 tz
= _mm256_mul_pd(fscal
,dz32
);
681 /* Update vectorial force */
682 fix3
= _mm256_add_pd(fix3
,tx
);
683 fiy3
= _mm256_add_pd(fiy3
,ty
);
684 fiz3
= _mm256_add_pd(fiz3
,tz
);
686 fjx2
= _mm256_add_pd(fjx2
,tx
);
687 fjy2
= _mm256_add_pd(fjy2
,ty
);
688 fjz2
= _mm256_add_pd(fjz2
,tz
);
690 /**************************
691 * CALCULATE INTERACTIONS *
692 **************************/
694 r33
= _mm256_mul_pd(rsq33
,rinv33
);
696 /* EWALD ELECTROSTATICS */
698 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
699 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
700 ewitab
= _mm256_cvttpd_epi32(ewrt
);
701 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
702 ewitab
= _mm_slli_epi32(ewitab
,2);
703 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
704 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
705 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
706 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
707 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
708 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
709 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
710 velec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(rinv33
,velec
));
711 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
713 /* Update potential sum for this i atom from the interaction with this j atom. */
714 velecsum
= _mm256_add_pd(velecsum
,velec
);
718 /* Calculate temporary vectorial force */
719 tx
= _mm256_mul_pd(fscal
,dx33
);
720 ty
= _mm256_mul_pd(fscal
,dy33
);
721 tz
= _mm256_mul_pd(fscal
,dz33
);
723 /* Update vectorial force */
724 fix3
= _mm256_add_pd(fix3
,tx
);
725 fiy3
= _mm256_add_pd(fiy3
,ty
);
726 fiz3
= _mm256_add_pd(fiz3
,tz
);
728 fjx3
= _mm256_add_pd(fjx3
,tx
);
729 fjy3
= _mm256_add_pd(fjy3
,ty
);
730 fjz3
= _mm256_add_pd(fjz3
,tz
);
732 fjptrA
= f
+j_coord_offsetA
;
733 fjptrB
= f
+j_coord_offsetB
;
734 fjptrC
= f
+j_coord_offsetC
;
735 fjptrD
= f
+j_coord_offsetD
;
737 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
738 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
739 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
741 /* Inner loop uses 404 flops */
747 /* Get j neighbor index, and coordinate index */
748 jnrlistA
= jjnr
[jidx
];
749 jnrlistB
= jjnr
[jidx
+1];
750 jnrlistC
= jjnr
[jidx
+2];
751 jnrlistD
= jjnr
[jidx
+3];
752 /* Sign of each element will be negative for non-real atoms.
753 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
754 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
756 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
758 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
759 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
760 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
762 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
763 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
764 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
765 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
766 j_coord_offsetA
= DIM
*jnrA
;
767 j_coord_offsetB
= DIM
*jnrB
;
768 j_coord_offsetC
= DIM
*jnrC
;
769 j_coord_offsetD
= DIM
*jnrD
;
771 /* load j atom coordinates */
772 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
773 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
774 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
775 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
777 /* Calculate displacement vector */
778 dx00
= _mm256_sub_pd(ix0
,jx0
);
779 dy00
= _mm256_sub_pd(iy0
,jy0
);
780 dz00
= _mm256_sub_pd(iz0
,jz0
);
781 dx11
= _mm256_sub_pd(ix1
,jx1
);
782 dy11
= _mm256_sub_pd(iy1
,jy1
);
783 dz11
= _mm256_sub_pd(iz1
,jz1
);
784 dx12
= _mm256_sub_pd(ix1
,jx2
);
785 dy12
= _mm256_sub_pd(iy1
,jy2
);
786 dz12
= _mm256_sub_pd(iz1
,jz2
);
787 dx13
= _mm256_sub_pd(ix1
,jx3
);
788 dy13
= _mm256_sub_pd(iy1
,jy3
);
789 dz13
= _mm256_sub_pd(iz1
,jz3
);
790 dx21
= _mm256_sub_pd(ix2
,jx1
);
791 dy21
= _mm256_sub_pd(iy2
,jy1
);
792 dz21
= _mm256_sub_pd(iz2
,jz1
);
793 dx22
= _mm256_sub_pd(ix2
,jx2
);
794 dy22
= _mm256_sub_pd(iy2
,jy2
);
795 dz22
= _mm256_sub_pd(iz2
,jz2
);
796 dx23
= _mm256_sub_pd(ix2
,jx3
);
797 dy23
= _mm256_sub_pd(iy2
,jy3
);
798 dz23
= _mm256_sub_pd(iz2
,jz3
);
799 dx31
= _mm256_sub_pd(ix3
,jx1
);
800 dy31
= _mm256_sub_pd(iy3
,jy1
);
801 dz31
= _mm256_sub_pd(iz3
,jz1
);
802 dx32
= _mm256_sub_pd(ix3
,jx2
);
803 dy32
= _mm256_sub_pd(iy3
,jy2
);
804 dz32
= _mm256_sub_pd(iz3
,jz2
);
805 dx33
= _mm256_sub_pd(ix3
,jx3
);
806 dy33
= _mm256_sub_pd(iy3
,jy3
);
807 dz33
= _mm256_sub_pd(iz3
,jz3
);
809 /* Calculate squared distance and things based on it */
810 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
811 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
812 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
813 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
814 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
815 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
816 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
817 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
818 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
819 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
821 rinv11
= avx256_invsqrt_d(rsq11
);
822 rinv12
= avx256_invsqrt_d(rsq12
);
823 rinv13
= avx256_invsqrt_d(rsq13
);
824 rinv21
= avx256_invsqrt_d(rsq21
);
825 rinv22
= avx256_invsqrt_d(rsq22
);
826 rinv23
= avx256_invsqrt_d(rsq23
);
827 rinv31
= avx256_invsqrt_d(rsq31
);
828 rinv32
= avx256_invsqrt_d(rsq32
);
829 rinv33
= avx256_invsqrt_d(rsq33
);
831 rinvsq00
= avx256_inv_d(rsq00
);
832 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
833 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
834 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
835 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
836 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
837 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
838 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
839 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
840 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
842 fjx0
= _mm256_setzero_pd();
843 fjy0
= _mm256_setzero_pd();
844 fjz0
= _mm256_setzero_pd();
845 fjx1
= _mm256_setzero_pd();
846 fjy1
= _mm256_setzero_pd();
847 fjz1
= _mm256_setzero_pd();
848 fjx2
= _mm256_setzero_pd();
849 fjy2
= _mm256_setzero_pd();
850 fjz2
= _mm256_setzero_pd();
851 fjx3
= _mm256_setzero_pd();
852 fjy3
= _mm256_setzero_pd();
853 fjz3
= _mm256_setzero_pd();
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 /* LENNARD-JONES DISPERSION/REPULSION */
861 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
862 vvdw6
= _mm256_mul_pd(c6_00
,rinvsix
);
863 vvdw12
= _mm256_mul_pd(c12_00
,_mm256_mul_pd(rinvsix
,rinvsix
));
864 vvdw
= _mm256_sub_pd( _mm256_mul_pd(vvdw12
,one_twelfth
) , _mm256_mul_pd(vvdw6
,one_sixth
) );
865 fvdw
= _mm256_mul_pd(_mm256_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
867 /* Update potential sum for this i atom from the interaction with this j atom. */
868 vvdw
= _mm256_andnot_pd(dummy_mask
,vvdw
);
869 vvdwsum
= _mm256_add_pd(vvdwsum
,vvdw
);
873 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
875 /* Calculate temporary vectorial force */
876 tx
= _mm256_mul_pd(fscal
,dx00
);
877 ty
= _mm256_mul_pd(fscal
,dy00
);
878 tz
= _mm256_mul_pd(fscal
,dz00
);
880 /* Update vectorial force */
881 fix0
= _mm256_add_pd(fix0
,tx
);
882 fiy0
= _mm256_add_pd(fiy0
,ty
);
883 fiz0
= _mm256_add_pd(fiz0
,tz
);
885 fjx0
= _mm256_add_pd(fjx0
,tx
);
886 fjy0
= _mm256_add_pd(fjy0
,ty
);
887 fjz0
= _mm256_add_pd(fjz0
,tz
);
889 /**************************
890 * CALCULATE INTERACTIONS *
891 **************************/
893 r11
= _mm256_mul_pd(rsq11
,rinv11
);
894 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
896 /* EWALD ELECTROSTATICS */
898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
899 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
900 ewitab
= _mm256_cvttpd_epi32(ewrt
);
901 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
902 ewitab
= _mm_slli_epi32(ewitab
,2);
903 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
904 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
905 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
906 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
907 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
908 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
909 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
910 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(rinv11
,velec
));
911 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
915 velecsum
= _mm256_add_pd(velecsum
,velec
);
919 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
921 /* Calculate temporary vectorial force */
922 tx
= _mm256_mul_pd(fscal
,dx11
);
923 ty
= _mm256_mul_pd(fscal
,dy11
);
924 tz
= _mm256_mul_pd(fscal
,dz11
);
926 /* Update vectorial force */
927 fix1
= _mm256_add_pd(fix1
,tx
);
928 fiy1
= _mm256_add_pd(fiy1
,ty
);
929 fiz1
= _mm256_add_pd(fiz1
,tz
);
931 fjx1
= _mm256_add_pd(fjx1
,tx
);
932 fjy1
= _mm256_add_pd(fjy1
,ty
);
933 fjz1
= _mm256_add_pd(fjz1
,tz
);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 r12
= _mm256_mul_pd(rsq12
,rinv12
);
940 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
942 /* EWALD ELECTROSTATICS */
944 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
945 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
946 ewitab
= _mm256_cvttpd_epi32(ewrt
);
947 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
948 ewitab
= _mm_slli_epi32(ewitab
,2);
949 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
950 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
951 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
952 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
953 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
954 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
955 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
956 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(rinv12
,velec
));
957 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
959 /* Update potential sum for this i atom from the interaction with this j atom. */
960 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
961 velecsum
= _mm256_add_pd(velecsum
,velec
);
965 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
967 /* Calculate temporary vectorial force */
968 tx
= _mm256_mul_pd(fscal
,dx12
);
969 ty
= _mm256_mul_pd(fscal
,dy12
);
970 tz
= _mm256_mul_pd(fscal
,dz12
);
972 /* Update vectorial force */
973 fix1
= _mm256_add_pd(fix1
,tx
);
974 fiy1
= _mm256_add_pd(fiy1
,ty
);
975 fiz1
= _mm256_add_pd(fiz1
,tz
);
977 fjx2
= _mm256_add_pd(fjx2
,tx
);
978 fjy2
= _mm256_add_pd(fjy2
,ty
);
979 fjz2
= _mm256_add_pd(fjz2
,tz
);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 r13
= _mm256_mul_pd(rsq13
,rinv13
);
986 r13
= _mm256_andnot_pd(dummy_mask
,r13
);
988 /* EWALD ELECTROSTATICS */
990 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
991 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
992 ewitab
= _mm256_cvttpd_epi32(ewrt
);
993 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
994 ewitab
= _mm_slli_epi32(ewitab
,2);
995 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
996 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
997 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
998 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
999 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1000 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1001 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1002 velec
= _mm256_mul_pd(qq13
,_mm256_sub_pd(rinv13
,velec
));
1003 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1007 velecsum
= _mm256_add_pd(velecsum
,velec
);
1011 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1013 /* Calculate temporary vectorial force */
1014 tx
= _mm256_mul_pd(fscal
,dx13
);
1015 ty
= _mm256_mul_pd(fscal
,dy13
);
1016 tz
= _mm256_mul_pd(fscal
,dz13
);
1018 /* Update vectorial force */
1019 fix1
= _mm256_add_pd(fix1
,tx
);
1020 fiy1
= _mm256_add_pd(fiy1
,ty
);
1021 fiz1
= _mm256_add_pd(fiz1
,tz
);
1023 fjx3
= _mm256_add_pd(fjx3
,tx
);
1024 fjy3
= _mm256_add_pd(fjy3
,ty
);
1025 fjz3
= _mm256_add_pd(fjz3
,tz
);
1027 /**************************
1028 * CALCULATE INTERACTIONS *
1029 **************************/
1031 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1032 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
1034 /* EWALD ELECTROSTATICS */
1036 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1037 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1038 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1039 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1040 ewitab
= _mm_slli_epi32(ewitab
,2);
1041 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1042 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1043 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1044 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1045 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1046 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1047 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1048 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(rinv21
,velec
));
1049 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1053 velecsum
= _mm256_add_pd(velecsum
,velec
);
1057 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1059 /* Calculate temporary vectorial force */
1060 tx
= _mm256_mul_pd(fscal
,dx21
);
1061 ty
= _mm256_mul_pd(fscal
,dy21
);
1062 tz
= _mm256_mul_pd(fscal
,dz21
);
1064 /* Update vectorial force */
1065 fix2
= _mm256_add_pd(fix2
,tx
);
1066 fiy2
= _mm256_add_pd(fiy2
,ty
);
1067 fiz2
= _mm256_add_pd(fiz2
,tz
);
1069 fjx1
= _mm256_add_pd(fjx1
,tx
);
1070 fjy1
= _mm256_add_pd(fjy1
,ty
);
1071 fjz1
= _mm256_add_pd(fjz1
,tz
);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1078 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
1080 /* EWALD ELECTROSTATICS */
1082 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1083 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1084 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1085 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1086 ewitab
= _mm_slli_epi32(ewitab
,2);
1087 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1088 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1089 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1090 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1091 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1092 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1093 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1094 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(rinv22
,velec
));
1095 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1097 /* Update potential sum for this i atom from the interaction with this j atom. */
1098 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1099 velecsum
= _mm256_add_pd(velecsum
,velec
);
1103 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1105 /* Calculate temporary vectorial force */
1106 tx
= _mm256_mul_pd(fscal
,dx22
);
1107 ty
= _mm256_mul_pd(fscal
,dy22
);
1108 tz
= _mm256_mul_pd(fscal
,dz22
);
1110 /* Update vectorial force */
1111 fix2
= _mm256_add_pd(fix2
,tx
);
1112 fiy2
= _mm256_add_pd(fiy2
,ty
);
1113 fiz2
= _mm256_add_pd(fiz2
,tz
);
1115 fjx2
= _mm256_add_pd(fjx2
,tx
);
1116 fjy2
= _mm256_add_pd(fjy2
,ty
);
1117 fjz2
= _mm256_add_pd(fjz2
,tz
);
1119 /**************************
1120 * CALCULATE INTERACTIONS *
1121 **************************/
1123 r23
= _mm256_mul_pd(rsq23
,rinv23
);
1124 r23
= _mm256_andnot_pd(dummy_mask
,r23
);
1126 /* EWALD ELECTROSTATICS */
1128 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1129 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
1130 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1131 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1132 ewitab
= _mm_slli_epi32(ewitab
,2);
1133 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1134 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1135 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1136 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1137 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1138 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1139 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1140 velec
= _mm256_mul_pd(qq23
,_mm256_sub_pd(rinv23
,velec
));
1141 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
1143 /* Update potential sum for this i atom from the interaction with this j atom. */
1144 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1145 velecsum
= _mm256_add_pd(velecsum
,velec
);
1149 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1151 /* Calculate temporary vectorial force */
1152 tx
= _mm256_mul_pd(fscal
,dx23
);
1153 ty
= _mm256_mul_pd(fscal
,dy23
);
1154 tz
= _mm256_mul_pd(fscal
,dz23
);
1156 /* Update vectorial force */
1157 fix2
= _mm256_add_pd(fix2
,tx
);
1158 fiy2
= _mm256_add_pd(fiy2
,ty
);
1159 fiz2
= _mm256_add_pd(fiz2
,tz
);
1161 fjx3
= _mm256_add_pd(fjx3
,tx
);
1162 fjy3
= _mm256_add_pd(fjy3
,ty
);
1163 fjz3
= _mm256_add_pd(fjz3
,tz
);
1165 /**************************
1166 * CALCULATE INTERACTIONS *
1167 **************************/
1169 r31
= _mm256_mul_pd(rsq31
,rinv31
);
1170 r31
= _mm256_andnot_pd(dummy_mask
,r31
);
1172 /* EWALD ELECTROSTATICS */
1174 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1175 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
1176 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1177 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1178 ewitab
= _mm_slli_epi32(ewitab
,2);
1179 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1180 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1181 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1182 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1183 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1184 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1185 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1186 velec
= _mm256_mul_pd(qq31
,_mm256_sub_pd(rinv31
,velec
));
1187 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
1189 /* Update potential sum for this i atom from the interaction with this j atom. */
1190 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1191 velecsum
= _mm256_add_pd(velecsum
,velec
);
1195 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1197 /* Calculate temporary vectorial force */
1198 tx
= _mm256_mul_pd(fscal
,dx31
);
1199 ty
= _mm256_mul_pd(fscal
,dy31
);
1200 tz
= _mm256_mul_pd(fscal
,dz31
);
1202 /* Update vectorial force */
1203 fix3
= _mm256_add_pd(fix3
,tx
);
1204 fiy3
= _mm256_add_pd(fiy3
,ty
);
1205 fiz3
= _mm256_add_pd(fiz3
,tz
);
1207 fjx1
= _mm256_add_pd(fjx1
,tx
);
1208 fjy1
= _mm256_add_pd(fjy1
,ty
);
1209 fjz1
= _mm256_add_pd(fjz1
,tz
);
1211 /**************************
1212 * CALCULATE INTERACTIONS *
1213 **************************/
1215 r32
= _mm256_mul_pd(rsq32
,rinv32
);
1216 r32
= _mm256_andnot_pd(dummy_mask
,r32
);
1218 /* EWALD ELECTROSTATICS */
1220 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1221 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
1222 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1223 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1224 ewitab
= _mm_slli_epi32(ewitab
,2);
1225 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1226 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1227 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1228 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1229 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1230 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1231 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1232 velec
= _mm256_mul_pd(qq32
,_mm256_sub_pd(rinv32
,velec
));
1233 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
1235 /* Update potential sum for this i atom from the interaction with this j atom. */
1236 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1237 velecsum
= _mm256_add_pd(velecsum
,velec
);
1241 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1243 /* Calculate temporary vectorial force */
1244 tx
= _mm256_mul_pd(fscal
,dx32
);
1245 ty
= _mm256_mul_pd(fscal
,dy32
);
1246 tz
= _mm256_mul_pd(fscal
,dz32
);
1248 /* Update vectorial force */
1249 fix3
= _mm256_add_pd(fix3
,tx
);
1250 fiy3
= _mm256_add_pd(fiy3
,ty
);
1251 fiz3
= _mm256_add_pd(fiz3
,tz
);
1253 fjx2
= _mm256_add_pd(fjx2
,tx
);
1254 fjy2
= _mm256_add_pd(fjy2
,ty
);
1255 fjz2
= _mm256_add_pd(fjz2
,tz
);
1257 /**************************
1258 * CALCULATE INTERACTIONS *
1259 **************************/
1261 r33
= _mm256_mul_pd(rsq33
,rinv33
);
1262 r33
= _mm256_andnot_pd(dummy_mask
,r33
);
1264 /* EWALD ELECTROSTATICS */
1266 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1267 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
1268 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1269 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1270 ewitab
= _mm_slli_epi32(ewitab
,2);
1271 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1272 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1273 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1274 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1275 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1276 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1277 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1278 velec
= _mm256_mul_pd(qq33
,_mm256_sub_pd(rinv33
,velec
));
1279 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
1281 /* Update potential sum for this i atom from the interaction with this j atom. */
1282 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1283 velecsum
= _mm256_add_pd(velecsum
,velec
);
1287 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1289 /* Calculate temporary vectorial force */
1290 tx
= _mm256_mul_pd(fscal
,dx33
);
1291 ty
= _mm256_mul_pd(fscal
,dy33
);
1292 tz
= _mm256_mul_pd(fscal
,dz33
);
1294 /* Update vectorial force */
1295 fix3
= _mm256_add_pd(fix3
,tx
);
1296 fiy3
= _mm256_add_pd(fiy3
,ty
);
1297 fiz3
= _mm256_add_pd(fiz3
,tz
);
1299 fjx3
= _mm256_add_pd(fjx3
,tx
);
1300 fjy3
= _mm256_add_pd(fjy3
,ty
);
1301 fjz3
= _mm256_add_pd(fjz3
,tz
);
1303 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1304 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1305 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1306 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1308 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1309 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1310 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1312 /* Inner loop uses 413 flops */
1315 /* End of innermost loop */
1317 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1318 f
+i_coord_offset
,fshift
+i_shift_offset
);
1321 /* Update potential energies */
1322 gmx_mm256_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1323 gmx_mm256_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1325 /* Increment number of inner iterations */
1326 inneriter
+= j_index_end
- j_index_start
;
1328 /* Outer loop uses 26 flops */
1331 /* Increment number of outer iterations */
1334 /* Update outer/inner flops */
1336 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*413);
1339 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
1340 * Electrostatics interaction: Ewald
1341 * VdW interaction: LennardJones
1342 * Geometry: Water4-Water4
1343 * Calculate force/pot: Force
1346 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_double
1347 (t_nblist
* gmx_restrict nlist
,
1348 rvec
* gmx_restrict xx
,
1349 rvec
* gmx_restrict ff
,
1350 struct t_forcerec
* gmx_restrict fr
,
1351 t_mdatoms
* gmx_restrict mdatoms
,
1352 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1353 t_nrnb
* gmx_restrict nrnb
)
1355 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1356 * just 0 for non-waters.
1357 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1358 * jnr indices corresponding to data put in the four positions in the SIMD register.
1360 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1361 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1362 int jnrA
,jnrB
,jnrC
,jnrD
;
1363 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1364 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1365 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1366 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1367 real rcutoff_scalar
;
1368 real
*shiftvec
,*fshift
,*x
,*f
;
1369 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1370 real scratch
[4*DIM
];
1371 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1372 real
* vdwioffsetptr0
;
1373 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1374 real
* vdwioffsetptr1
;
1375 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1376 real
* vdwioffsetptr2
;
1377 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1378 real
* vdwioffsetptr3
;
1379 __m256d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1380 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1381 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1382 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1383 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1384 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1385 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1386 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1387 __m256d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1388 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1389 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1390 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1391 __m256d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1392 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1393 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1394 __m256d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1395 __m256d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1396 __m256d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1397 __m256d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1398 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1401 __m256d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1404 __m256d one_sixth
= _mm256_set1_pd(1.0/6.0);
1405 __m256d one_twelfth
= _mm256_set1_pd(1.0/12.0);
1407 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1408 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1410 __m256d dummy_mask
,cutoff_mask
;
1411 __m128 tmpmask0
,tmpmask1
;
1412 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1413 __m256d one
= _mm256_set1_pd(1.0);
1414 __m256d two
= _mm256_set1_pd(2.0);
1420 jindex
= nlist
->jindex
;
1422 shiftidx
= nlist
->shift
;
1424 shiftvec
= fr
->shift_vec
[0];
1425 fshift
= fr
->fshift
[0];
1426 facel
= _mm256_set1_pd(fr
->ic
->epsfac
);
1427 charge
= mdatoms
->chargeA
;
1428 nvdwtype
= fr
->ntype
;
1429 vdwparam
= fr
->nbfp
;
1430 vdwtype
= mdatoms
->typeA
;
1432 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
1433 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
1434 beta2
= _mm256_mul_pd(beta
,beta
);
1435 beta3
= _mm256_mul_pd(beta
,beta2
);
1437 ewtab
= fr
->ic
->tabq_coul_F
;
1438 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
1439 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
1441 /* Setup water-specific parameters */
1442 inr
= nlist
->iinr
[0];
1443 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
1444 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
1445 iq3
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+3]));
1446 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1448 jq1
= _mm256_set1_pd(charge
[inr
+1]);
1449 jq2
= _mm256_set1_pd(charge
[inr
+2]);
1450 jq3
= _mm256_set1_pd(charge
[inr
+3]);
1451 vdwjidx0A
= 2*vdwtype
[inr
+0];
1452 c6_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
]);
1453 c12_00
= _mm256_set1_pd(vdwioffsetptr0
[vdwjidx0A
+1]);
1454 qq11
= _mm256_mul_pd(iq1
,jq1
);
1455 qq12
= _mm256_mul_pd(iq1
,jq2
);
1456 qq13
= _mm256_mul_pd(iq1
,jq3
);
1457 qq21
= _mm256_mul_pd(iq2
,jq1
);
1458 qq22
= _mm256_mul_pd(iq2
,jq2
);
1459 qq23
= _mm256_mul_pd(iq2
,jq3
);
1460 qq31
= _mm256_mul_pd(iq3
,jq1
);
1461 qq32
= _mm256_mul_pd(iq3
,jq2
);
1462 qq33
= _mm256_mul_pd(iq3
,jq3
);
1464 /* Avoid stupid compiler warnings */
1465 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1466 j_coord_offsetA
= 0;
1467 j_coord_offsetB
= 0;
1468 j_coord_offsetC
= 0;
1469 j_coord_offsetD
= 0;
1474 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1476 scratch
[iidx
] = 0.0;
1479 /* Start outer loop over neighborlists */
1480 for(iidx
=0; iidx
<nri
; iidx
++)
1482 /* Load shift vector for this list */
1483 i_shift_offset
= DIM
*shiftidx
[iidx
];
1485 /* Load limits for loop over neighbors */
1486 j_index_start
= jindex
[iidx
];
1487 j_index_end
= jindex
[iidx
+1];
1489 /* Get outer coordinate index */
1491 i_coord_offset
= DIM
*inr
;
1493 /* Load i particle coords and add shift vector */
1494 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1495 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1497 fix0
= _mm256_setzero_pd();
1498 fiy0
= _mm256_setzero_pd();
1499 fiz0
= _mm256_setzero_pd();
1500 fix1
= _mm256_setzero_pd();
1501 fiy1
= _mm256_setzero_pd();
1502 fiz1
= _mm256_setzero_pd();
1503 fix2
= _mm256_setzero_pd();
1504 fiy2
= _mm256_setzero_pd();
1505 fiz2
= _mm256_setzero_pd();
1506 fix3
= _mm256_setzero_pd();
1507 fiy3
= _mm256_setzero_pd();
1508 fiz3
= _mm256_setzero_pd();
1510 /* Start inner kernel loop */
1511 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1514 /* Get j neighbor index, and coordinate index */
1516 jnrB
= jjnr
[jidx
+1];
1517 jnrC
= jjnr
[jidx
+2];
1518 jnrD
= jjnr
[jidx
+3];
1519 j_coord_offsetA
= DIM
*jnrA
;
1520 j_coord_offsetB
= DIM
*jnrB
;
1521 j_coord_offsetC
= DIM
*jnrC
;
1522 j_coord_offsetD
= DIM
*jnrD
;
1524 /* load j atom coordinates */
1525 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1526 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1527 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1528 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1530 /* Calculate displacement vector */
1531 dx00
= _mm256_sub_pd(ix0
,jx0
);
1532 dy00
= _mm256_sub_pd(iy0
,jy0
);
1533 dz00
= _mm256_sub_pd(iz0
,jz0
);
1534 dx11
= _mm256_sub_pd(ix1
,jx1
);
1535 dy11
= _mm256_sub_pd(iy1
,jy1
);
1536 dz11
= _mm256_sub_pd(iz1
,jz1
);
1537 dx12
= _mm256_sub_pd(ix1
,jx2
);
1538 dy12
= _mm256_sub_pd(iy1
,jy2
);
1539 dz12
= _mm256_sub_pd(iz1
,jz2
);
1540 dx13
= _mm256_sub_pd(ix1
,jx3
);
1541 dy13
= _mm256_sub_pd(iy1
,jy3
);
1542 dz13
= _mm256_sub_pd(iz1
,jz3
);
1543 dx21
= _mm256_sub_pd(ix2
,jx1
);
1544 dy21
= _mm256_sub_pd(iy2
,jy1
);
1545 dz21
= _mm256_sub_pd(iz2
,jz1
);
1546 dx22
= _mm256_sub_pd(ix2
,jx2
);
1547 dy22
= _mm256_sub_pd(iy2
,jy2
);
1548 dz22
= _mm256_sub_pd(iz2
,jz2
);
1549 dx23
= _mm256_sub_pd(ix2
,jx3
);
1550 dy23
= _mm256_sub_pd(iy2
,jy3
);
1551 dz23
= _mm256_sub_pd(iz2
,jz3
);
1552 dx31
= _mm256_sub_pd(ix3
,jx1
);
1553 dy31
= _mm256_sub_pd(iy3
,jy1
);
1554 dz31
= _mm256_sub_pd(iz3
,jz1
);
1555 dx32
= _mm256_sub_pd(ix3
,jx2
);
1556 dy32
= _mm256_sub_pd(iy3
,jy2
);
1557 dz32
= _mm256_sub_pd(iz3
,jz2
);
1558 dx33
= _mm256_sub_pd(ix3
,jx3
);
1559 dy33
= _mm256_sub_pd(iy3
,jy3
);
1560 dz33
= _mm256_sub_pd(iz3
,jz3
);
1562 /* Calculate squared distance and things based on it */
1563 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1564 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1565 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
1566 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
1567 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
1568 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
1569 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
1570 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
1571 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
1572 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
1574 rinv11
= avx256_invsqrt_d(rsq11
);
1575 rinv12
= avx256_invsqrt_d(rsq12
);
1576 rinv13
= avx256_invsqrt_d(rsq13
);
1577 rinv21
= avx256_invsqrt_d(rsq21
);
1578 rinv22
= avx256_invsqrt_d(rsq22
);
1579 rinv23
= avx256_invsqrt_d(rsq23
);
1580 rinv31
= avx256_invsqrt_d(rsq31
);
1581 rinv32
= avx256_invsqrt_d(rsq32
);
1582 rinv33
= avx256_invsqrt_d(rsq33
);
1584 rinvsq00
= avx256_inv_d(rsq00
);
1585 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
1586 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
1587 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
1588 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
1589 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
1590 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
1591 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
1592 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
1593 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
1595 fjx0
= _mm256_setzero_pd();
1596 fjy0
= _mm256_setzero_pd();
1597 fjz0
= _mm256_setzero_pd();
1598 fjx1
= _mm256_setzero_pd();
1599 fjy1
= _mm256_setzero_pd();
1600 fjz1
= _mm256_setzero_pd();
1601 fjx2
= _mm256_setzero_pd();
1602 fjy2
= _mm256_setzero_pd();
1603 fjz2
= _mm256_setzero_pd();
1604 fjx3
= _mm256_setzero_pd();
1605 fjy3
= _mm256_setzero_pd();
1606 fjz3
= _mm256_setzero_pd();
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 /* LENNARD-JONES DISPERSION/REPULSION */
1614 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1615 fvdw
= _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),c6_00
),_mm256_mul_pd(rinvsix
,rinvsq00
));
1619 /* Calculate temporary vectorial force */
1620 tx
= _mm256_mul_pd(fscal
,dx00
);
1621 ty
= _mm256_mul_pd(fscal
,dy00
);
1622 tz
= _mm256_mul_pd(fscal
,dz00
);
1624 /* Update vectorial force */
1625 fix0
= _mm256_add_pd(fix0
,tx
);
1626 fiy0
= _mm256_add_pd(fiy0
,ty
);
1627 fiz0
= _mm256_add_pd(fiz0
,tz
);
1629 fjx0
= _mm256_add_pd(fjx0
,tx
);
1630 fjy0
= _mm256_add_pd(fjy0
,ty
);
1631 fjz0
= _mm256_add_pd(fjz0
,tz
);
1633 /**************************
1634 * CALCULATE INTERACTIONS *
1635 **************************/
1637 r11
= _mm256_mul_pd(rsq11
,rinv11
);
1639 /* EWALD ELECTROSTATICS */
1641 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1642 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
1643 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1644 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1645 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1646 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1648 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1649 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
1653 /* Calculate temporary vectorial force */
1654 tx
= _mm256_mul_pd(fscal
,dx11
);
1655 ty
= _mm256_mul_pd(fscal
,dy11
);
1656 tz
= _mm256_mul_pd(fscal
,dz11
);
1658 /* Update vectorial force */
1659 fix1
= _mm256_add_pd(fix1
,tx
);
1660 fiy1
= _mm256_add_pd(fiy1
,ty
);
1661 fiz1
= _mm256_add_pd(fiz1
,tz
);
1663 fjx1
= _mm256_add_pd(fjx1
,tx
);
1664 fjy1
= _mm256_add_pd(fjy1
,ty
);
1665 fjz1
= _mm256_add_pd(fjz1
,tz
);
1667 /**************************
1668 * CALCULATE INTERACTIONS *
1669 **************************/
1671 r12
= _mm256_mul_pd(rsq12
,rinv12
);
1673 /* EWALD ELECTROSTATICS */
1675 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1676 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
1677 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1678 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1679 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1680 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1682 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1683 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1687 /* Calculate temporary vectorial force */
1688 tx
= _mm256_mul_pd(fscal
,dx12
);
1689 ty
= _mm256_mul_pd(fscal
,dy12
);
1690 tz
= _mm256_mul_pd(fscal
,dz12
);
1692 /* Update vectorial force */
1693 fix1
= _mm256_add_pd(fix1
,tx
);
1694 fiy1
= _mm256_add_pd(fiy1
,ty
);
1695 fiz1
= _mm256_add_pd(fiz1
,tz
);
1697 fjx2
= _mm256_add_pd(fjx2
,tx
);
1698 fjy2
= _mm256_add_pd(fjy2
,ty
);
1699 fjz2
= _mm256_add_pd(fjz2
,tz
);
1701 /**************************
1702 * CALCULATE INTERACTIONS *
1703 **************************/
1705 r13
= _mm256_mul_pd(rsq13
,rinv13
);
1707 /* EWALD ELECTROSTATICS */
1709 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1710 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
1711 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1712 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1713 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1714 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1716 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1717 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
1721 /* Calculate temporary vectorial force */
1722 tx
= _mm256_mul_pd(fscal
,dx13
);
1723 ty
= _mm256_mul_pd(fscal
,dy13
);
1724 tz
= _mm256_mul_pd(fscal
,dz13
);
1726 /* Update vectorial force */
1727 fix1
= _mm256_add_pd(fix1
,tx
);
1728 fiy1
= _mm256_add_pd(fiy1
,ty
);
1729 fiz1
= _mm256_add_pd(fiz1
,tz
);
1731 fjx3
= _mm256_add_pd(fjx3
,tx
);
1732 fjy3
= _mm256_add_pd(fjy3
,ty
);
1733 fjz3
= _mm256_add_pd(fjz3
,tz
);
1735 /**************************
1736 * CALCULATE INTERACTIONS *
1737 **************************/
1739 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1741 /* EWALD ELECTROSTATICS */
1743 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1744 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1745 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1746 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1747 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1748 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1750 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1751 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1755 /* Calculate temporary vectorial force */
1756 tx
= _mm256_mul_pd(fscal
,dx21
);
1757 ty
= _mm256_mul_pd(fscal
,dy21
);
1758 tz
= _mm256_mul_pd(fscal
,dz21
);
1760 /* Update vectorial force */
1761 fix2
= _mm256_add_pd(fix2
,tx
);
1762 fiy2
= _mm256_add_pd(fiy2
,ty
);
1763 fiz2
= _mm256_add_pd(fiz2
,tz
);
1765 fjx1
= _mm256_add_pd(fjx1
,tx
);
1766 fjy1
= _mm256_add_pd(fjy1
,ty
);
1767 fjz1
= _mm256_add_pd(fjz1
,tz
);
1769 /**************************
1770 * CALCULATE INTERACTIONS *
1771 **************************/
1773 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1775 /* EWALD ELECTROSTATICS */
1777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1779 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1780 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1781 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1782 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1784 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1785 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1789 /* Calculate temporary vectorial force */
1790 tx
= _mm256_mul_pd(fscal
,dx22
);
1791 ty
= _mm256_mul_pd(fscal
,dy22
);
1792 tz
= _mm256_mul_pd(fscal
,dz22
);
1794 /* Update vectorial force */
1795 fix2
= _mm256_add_pd(fix2
,tx
);
1796 fiy2
= _mm256_add_pd(fiy2
,ty
);
1797 fiz2
= _mm256_add_pd(fiz2
,tz
);
1799 fjx2
= _mm256_add_pd(fjx2
,tx
);
1800 fjy2
= _mm256_add_pd(fjy2
,ty
);
1801 fjz2
= _mm256_add_pd(fjz2
,tz
);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 r23
= _mm256_mul_pd(rsq23
,rinv23
);
1809 /* EWALD ELECTROSTATICS */
1811 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1812 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
1813 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1814 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1815 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1816 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1818 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1819 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
1823 /* Calculate temporary vectorial force */
1824 tx
= _mm256_mul_pd(fscal
,dx23
);
1825 ty
= _mm256_mul_pd(fscal
,dy23
);
1826 tz
= _mm256_mul_pd(fscal
,dz23
);
1828 /* Update vectorial force */
1829 fix2
= _mm256_add_pd(fix2
,tx
);
1830 fiy2
= _mm256_add_pd(fiy2
,ty
);
1831 fiz2
= _mm256_add_pd(fiz2
,tz
);
1833 fjx3
= _mm256_add_pd(fjx3
,tx
);
1834 fjy3
= _mm256_add_pd(fjy3
,ty
);
1835 fjz3
= _mm256_add_pd(fjz3
,tz
);
1837 /**************************
1838 * CALCULATE INTERACTIONS *
1839 **************************/
1841 r31
= _mm256_mul_pd(rsq31
,rinv31
);
1843 /* EWALD ELECTROSTATICS */
1845 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1846 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
1847 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1848 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1849 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1850 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1852 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1853 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
1857 /* Calculate temporary vectorial force */
1858 tx
= _mm256_mul_pd(fscal
,dx31
);
1859 ty
= _mm256_mul_pd(fscal
,dy31
);
1860 tz
= _mm256_mul_pd(fscal
,dz31
);
1862 /* Update vectorial force */
1863 fix3
= _mm256_add_pd(fix3
,tx
);
1864 fiy3
= _mm256_add_pd(fiy3
,ty
);
1865 fiz3
= _mm256_add_pd(fiz3
,tz
);
1867 fjx1
= _mm256_add_pd(fjx1
,tx
);
1868 fjy1
= _mm256_add_pd(fjy1
,ty
);
1869 fjz1
= _mm256_add_pd(fjz1
,tz
);
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 r32
= _mm256_mul_pd(rsq32
,rinv32
);
1877 /* EWALD ELECTROSTATICS */
1879 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1880 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
1881 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1882 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1883 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1884 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1886 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1887 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
1891 /* Calculate temporary vectorial force */
1892 tx
= _mm256_mul_pd(fscal
,dx32
);
1893 ty
= _mm256_mul_pd(fscal
,dy32
);
1894 tz
= _mm256_mul_pd(fscal
,dz32
);
1896 /* Update vectorial force */
1897 fix3
= _mm256_add_pd(fix3
,tx
);
1898 fiy3
= _mm256_add_pd(fiy3
,ty
);
1899 fiz3
= _mm256_add_pd(fiz3
,tz
);
1901 fjx2
= _mm256_add_pd(fjx2
,tx
);
1902 fjy2
= _mm256_add_pd(fjy2
,ty
);
1903 fjz2
= _mm256_add_pd(fjz2
,tz
);
1905 /**************************
1906 * CALCULATE INTERACTIONS *
1907 **************************/
1909 r33
= _mm256_mul_pd(rsq33
,rinv33
);
1911 /* EWALD ELECTROSTATICS */
1913 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1914 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
1915 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1916 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1917 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1918 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1920 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1921 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
1925 /* Calculate temporary vectorial force */
1926 tx
= _mm256_mul_pd(fscal
,dx33
);
1927 ty
= _mm256_mul_pd(fscal
,dy33
);
1928 tz
= _mm256_mul_pd(fscal
,dz33
);
1930 /* Update vectorial force */
1931 fix3
= _mm256_add_pd(fix3
,tx
);
1932 fiy3
= _mm256_add_pd(fiy3
,ty
);
1933 fiz3
= _mm256_add_pd(fiz3
,tz
);
1935 fjx3
= _mm256_add_pd(fjx3
,tx
);
1936 fjy3
= _mm256_add_pd(fjy3
,ty
);
1937 fjz3
= _mm256_add_pd(fjz3
,tz
);
1939 fjptrA
= f
+j_coord_offsetA
;
1940 fjptrB
= f
+j_coord_offsetB
;
1941 fjptrC
= f
+j_coord_offsetC
;
1942 fjptrD
= f
+j_coord_offsetD
;
1944 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1945 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1946 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1948 /* Inner loop uses 354 flops */
1951 if(jidx
<j_index_end
)
1954 /* Get j neighbor index, and coordinate index */
1955 jnrlistA
= jjnr
[jidx
];
1956 jnrlistB
= jjnr
[jidx
+1];
1957 jnrlistC
= jjnr
[jidx
+2];
1958 jnrlistD
= jjnr
[jidx
+3];
1959 /* Sign of each element will be negative for non-real atoms.
1960 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1961 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1963 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1965 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
1966 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
1967 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
1969 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1970 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1971 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1972 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1973 j_coord_offsetA
= DIM
*jnrA
;
1974 j_coord_offsetB
= DIM
*jnrB
;
1975 j_coord_offsetC
= DIM
*jnrC
;
1976 j_coord_offsetD
= DIM
*jnrD
;
1978 /* load j atom coordinates */
1979 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1980 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1981 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1982 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1984 /* Calculate displacement vector */
1985 dx00
= _mm256_sub_pd(ix0
,jx0
);
1986 dy00
= _mm256_sub_pd(iy0
,jy0
);
1987 dz00
= _mm256_sub_pd(iz0
,jz0
);
1988 dx11
= _mm256_sub_pd(ix1
,jx1
);
1989 dy11
= _mm256_sub_pd(iy1
,jy1
);
1990 dz11
= _mm256_sub_pd(iz1
,jz1
);
1991 dx12
= _mm256_sub_pd(ix1
,jx2
);
1992 dy12
= _mm256_sub_pd(iy1
,jy2
);
1993 dz12
= _mm256_sub_pd(iz1
,jz2
);
1994 dx13
= _mm256_sub_pd(ix1
,jx3
);
1995 dy13
= _mm256_sub_pd(iy1
,jy3
);
1996 dz13
= _mm256_sub_pd(iz1
,jz3
);
1997 dx21
= _mm256_sub_pd(ix2
,jx1
);
1998 dy21
= _mm256_sub_pd(iy2
,jy1
);
1999 dz21
= _mm256_sub_pd(iz2
,jz1
);
2000 dx22
= _mm256_sub_pd(ix2
,jx2
);
2001 dy22
= _mm256_sub_pd(iy2
,jy2
);
2002 dz22
= _mm256_sub_pd(iz2
,jz2
);
2003 dx23
= _mm256_sub_pd(ix2
,jx3
);
2004 dy23
= _mm256_sub_pd(iy2
,jy3
);
2005 dz23
= _mm256_sub_pd(iz2
,jz3
);
2006 dx31
= _mm256_sub_pd(ix3
,jx1
);
2007 dy31
= _mm256_sub_pd(iy3
,jy1
);
2008 dz31
= _mm256_sub_pd(iz3
,jz1
);
2009 dx32
= _mm256_sub_pd(ix3
,jx2
);
2010 dy32
= _mm256_sub_pd(iy3
,jy2
);
2011 dz32
= _mm256_sub_pd(iz3
,jz2
);
2012 dx33
= _mm256_sub_pd(ix3
,jx3
);
2013 dy33
= _mm256_sub_pd(iy3
,jy3
);
2014 dz33
= _mm256_sub_pd(iz3
,jz3
);
2016 /* Calculate squared distance and things based on it */
2017 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
2018 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
2019 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
2020 rsq13
= gmx_mm256_calc_rsq_pd(dx13
,dy13
,dz13
);
2021 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
2022 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
2023 rsq23
= gmx_mm256_calc_rsq_pd(dx23
,dy23
,dz23
);
2024 rsq31
= gmx_mm256_calc_rsq_pd(dx31
,dy31
,dz31
);
2025 rsq32
= gmx_mm256_calc_rsq_pd(dx32
,dy32
,dz32
);
2026 rsq33
= gmx_mm256_calc_rsq_pd(dx33
,dy33
,dz33
);
2028 rinv11
= avx256_invsqrt_d(rsq11
);
2029 rinv12
= avx256_invsqrt_d(rsq12
);
2030 rinv13
= avx256_invsqrt_d(rsq13
);
2031 rinv21
= avx256_invsqrt_d(rsq21
);
2032 rinv22
= avx256_invsqrt_d(rsq22
);
2033 rinv23
= avx256_invsqrt_d(rsq23
);
2034 rinv31
= avx256_invsqrt_d(rsq31
);
2035 rinv32
= avx256_invsqrt_d(rsq32
);
2036 rinv33
= avx256_invsqrt_d(rsq33
);
2038 rinvsq00
= avx256_inv_d(rsq00
);
2039 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
2040 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
2041 rinvsq13
= _mm256_mul_pd(rinv13
,rinv13
);
2042 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
2043 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
2044 rinvsq23
= _mm256_mul_pd(rinv23
,rinv23
);
2045 rinvsq31
= _mm256_mul_pd(rinv31
,rinv31
);
2046 rinvsq32
= _mm256_mul_pd(rinv32
,rinv32
);
2047 rinvsq33
= _mm256_mul_pd(rinv33
,rinv33
);
2049 fjx0
= _mm256_setzero_pd();
2050 fjy0
= _mm256_setzero_pd();
2051 fjz0
= _mm256_setzero_pd();
2052 fjx1
= _mm256_setzero_pd();
2053 fjy1
= _mm256_setzero_pd();
2054 fjz1
= _mm256_setzero_pd();
2055 fjx2
= _mm256_setzero_pd();
2056 fjy2
= _mm256_setzero_pd();
2057 fjz2
= _mm256_setzero_pd();
2058 fjx3
= _mm256_setzero_pd();
2059 fjy3
= _mm256_setzero_pd();
2060 fjz3
= _mm256_setzero_pd();
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 /* LENNARD-JONES DISPERSION/REPULSION */
2068 rinvsix
= _mm256_mul_pd(_mm256_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
2069 fvdw
= _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00
,rinvsix
),c6_00
),_mm256_mul_pd(rinvsix
,rinvsq00
));
2073 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2075 /* Calculate temporary vectorial force */
2076 tx
= _mm256_mul_pd(fscal
,dx00
);
2077 ty
= _mm256_mul_pd(fscal
,dy00
);
2078 tz
= _mm256_mul_pd(fscal
,dz00
);
2080 /* Update vectorial force */
2081 fix0
= _mm256_add_pd(fix0
,tx
);
2082 fiy0
= _mm256_add_pd(fiy0
,ty
);
2083 fiz0
= _mm256_add_pd(fiz0
,tz
);
2085 fjx0
= _mm256_add_pd(fjx0
,tx
);
2086 fjy0
= _mm256_add_pd(fjy0
,ty
);
2087 fjz0
= _mm256_add_pd(fjz0
,tz
);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 r11
= _mm256_mul_pd(rsq11
,rinv11
);
2094 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
2096 /* EWALD ELECTROSTATICS */
2098 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2099 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
2100 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2101 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2102 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2103 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2105 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2106 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
2110 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2112 /* Calculate temporary vectorial force */
2113 tx
= _mm256_mul_pd(fscal
,dx11
);
2114 ty
= _mm256_mul_pd(fscal
,dy11
);
2115 tz
= _mm256_mul_pd(fscal
,dz11
);
2117 /* Update vectorial force */
2118 fix1
= _mm256_add_pd(fix1
,tx
);
2119 fiy1
= _mm256_add_pd(fiy1
,ty
);
2120 fiz1
= _mm256_add_pd(fiz1
,tz
);
2122 fjx1
= _mm256_add_pd(fjx1
,tx
);
2123 fjy1
= _mm256_add_pd(fjy1
,ty
);
2124 fjz1
= _mm256_add_pd(fjz1
,tz
);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 r12
= _mm256_mul_pd(rsq12
,rinv12
);
2131 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
2133 /* EWALD ELECTROSTATICS */
2135 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2136 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
2137 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2138 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2139 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2140 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2142 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2143 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
2147 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2149 /* Calculate temporary vectorial force */
2150 tx
= _mm256_mul_pd(fscal
,dx12
);
2151 ty
= _mm256_mul_pd(fscal
,dy12
);
2152 tz
= _mm256_mul_pd(fscal
,dz12
);
2154 /* Update vectorial force */
2155 fix1
= _mm256_add_pd(fix1
,tx
);
2156 fiy1
= _mm256_add_pd(fiy1
,ty
);
2157 fiz1
= _mm256_add_pd(fiz1
,tz
);
2159 fjx2
= _mm256_add_pd(fjx2
,tx
);
2160 fjy2
= _mm256_add_pd(fjy2
,ty
);
2161 fjz2
= _mm256_add_pd(fjz2
,tz
);
2163 /**************************
2164 * CALCULATE INTERACTIONS *
2165 **************************/
2167 r13
= _mm256_mul_pd(rsq13
,rinv13
);
2168 r13
= _mm256_andnot_pd(dummy_mask
,r13
);
2170 /* EWALD ELECTROSTATICS */
2172 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173 ewrt
= _mm256_mul_pd(r13
,ewtabscale
);
2174 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2175 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2176 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2177 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2179 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2180 felec
= _mm256_mul_pd(_mm256_mul_pd(qq13
,rinv13
),_mm256_sub_pd(rinvsq13
,felec
));
2184 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2186 /* Calculate temporary vectorial force */
2187 tx
= _mm256_mul_pd(fscal
,dx13
);
2188 ty
= _mm256_mul_pd(fscal
,dy13
);
2189 tz
= _mm256_mul_pd(fscal
,dz13
);
2191 /* Update vectorial force */
2192 fix1
= _mm256_add_pd(fix1
,tx
);
2193 fiy1
= _mm256_add_pd(fiy1
,ty
);
2194 fiz1
= _mm256_add_pd(fiz1
,tz
);
2196 fjx3
= _mm256_add_pd(fjx3
,tx
);
2197 fjy3
= _mm256_add_pd(fjy3
,ty
);
2198 fjz3
= _mm256_add_pd(fjz3
,tz
);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 r21
= _mm256_mul_pd(rsq21
,rinv21
);
2205 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
2207 /* EWALD ELECTROSTATICS */
2209 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2210 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
2211 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2212 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2213 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2214 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2216 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2217 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
2221 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2223 /* Calculate temporary vectorial force */
2224 tx
= _mm256_mul_pd(fscal
,dx21
);
2225 ty
= _mm256_mul_pd(fscal
,dy21
);
2226 tz
= _mm256_mul_pd(fscal
,dz21
);
2228 /* Update vectorial force */
2229 fix2
= _mm256_add_pd(fix2
,tx
);
2230 fiy2
= _mm256_add_pd(fiy2
,ty
);
2231 fiz2
= _mm256_add_pd(fiz2
,tz
);
2233 fjx1
= _mm256_add_pd(fjx1
,tx
);
2234 fjy1
= _mm256_add_pd(fjy1
,ty
);
2235 fjz1
= _mm256_add_pd(fjz1
,tz
);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 r22
= _mm256_mul_pd(rsq22
,rinv22
);
2242 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
2244 /* EWALD ELECTROSTATICS */
2246 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2247 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
2248 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2249 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2250 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2251 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2253 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2254 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
2258 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2260 /* Calculate temporary vectorial force */
2261 tx
= _mm256_mul_pd(fscal
,dx22
);
2262 ty
= _mm256_mul_pd(fscal
,dy22
);
2263 tz
= _mm256_mul_pd(fscal
,dz22
);
2265 /* Update vectorial force */
2266 fix2
= _mm256_add_pd(fix2
,tx
);
2267 fiy2
= _mm256_add_pd(fiy2
,ty
);
2268 fiz2
= _mm256_add_pd(fiz2
,tz
);
2270 fjx2
= _mm256_add_pd(fjx2
,tx
);
2271 fjy2
= _mm256_add_pd(fjy2
,ty
);
2272 fjz2
= _mm256_add_pd(fjz2
,tz
);
2274 /**************************
2275 * CALCULATE INTERACTIONS *
2276 **************************/
2278 r23
= _mm256_mul_pd(rsq23
,rinv23
);
2279 r23
= _mm256_andnot_pd(dummy_mask
,r23
);
2281 /* EWALD ELECTROSTATICS */
2283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2284 ewrt
= _mm256_mul_pd(r23
,ewtabscale
);
2285 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2286 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2287 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2288 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2290 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2291 felec
= _mm256_mul_pd(_mm256_mul_pd(qq23
,rinv23
),_mm256_sub_pd(rinvsq23
,felec
));
2295 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2297 /* Calculate temporary vectorial force */
2298 tx
= _mm256_mul_pd(fscal
,dx23
);
2299 ty
= _mm256_mul_pd(fscal
,dy23
);
2300 tz
= _mm256_mul_pd(fscal
,dz23
);
2302 /* Update vectorial force */
2303 fix2
= _mm256_add_pd(fix2
,tx
);
2304 fiy2
= _mm256_add_pd(fiy2
,ty
);
2305 fiz2
= _mm256_add_pd(fiz2
,tz
);
2307 fjx3
= _mm256_add_pd(fjx3
,tx
);
2308 fjy3
= _mm256_add_pd(fjy3
,ty
);
2309 fjz3
= _mm256_add_pd(fjz3
,tz
);
2311 /**************************
2312 * CALCULATE INTERACTIONS *
2313 **************************/
2315 r31
= _mm256_mul_pd(rsq31
,rinv31
);
2316 r31
= _mm256_andnot_pd(dummy_mask
,r31
);
2318 /* EWALD ELECTROSTATICS */
2320 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2321 ewrt
= _mm256_mul_pd(r31
,ewtabscale
);
2322 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2323 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2324 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2325 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2327 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2328 felec
= _mm256_mul_pd(_mm256_mul_pd(qq31
,rinv31
),_mm256_sub_pd(rinvsq31
,felec
));
2332 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2334 /* Calculate temporary vectorial force */
2335 tx
= _mm256_mul_pd(fscal
,dx31
);
2336 ty
= _mm256_mul_pd(fscal
,dy31
);
2337 tz
= _mm256_mul_pd(fscal
,dz31
);
2339 /* Update vectorial force */
2340 fix3
= _mm256_add_pd(fix3
,tx
);
2341 fiy3
= _mm256_add_pd(fiy3
,ty
);
2342 fiz3
= _mm256_add_pd(fiz3
,tz
);
2344 fjx1
= _mm256_add_pd(fjx1
,tx
);
2345 fjy1
= _mm256_add_pd(fjy1
,ty
);
2346 fjz1
= _mm256_add_pd(fjz1
,tz
);
2348 /**************************
2349 * CALCULATE INTERACTIONS *
2350 **************************/
2352 r32
= _mm256_mul_pd(rsq32
,rinv32
);
2353 r32
= _mm256_andnot_pd(dummy_mask
,r32
);
2355 /* EWALD ELECTROSTATICS */
2357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2358 ewrt
= _mm256_mul_pd(r32
,ewtabscale
);
2359 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2360 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2361 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2362 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2364 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2365 felec
= _mm256_mul_pd(_mm256_mul_pd(qq32
,rinv32
),_mm256_sub_pd(rinvsq32
,felec
));
2369 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2371 /* Calculate temporary vectorial force */
2372 tx
= _mm256_mul_pd(fscal
,dx32
);
2373 ty
= _mm256_mul_pd(fscal
,dy32
);
2374 tz
= _mm256_mul_pd(fscal
,dz32
);
2376 /* Update vectorial force */
2377 fix3
= _mm256_add_pd(fix3
,tx
);
2378 fiy3
= _mm256_add_pd(fiy3
,ty
);
2379 fiz3
= _mm256_add_pd(fiz3
,tz
);
2381 fjx2
= _mm256_add_pd(fjx2
,tx
);
2382 fjy2
= _mm256_add_pd(fjy2
,ty
);
2383 fjz2
= _mm256_add_pd(fjz2
,tz
);
2385 /**************************
2386 * CALCULATE INTERACTIONS *
2387 **************************/
2389 r33
= _mm256_mul_pd(rsq33
,rinv33
);
2390 r33
= _mm256_andnot_pd(dummy_mask
,r33
);
2392 /* EWALD ELECTROSTATICS */
2394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2395 ewrt
= _mm256_mul_pd(r33
,ewtabscale
);
2396 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2397 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2398 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2399 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2401 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2402 felec
= _mm256_mul_pd(_mm256_mul_pd(qq33
,rinv33
),_mm256_sub_pd(rinvsq33
,felec
));
2406 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2408 /* Calculate temporary vectorial force */
2409 tx
= _mm256_mul_pd(fscal
,dx33
);
2410 ty
= _mm256_mul_pd(fscal
,dy33
);
2411 tz
= _mm256_mul_pd(fscal
,dz33
);
2413 /* Update vectorial force */
2414 fix3
= _mm256_add_pd(fix3
,tx
);
2415 fiy3
= _mm256_add_pd(fiy3
,ty
);
2416 fiz3
= _mm256_add_pd(fiz3
,tz
);
2418 fjx3
= _mm256_add_pd(fjx3
,tx
);
2419 fjy3
= _mm256_add_pd(fjy3
,ty
);
2420 fjz3
= _mm256_add_pd(fjz3
,tz
);
2422 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2423 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2424 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2425 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2427 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2428 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2429 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2431 /* Inner loop uses 363 flops */
2434 /* End of innermost loop */
2436 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2437 f
+i_coord_offset
,fshift
+i_shift_offset
);
2439 /* Increment number of inner iterations */
2440 inneriter
+= j_index_end
- j_index_start
;
2442 /* Outer loop uses 24 flops */
2445 /* Increment number of outer iterations */
2448 /* Update outer/inner flops */
2450 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*363);