2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: None
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_VF_avx_256_double
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
76 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
84 real
* vdwioffsetptr0
;
85 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
86 real
* vdwioffsetptr1
;
87 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
88 real
* vdwioffsetptr2
;
89 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
97 __m256d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
98 __m256d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
99 __m256d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
100 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m256d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
103 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
108 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
109 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
111 __m256d dummy_mask
,cutoff_mask
;
112 __m128 tmpmask0
,tmpmask1
;
113 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
114 __m256d one
= _mm256_set1_pd(1.0);
115 __m256d two
= _mm256_set1_pd(2.0);
121 jindex
= nlist
->jindex
;
123 shiftidx
= nlist
->shift
;
125 shiftvec
= fr
->shift_vec
[0];
126 fshift
= fr
->fshift
[0];
127 facel
= _mm256_set1_pd(fr
->epsfac
);
128 charge
= mdatoms
->chargeA
;
130 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
131 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
132 beta2
= _mm256_mul_pd(beta
,beta
);
133 beta3
= _mm256_mul_pd(beta
,beta2
);
135 ewtab
= fr
->ic
->tabq_coul_FDV0
;
136 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
137 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
139 /* Setup water-specific parameters */
140 inr
= nlist
->iinr
[0];
141 iq0
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+0]));
142 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
143 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
145 jq0
= _mm256_set1_pd(charge
[inr
+0]);
146 jq1
= _mm256_set1_pd(charge
[inr
+1]);
147 jq2
= _mm256_set1_pd(charge
[inr
+2]);
148 qq00
= _mm256_mul_pd(iq0
,jq0
);
149 qq01
= _mm256_mul_pd(iq0
,jq1
);
150 qq02
= _mm256_mul_pd(iq0
,jq2
);
151 qq10
= _mm256_mul_pd(iq1
,jq0
);
152 qq11
= _mm256_mul_pd(iq1
,jq1
);
153 qq12
= _mm256_mul_pd(iq1
,jq2
);
154 qq20
= _mm256_mul_pd(iq2
,jq0
);
155 qq21
= _mm256_mul_pd(iq2
,jq1
);
156 qq22
= _mm256_mul_pd(iq2
,jq2
);
158 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
159 rcutoff_scalar
= fr
->rcoulomb
;
160 rcutoff
= _mm256_set1_pd(rcutoff_scalar
);
161 rcutoff2
= _mm256_mul_pd(rcutoff
,rcutoff
);
163 /* Avoid stupid compiler warnings */
164 jnrA
= jnrB
= jnrC
= jnrD
= 0;
173 for(iidx
=0;iidx
<4*DIM
;iidx
++)
178 /* Start outer loop over neighborlists */
179 for(iidx
=0; iidx
<nri
; iidx
++)
181 /* Load shift vector for this list */
182 i_shift_offset
= DIM
*shiftidx
[iidx
];
184 /* Load limits for loop over neighbors */
185 j_index_start
= jindex
[iidx
];
186 j_index_end
= jindex
[iidx
+1];
188 /* Get outer coordinate index */
190 i_coord_offset
= DIM
*inr
;
192 /* Load i particle coords and add shift vector */
193 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
194 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
196 fix0
= _mm256_setzero_pd();
197 fiy0
= _mm256_setzero_pd();
198 fiz0
= _mm256_setzero_pd();
199 fix1
= _mm256_setzero_pd();
200 fiy1
= _mm256_setzero_pd();
201 fiz1
= _mm256_setzero_pd();
202 fix2
= _mm256_setzero_pd();
203 fiy2
= _mm256_setzero_pd();
204 fiz2
= _mm256_setzero_pd();
206 /* Reset potential sums */
207 velecsum
= _mm256_setzero_pd();
209 /* Start inner kernel loop */
210 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
213 /* Get j neighbor index, and coordinate index */
218 j_coord_offsetA
= DIM
*jnrA
;
219 j_coord_offsetB
= DIM
*jnrB
;
220 j_coord_offsetC
= DIM
*jnrC
;
221 j_coord_offsetD
= DIM
*jnrD
;
223 /* load j atom coordinates */
224 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
225 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
226 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
228 /* Calculate displacement vector */
229 dx00
= _mm256_sub_pd(ix0
,jx0
);
230 dy00
= _mm256_sub_pd(iy0
,jy0
);
231 dz00
= _mm256_sub_pd(iz0
,jz0
);
232 dx01
= _mm256_sub_pd(ix0
,jx1
);
233 dy01
= _mm256_sub_pd(iy0
,jy1
);
234 dz01
= _mm256_sub_pd(iz0
,jz1
);
235 dx02
= _mm256_sub_pd(ix0
,jx2
);
236 dy02
= _mm256_sub_pd(iy0
,jy2
);
237 dz02
= _mm256_sub_pd(iz0
,jz2
);
238 dx10
= _mm256_sub_pd(ix1
,jx0
);
239 dy10
= _mm256_sub_pd(iy1
,jy0
);
240 dz10
= _mm256_sub_pd(iz1
,jz0
);
241 dx11
= _mm256_sub_pd(ix1
,jx1
);
242 dy11
= _mm256_sub_pd(iy1
,jy1
);
243 dz11
= _mm256_sub_pd(iz1
,jz1
);
244 dx12
= _mm256_sub_pd(ix1
,jx2
);
245 dy12
= _mm256_sub_pd(iy1
,jy2
);
246 dz12
= _mm256_sub_pd(iz1
,jz2
);
247 dx20
= _mm256_sub_pd(ix2
,jx0
);
248 dy20
= _mm256_sub_pd(iy2
,jy0
);
249 dz20
= _mm256_sub_pd(iz2
,jz0
);
250 dx21
= _mm256_sub_pd(ix2
,jx1
);
251 dy21
= _mm256_sub_pd(iy2
,jy1
);
252 dz21
= _mm256_sub_pd(iz2
,jz1
);
253 dx22
= _mm256_sub_pd(ix2
,jx2
);
254 dy22
= _mm256_sub_pd(iy2
,jy2
);
255 dz22
= _mm256_sub_pd(iz2
,jz2
);
257 /* Calculate squared distance and things based on it */
258 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
259 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
260 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
261 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
262 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
263 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
264 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
265 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
266 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
268 rinv00
= gmx_mm256_invsqrt_pd(rsq00
);
269 rinv01
= gmx_mm256_invsqrt_pd(rsq01
);
270 rinv02
= gmx_mm256_invsqrt_pd(rsq02
);
271 rinv10
= gmx_mm256_invsqrt_pd(rsq10
);
272 rinv11
= gmx_mm256_invsqrt_pd(rsq11
);
273 rinv12
= gmx_mm256_invsqrt_pd(rsq12
);
274 rinv20
= gmx_mm256_invsqrt_pd(rsq20
);
275 rinv21
= gmx_mm256_invsqrt_pd(rsq21
);
276 rinv22
= gmx_mm256_invsqrt_pd(rsq22
);
278 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
279 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
280 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
281 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
282 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
283 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
284 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
285 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
286 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
288 fjx0
= _mm256_setzero_pd();
289 fjy0
= _mm256_setzero_pd();
290 fjz0
= _mm256_setzero_pd();
291 fjx1
= _mm256_setzero_pd();
292 fjy1
= _mm256_setzero_pd();
293 fjz1
= _mm256_setzero_pd();
294 fjx2
= _mm256_setzero_pd();
295 fjy2
= _mm256_setzero_pd();
296 fjz2
= _mm256_setzero_pd();
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
305 r00
= _mm256_mul_pd(rsq00
,rinv00
);
307 /* EWALD ELECTROSTATICS */
309 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
310 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
311 ewitab
= _mm256_cvttpd_epi32(ewrt
);
312 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
313 ewitab
= _mm_slli_epi32(ewitab
,2);
314 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
315 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
316 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
317 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
318 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
319 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
320 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
321 velec
= _mm256_mul_pd(qq00
,_mm256_sub_pd(_mm256_sub_pd(rinv00
,sh_ewald
),velec
));
322 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
324 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
326 /* Update potential sum for this i atom from the interaction with this j atom. */
327 velec
= _mm256_and_pd(velec
,cutoff_mask
);
328 velecsum
= _mm256_add_pd(velecsum
,velec
);
332 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
334 /* Calculate temporary vectorial force */
335 tx
= _mm256_mul_pd(fscal
,dx00
);
336 ty
= _mm256_mul_pd(fscal
,dy00
);
337 tz
= _mm256_mul_pd(fscal
,dz00
);
339 /* Update vectorial force */
340 fix0
= _mm256_add_pd(fix0
,tx
);
341 fiy0
= _mm256_add_pd(fiy0
,ty
);
342 fiz0
= _mm256_add_pd(fiz0
,tz
);
344 fjx0
= _mm256_add_pd(fjx0
,tx
);
345 fjy0
= _mm256_add_pd(fjy0
,ty
);
346 fjz0
= _mm256_add_pd(fjz0
,tz
);
350 /**************************
351 * CALCULATE INTERACTIONS *
352 **************************/
354 if (gmx_mm256_any_lt(rsq01
,rcutoff2
))
357 r01
= _mm256_mul_pd(rsq01
,rinv01
);
359 /* EWALD ELECTROSTATICS */
361 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
362 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
363 ewitab
= _mm256_cvttpd_epi32(ewrt
);
364 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
365 ewitab
= _mm_slli_epi32(ewitab
,2);
366 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
367 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
368 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
369 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
370 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
371 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
372 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
373 velec
= _mm256_mul_pd(qq01
,_mm256_sub_pd(_mm256_sub_pd(rinv01
,sh_ewald
),velec
));
374 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
376 cutoff_mask
= _mm256_cmp_pd(rsq01
,rcutoff2
,_CMP_LT_OQ
);
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velec
= _mm256_and_pd(velec
,cutoff_mask
);
380 velecsum
= _mm256_add_pd(velecsum
,velec
);
384 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
386 /* Calculate temporary vectorial force */
387 tx
= _mm256_mul_pd(fscal
,dx01
);
388 ty
= _mm256_mul_pd(fscal
,dy01
);
389 tz
= _mm256_mul_pd(fscal
,dz01
);
391 /* Update vectorial force */
392 fix0
= _mm256_add_pd(fix0
,tx
);
393 fiy0
= _mm256_add_pd(fiy0
,ty
);
394 fiz0
= _mm256_add_pd(fiz0
,tz
);
396 fjx1
= _mm256_add_pd(fjx1
,tx
);
397 fjy1
= _mm256_add_pd(fjy1
,ty
);
398 fjz1
= _mm256_add_pd(fjz1
,tz
);
402 /**************************
403 * CALCULATE INTERACTIONS *
404 **************************/
406 if (gmx_mm256_any_lt(rsq02
,rcutoff2
))
409 r02
= _mm256_mul_pd(rsq02
,rinv02
);
411 /* EWALD ELECTROSTATICS */
413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
415 ewitab
= _mm256_cvttpd_epi32(ewrt
);
416 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
417 ewitab
= _mm_slli_epi32(ewitab
,2);
418 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
419 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
420 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
421 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
422 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
423 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
424 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
425 velec
= _mm256_mul_pd(qq02
,_mm256_sub_pd(_mm256_sub_pd(rinv02
,sh_ewald
),velec
));
426 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
428 cutoff_mask
= _mm256_cmp_pd(rsq02
,rcutoff2
,_CMP_LT_OQ
);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velec
= _mm256_and_pd(velec
,cutoff_mask
);
432 velecsum
= _mm256_add_pd(velecsum
,velec
);
436 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
438 /* Calculate temporary vectorial force */
439 tx
= _mm256_mul_pd(fscal
,dx02
);
440 ty
= _mm256_mul_pd(fscal
,dy02
);
441 tz
= _mm256_mul_pd(fscal
,dz02
);
443 /* Update vectorial force */
444 fix0
= _mm256_add_pd(fix0
,tx
);
445 fiy0
= _mm256_add_pd(fiy0
,ty
);
446 fiz0
= _mm256_add_pd(fiz0
,tz
);
448 fjx2
= _mm256_add_pd(fjx2
,tx
);
449 fjy2
= _mm256_add_pd(fjy2
,ty
);
450 fjz2
= _mm256_add_pd(fjz2
,tz
);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
461 r10
= _mm256_mul_pd(rsq10
,rinv10
);
463 /* EWALD ELECTROSTATICS */
465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
466 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
467 ewitab
= _mm256_cvttpd_epi32(ewrt
);
468 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
469 ewitab
= _mm_slli_epi32(ewitab
,2);
470 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
471 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
472 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
473 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
474 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
475 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
476 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
477 velec
= _mm256_mul_pd(qq10
,_mm256_sub_pd(_mm256_sub_pd(rinv10
,sh_ewald
),velec
));
478 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
480 cutoff_mask
= _mm256_cmp_pd(rsq10
,rcutoff2
,_CMP_LT_OQ
);
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velec
= _mm256_and_pd(velec
,cutoff_mask
);
484 velecsum
= _mm256_add_pd(velecsum
,velec
);
488 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
490 /* Calculate temporary vectorial force */
491 tx
= _mm256_mul_pd(fscal
,dx10
);
492 ty
= _mm256_mul_pd(fscal
,dy10
);
493 tz
= _mm256_mul_pd(fscal
,dz10
);
495 /* Update vectorial force */
496 fix1
= _mm256_add_pd(fix1
,tx
);
497 fiy1
= _mm256_add_pd(fiy1
,ty
);
498 fiz1
= _mm256_add_pd(fiz1
,tz
);
500 fjx0
= _mm256_add_pd(fjx0
,tx
);
501 fjy0
= _mm256_add_pd(fjy0
,ty
);
502 fjz0
= _mm256_add_pd(fjz0
,tz
);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
513 r11
= _mm256_mul_pd(rsq11
,rinv11
);
515 /* EWALD ELECTROSTATICS */
517 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
519 ewitab
= _mm256_cvttpd_epi32(ewrt
);
520 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
521 ewitab
= _mm_slli_epi32(ewitab
,2);
522 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
523 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
524 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
525 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
526 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
527 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
528 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
529 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_sub_pd(rinv11
,sh_ewald
),velec
));
530 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
532 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velec
= _mm256_and_pd(velec
,cutoff_mask
);
536 velecsum
= _mm256_add_pd(velecsum
,velec
);
540 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
542 /* Calculate temporary vectorial force */
543 tx
= _mm256_mul_pd(fscal
,dx11
);
544 ty
= _mm256_mul_pd(fscal
,dy11
);
545 tz
= _mm256_mul_pd(fscal
,dz11
);
547 /* Update vectorial force */
548 fix1
= _mm256_add_pd(fix1
,tx
);
549 fiy1
= _mm256_add_pd(fiy1
,ty
);
550 fiz1
= _mm256_add_pd(fiz1
,tz
);
552 fjx1
= _mm256_add_pd(fjx1
,tx
);
553 fjy1
= _mm256_add_pd(fjy1
,ty
);
554 fjz1
= _mm256_add_pd(fjz1
,tz
);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
565 r12
= _mm256_mul_pd(rsq12
,rinv12
);
567 /* EWALD ELECTROSTATICS */
569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
570 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
571 ewitab
= _mm256_cvttpd_epi32(ewrt
);
572 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
573 ewitab
= _mm_slli_epi32(ewitab
,2);
574 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
575 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
576 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
577 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
578 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
579 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
580 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
581 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_sub_pd(rinv12
,sh_ewald
),velec
));
582 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
584 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velec
= _mm256_and_pd(velec
,cutoff_mask
);
588 velecsum
= _mm256_add_pd(velecsum
,velec
);
592 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
594 /* Calculate temporary vectorial force */
595 tx
= _mm256_mul_pd(fscal
,dx12
);
596 ty
= _mm256_mul_pd(fscal
,dy12
);
597 tz
= _mm256_mul_pd(fscal
,dz12
);
599 /* Update vectorial force */
600 fix1
= _mm256_add_pd(fix1
,tx
);
601 fiy1
= _mm256_add_pd(fiy1
,ty
);
602 fiz1
= _mm256_add_pd(fiz1
,tz
);
604 fjx2
= _mm256_add_pd(fjx2
,tx
);
605 fjy2
= _mm256_add_pd(fjy2
,ty
);
606 fjz2
= _mm256_add_pd(fjz2
,tz
);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
617 r20
= _mm256_mul_pd(rsq20
,rinv20
);
619 /* EWALD ELECTROSTATICS */
621 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
622 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
623 ewitab
= _mm256_cvttpd_epi32(ewrt
);
624 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
625 ewitab
= _mm_slli_epi32(ewitab
,2);
626 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
627 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
628 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
629 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
630 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
631 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
632 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
633 velec
= _mm256_mul_pd(qq20
,_mm256_sub_pd(_mm256_sub_pd(rinv20
,sh_ewald
),velec
));
634 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
636 cutoff_mask
= _mm256_cmp_pd(rsq20
,rcutoff2
,_CMP_LT_OQ
);
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velec
= _mm256_and_pd(velec
,cutoff_mask
);
640 velecsum
= _mm256_add_pd(velecsum
,velec
);
644 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
646 /* Calculate temporary vectorial force */
647 tx
= _mm256_mul_pd(fscal
,dx20
);
648 ty
= _mm256_mul_pd(fscal
,dy20
);
649 tz
= _mm256_mul_pd(fscal
,dz20
);
651 /* Update vectorial force */
652 fix2
= _mm256_add_pd(fix2
,tx
);
653 fiy2
= _mm256_add_pd(fiy2
,ty
);
654 fiz2
= _mm256_add_pd(fiz2
,tz
);
656 fjx0
= _mm256_add_pd(fjx0
,tx
);
657 fjy0
= _mm256_add_pd(fjy0
,ty
);
658 fjz0
= _mm256_add_pd(fjz0
,tz
);
662 /**************************
663 * CALCULATE INTERACTIONS *
664 **************************/
666 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
669 r21
= _mm256_mul_pd(rsq21
,rinv21
);
671 /* EWALD ELECTROSTATICS */
673 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
674 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
675 ewitab
= _mm256_cvttpd_epi32(ewrt
);
676 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
677 ewitab
= _mm_slli_epi32(ewitab
,2);
678 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
679 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
680 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
681 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
682 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
683 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
684 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
685 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_sub_pd(rinv21
,sh_ewald
),velec
));
686 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
688 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec
= _mm256_and_pd(velec
,cutoff_mask
);
692 velecsum
= _mm256_add_pd(velecsum
,velec
);
696 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
698 /* Calculate temporary vectorial force */
699 tx
= _mm256_mul_pd(fscal
,dx21
);
700 ty
= _mm256_mul_pd(fscal
,dy21
);
701 tz
= _mm256_mul_pd(fscal
,dz21
);
703 /* Update vectorial force */
704 fix2
= _mm256_add_pd(fix2
,tx
);
705 fiy2
= _mm256_add_pd(fiy2
,ty
);
706 fiz2
= _mm256_add_pd(fiz2
,tz
);
708 fjx1
= _mm256_add_pd(fjx1
,tx
);
709 fjy1
= _mm256_add_pd(fjy1
,ty
);
710 fjz1
= _mm256_add_pd(fjz1
,tz
);
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
721 r22
= _mm256_mul_pd(rsq22
,rinv22
);
723 /* EWALD ELECTROSTATICS */
725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
727 ewitab
= _mm256_cvttpd_epi32(ewrt
);
728 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
729 ewitab
= _mm_slli_epi32(ewitab
,2);
730 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
731 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
732 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
733 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
734 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
735 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
736 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
737 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_sub_pd(rinv22
,sh_ewald
),velec
));
738 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
740 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
742 /* Update potential sum for this i atom from the interaction with this j atom. */
743 velec
= _mm256_and_pd(velec
,cutoff_mask
);
744 velecsum
= _mm256_add_pd(velecsum
,velec
);
748 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
750 /* Calculate temporary vectorial force */
751 tx
= _mm256_mul_pd(fscal
,dx22
);
752 ty
= _mm256_mul_pd(fscal
,dy22
);
753 tz
= _mm256_mul_pd(fscal
,dz22
);
755 /* Update vectorial force */
756 fix2
= _mm256_add_pd(fix2
,tx
);
757 fiy2
= _mm256_add_pd(fiy2
,ty
);
758 fiz2
= _mm256_add_pd(fiz2
,tz
);
760 fjx2
= _mm256_add_pd(fjx2
,tx
);
761 fjy2
= _mm256_add_pd(fjy2
,ty
);
762 fjz2
= _mm256_add_pd(fjz2
,tz
);
766 fjptrA
= f
+j_coord_offsetA
;
767 fjptrB
= f
+j_coord_offsetB
;
768 fjptrC
= f
+j_coord_offsetC
;
769 fjptrD
= f
+j_coord_offsetD
;
771 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
772 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
774 /* Inner loop uses 414 flops */
780 /* Get j neighbor index, and coordinate index */
781 jnrlistA
= jjnr
[jidx
];
782 jnrlistB
= jjnr
[jidx
+1];
783 jnrlistC
= jjnr
[jidx
+2];
784 jnrlistD
= jjnr
[jidx
+3];
785 /* Sign of each element will be negative for non-real atoms.
786 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
787 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
789 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
791 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
792 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
793 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
795 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
796 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
797 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
798 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
799 j_coord_offsetA
= DIM
*jnrA
;
800 j_coord_offsetB
= DIM
*jnrB
;
801 j_coord_offsetC
= DIM
*jnrC
;
802 j_coord_offsetD
= DIM
*jnrD
;
804 /* load j atom coordinates */
805 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
806 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
807 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
809 /* Calculate displacement vector */
810 dx00
= _mm256_sub_pd(ix0
,jx0
);
811 dy00
= _mm256_sub_pd(iy0
,jy0
);
812 dz00
= _mm256_sub_pd(iz0
,jz0
);
813 dx01
= _mm256_sub_pd(ix0
,jx1
);
814 dy01
= _mm256_sub_pd(iy0
,jy1
);
815 dz01
= _mm256_sub_pd(iz0
,jz1
);
816 dx02
= _mm256_sub_pd(ix0
,jx2
);
817 dy02
= _mm256_sub_pd(iy0
,jy2
);
818 dz02
= _mm256_sub_pd(iz0
,jz2
);
819 dx10
= _mm256_sub_pd(ix1
,jx0
);
820 dy10
= _mm256_sub_pd(iy1
,jy0
);
821 dz10
= _mm256_sub_pd(iz1
,jz0
);
822 dx11
= _mm256_sub_pd(ix1
,jx1
);
823 dy11
= _mm256_sub_pd(iy1
,jy1
);
824 dz11
= _mm256_sub_pd(iz1
,jz1
);
825 dx12
= _mm256_sub_pd(ix1
,jx2
);
826 dy12
= _mm256_sub_pd(iy1
,jy2
);
827 dz12
= _mm256_sub_pd(iz1
,jz2
);
828 dx20
= _mm256_sub_pd(ix2
,jx0
);
829 dy20
= _mm256_sub_pd(iy2
,jy0
);
830 dz20
= _mm256_sub_pd(iz2
,jz0
);
831 dx21
= _mm256_sub_pd(ix2
,jx1
);
832 dy21
= _mm256_sub_pd(iy2
,jy1
);
833 dz21
= _mm256_sub_pd(iz2
,jz1
);
834 dx22
= _mm256_sub_pd(ix2
,jx2
);
835 dy22
= _mm256_sub_pd(iy2
,jy2
);
836 dz22
= _mm256_sub_pd(iz2
,jz2
);
838 /* Calculate squared distance and things based on it */
839 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
840 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
841 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
842 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
843 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
844 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
845 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
846 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
847 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
849 rinv00
= gmx_mm256_invsqrt_pd(rsq00
);
850 rinv01
= gmx_mm256_invsqrt_pd(rsq01
);
851 rinv02
= gmx_mm256_invsqrt_pd(rsq02
);
852 rinv10
= gmx_mm256_invsqrt_pd(rsq10
);
853 rinv11
= gmx_mm256_invsqrt_pd(rsq11
);
854 rinv12
= gmx_mm256_invsqrt_pd(rsq12
);
855 rinv20
= gmx_mm256_invsqrt_pd(rsq20
);
856 rinv21
= gmx_mm256_invsqrt_pd(rsq21
);
857 rinv22
= gmx_mm256_invsqrt_pd(rsq22
);
859 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
860 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
861 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
862 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
863 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
864 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
865 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
866 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
867 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
869 fjx0
= _mm256_setzero_pd();
870 fjy0
= _mm256_setzero_pd();
871 fjz0
= _mm256_setzero_pd();
872 fjx1
= _mm256_setzero_pd();
873 fjy1
= _mm256_setzero_pd();
874 fjz1
= _mm256_setzero_pd();
875 fjx2
= _mm256_setzero_pd();
876 fjy2
= _mm256_setzero_pd();
877 fjz2
= _mm256_setzero_pd();
879 /**************************
880 * CALCULATE INTERACTIONS *
881 **************************/
883 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
886 r00
= _mm256_mul_pd(rsq00
,rinv00
);
887 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
889 /* EWALD ELECTROSTATICS */
891 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
892 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
893 ewitab
= _mm256_cvttpd_epi32(ewrt
);
894 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
895 ewitab
= _mm_slli_epi32(ewitab
,2);
896 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
897 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
898 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
899 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
900 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
901 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
902 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
903 velec
= _mm256_mul_pd(qq00
,_mm256_sub_pd(_mm256_sub_pd(rinv00
,sh_ewald
),velec
));
904 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
906 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
908 /* Update potential sum for this i atom from the interaction with this j atom. */
909 velec
= _mm256_and_pd(velec
,cutoff_mask
);
910 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
911 velecsum
= _mm256_add_pd(velecsum
,velec
);
915 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
917 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
919 /* Calculate temporary vectorial force */
920 tx
= _mm256_mul_pd(fscal
,dx00
);
921 ty
= _mm256_mul_pd(fscal
,dy00
);
922 tz
= _mm256_mul_pd(fscal
,dz00
);
924 /* Update vectorial force */
925 fix0
= _mm256_add_pd(fix0
,tx
);
926 fiy0
= _mm256_add_pd(fiy0
,ty
);
927 fiz0
= _mm256_add_pd(fiz0
,tz
);
929 fjx0
= _mm256_add_pd(fjx0
,tx
);
930 fjy0
= _mm256_add_pd(fjy0
,ty
);
931 fjz0
= _mm256_add_pd(fjz0
,tz
);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 if (gmx_mm256_any_lt(rsq01
,rcutoff2
))
942 r01
= _mm256_mul_pd(rsq01
,rinv01
);
943 r01
= _mm256_andnot_pd(dummy_mask
,r01
);
945 /* EWALD ELECTROSTATICS */
947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
948 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
949 ewitab
= _mm256_cvttpd_epi32(ewrt
);
950 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
951 ewitab
= _mm_slli_epi32(ewitab
,2);
952 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
953 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
954 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
955 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
956 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
957 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
958 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
959 velec
= _mm256_mul_pd(qq01
,_mm256_sub_pd(_mm256_sub_pd(rinv01
,sh_ewald
),velec
));
960 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
962 cutoff_mask
= _mm256_cmp_pd(rsq01
,rcutoff2
,_CMP_LT_OQ
);
964 /* Update potential sum for this i atom from the interaction with this j atom. */
965 velec
= _mm256_and_pd(velec
,cutoff_mask
);
966 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
967 velecsum
= _mm256_add_pd(velecsum
,velec
);
971 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
973 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
975 /* Calculate temporary vectorial force */
976 tx
= _mm256_mul_pd(fscal
,dx01
);
977 ty
= _mm256_mul_pd(fscal
,dy01
);
978 tz
= _mm256_mul_pd(fscal
,dz01
);
980 /* Update vectorial force */
981 fix0
= _mm256_add_pd(fix0
,tx
);
982 fiy0
= _mm256_add_pd(fiy0
,ty
);
983 fiz0
= _mm256_add_pd(fiz0
,tz
);
985 fjx1
= _mm256_add_pd(fjx1
,tx
);
986 fjy1
= _mm256_add_pd(fjy1
,ty
);
987 fjz1
= _mm256_add_pd(fjz1
,tz
);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 if (gmx_mm256_any_lt(rsq02
,rcutoff2
))
998 r02
= _mm256_mul_pd(rsq02
,rinv02
);
999 r02
= _mm256_andnot_pd(dummy_mask
,r02
);
1001 /* EWALD ELECTROSTATICS */
1003 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1004 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
1005 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1006 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1007 ewitab
= _mm_slli_epi32(ewitab
,2);
1008 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1009 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1010 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1011 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1012 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1013 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1014 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1015 velec
= _mm256_mul_pd(qq02
,_mm256_sub_pd(_mm256_sub_pd(rinv02
,sh_ewald
),velec
));
1016 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
1018 cutoff_mask
= _mm256_cmp_pd(rsq02
,rcutoff2
,_CMP_LT_OQ
);
1020 /* Update potential sum for this i atom from the interaction with this j atom. */
1021 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1022 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1023 velecsum
= _mm256_add_pd(velecsum
,velec
);
1027 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1029 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1031 /* Calculate temporary vectorial force */
1032 tx
= _mm256_mul_pd(fscal
,dx02
);
1033 ty
= _mm256_mul_pd(fscal
,dy02
);
1034 tz
= _mm256_mul_pd(fscal
,dz02
);
1036 /* Update vectorial force */
1037 fix0
= _mm256_add_pd(fix0
,tx
);
1038 fiy0
= _mm256_add_pd(fiy0
,ty
);
1039 fiz0
= _mm256_add_pd(fiz0
,tz
);
1041 fjx2
= _mm256_add_pd(fjx2
,tx
);
1042 fjy2
= _mm256_add_pd(fjy2
,ty
);
1043 fjz2
= _mm256_add_pd(fjz2
,tz
);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
1054 r10
= _mm256_mul_pd(rsq10
,rinv10
);
1055 r10
= _mm256_andnot_pd(dummy_mask
,r10
);
1057 /* EWALD ELECTROSTATICS */
1059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
1061 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1062 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1063 ewitab
= _mm_slli_epi32(ewitab
,2);
1064 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1065 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1066 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1067 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1068 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1069 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1070 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1071 velec
= _mm256_mul_pd(qq10
,_mm256_sub_pd(_mm256_sub_pd(rinv10
,sh_ewald
),velec
));
1072 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
1074 cutoff_mask
= _mm256_cmp_pd(rsq10
,rcutoff2
,_CMP_LT_OQ
);
1076 /* Update potential sum for this i atom from the interaction with this j atom. */
1077 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1078 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1079 velecsum
= _mm256_add_pd(velecsum
,velec
);
1083 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1085 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1087 /* Calculate temporary vectorial force */
1088 tx
= _mm256_mul_pd(fscal
,dx10
);
1089 ty
= _mm256_mul_pd(fscal
,dy10
);
1090 tz
= _mm256_mul_pd(fscal
,dz10
);
1092 /* Update vectorial force */
1093 fix1
= _mm256_add_pd(fix1
,tx
);
1094 fiy1
= _mm256_add_pd(fiy1
,ty
);
1095 fiz1
= _mm256_add_pd(fiz1
,tz
);
1097 fjx0
= _mm256_add_pd(fjx0
,tx
);
1098 fjy0
= _mm256_add_pd(fjy0
,ty
);
1099 fjz0
= _mm256_add_pd(fjz0
,tz
);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
1110 r11
= _mm256_mul_pd(rsq11
,rinv11
);
1111 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
1113 /* EWALD ELECTROSTATICS */
1115 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1116 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
1117 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1118 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1119 ewitab
= _mm_slli_epi32(ewitab
,2);
1120 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1121 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1122 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1123 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1124 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1125 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1126 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1127 velec
= _mm256_mul_pd(qq11
,_mm256_sub_pd(_mm256_sub_pd(rinv11
,sh_ewald
),velec
));
1128 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
1130 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
1132 /* Update potential sum for this i atom from the interaction with this j atom. */
1133 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1134 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1135 velecsum
= _mm256_add_pd(velecsum
,velec
);
1139 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1141 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1143 /* Calculate temporary vectorial force */
1144 tx
= _mm256_mul_pd(fscal
,dx11
);
1145 ty
= _mm256_mul_pd(fscal
,dy11
);
1146 tz
= _mm256_mul_pd(fscal
,dz11
);
1148 /* Update vectorial force */
1149 fix1
= _mm256_add_pd(fix1
,tx
);
1150 fiy1
= _mm256_add_pd(fiy1
,ty
);
1151 fiz1
= _mm256_add_pd(fiz1
,tz
);
1153 fjx1
= _mm256_add_pd(fjx1
,tx
);
1154 fjy1
= _mm256_add_pd(fjy1
,ty
);
1155 fjz1
= _mm256_add_pd(fjz1
,tz
);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
1166 r12
= _mm256_mul_pd(rsq12
,rinv12
);
1167 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
1169 /* EWALD ELECTROSTATICS */
1171 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1172 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
1173 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1174 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1175 ewitab
= _mm_slli_epi32(ewitab
,2);
1176 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1177 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1178 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1179 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1180 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1181 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1182 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1183 velec
= _mm256_mul_pd(qq12
,_mm256_sub_pd(_mm256_sub_pd(rinv12
,sh_ewald
),velec
));
1184 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1186 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1190 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1191 velecsum
= _mm256_add_pd(velecsum
,velec
);
1195 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1197 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1199 /* Calculate temporary vectorial force */
1200 tx
= _mm256_mul_pd(fscal
,dx12
);
1201 ty
= _mm256_mul_pd(fscal
,dy12
);
1202 tz
= _mm256_mul_pd(fscal
,dz12
);
1204 /* Update vectorial force */
1205 fix1
= _mm256_add_pd(fix1
,tx
);
1206 fiy1
= _mm256_add_pd(fiy1
,ty
);
1207 fiz1
= _mm256_add_pd(fiz1
,tz
);
1209 fjx2
= _mm256_add_pd(fjx2
,tx
);
1210 fjy2
= _mm256_add_pd(fjy2
,ty
);
1211 fjz2
= _mm256_add_pd(fjz2
,tz
);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
1222 r20
= _mm256_mul_pd(rsq20
,rinv20
);
1223 r20
= _mm256_andnot_pd(dummy_mask
,r20
);
1225 /* EWALD ELECTROSTATICS */
1227 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1228 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
1229 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1230 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1231 ewitab
= _mm_slli_epi32(ewitab
,2);
1232 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1233 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1234 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1235 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1236 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1237 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1238 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1239 velec
= _mm256_mul_pd(qq20
,_mm256_sub_pd(_mm256_sub_pd(rinv20
,sh_ewald
),velec
));
1240 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
1242 cutoff_mask
= _mm256_cmp_pd(rsq20
,rcutoff2
,_CMP_LT_OQ
);
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1246 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1247 velecsum
= _mm256_add_pd(velecsum
,velec
);
1251 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1253 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1255 /* Calculate temporary vectorial force */
1256 tx
= _mm256_mul_pd(fscal
,dx20
);
1257 ty
= _mm256_mul_pd(fscal
,dy20
);
1258 tz
= _mm256_mul_pd(fscal
,dz20
);
1260 /* Update vectorial force */
1261 fix2
= _mm256_add_pd(fix2
,tx
);
1262 fiy2
= _mm256_add_pd(fiy2
,ty
);
1263 fiz2
= _mm256_add_pd(fiz2
,tz
);
1265 fjx0
= _mm256_add_pd(fjx0
,tx
);
1266 fjy0
= _mm256_add_pd(fjy0
,ty
);
1267 fjz0
= _mm256_add_pd(fjz0
,tz
);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
1278 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1279 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
1281 /* EWALD ELECTROSTATICS */
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1285 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1286 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1287 ewitab
= _mm_slli_epi32(ewitab
,2);
1288 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1289 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1290 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1291 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1292 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1293 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1294 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1295 velec
= _mm256_mul_pd(qq21
,_mm256_sub_pd(_mm256_sub_pd(rinv21
,sh_ewald
),velec
));
1296 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1298 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
1300 /* Update potential sum for this i atom from the interaction with this j atom. */
1301 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1302 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1303 velecsum
= _mm256_add_pd(velecsum
,velec
);
1307 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1309 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1311 /* Calculate temporary vectorial force */
1312 tx
= _mm256_mul_pd(fscal
,dx21
);
1313 ty
= _mm256_mul_pd(fscal
,dy21
);
1314 tz
= _mm256_mul_pd(fscal
,dz21
);
1316 /* Update vectorial force */
1317 fix2
= _mm256_add_pd(fix2
,tx
);
1318 fiy2
= _mm256_add_pd(fiy2
,ty
);
1319 fiz2
= _mm256_add_pd(fiz2
,tz
);
1321 fjx1
= _mm256_add_pd(fjx1
,tx
);
1322 fjy1
= _mm256_add_pd(fjy1
,ty
);
1323 fjz1
= _mm256_add_pd(fjz1
,tz
);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
1334 r22
= _mm256_mul_pd(rsq22
,rinv22
);
1335 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
1337 /* EWALD ELECTROSTATICS */
1339 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1340 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
1341 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1342 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1343 ewitab
= _mm_slli_epi32(ewitab
,2);
1344 ewtabF
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,0) );
1345 ewtabD
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,1) );
1346 ewtabV
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,2) );
1347 ewtabFn
= _mm256_load_pd( ewtab
+ _mm_extract_epi32(ewitab
,3) );
1348 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF
,ewtabD
,ewtabV
,ewtabFn
);
1349 felec
= _mm256_add_pd(ewtabF
,_mm256_mul_pd(eweps
,ewtabD
));
1350 velec
= _mm256_sub_pd(ewtabV
,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace
,eweps
),_mm256_add_pd(ewtabF
,felec
)));
1351 velec
= _mm256_mul_pd(qq22
,_mm256_sub_pd(_mm256_sub_pd(rinv22
,sh_ewald
),velec
));
1352 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
1354 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
1356 /* Update potential sum for this i atom from the interaction with this j atom. */
1357 velec
= _mm256_and_pd(velec
,cutoff_mask
);
1358 velec
= _mm256_andnot_pd(dummy_mask
,velec
);
1359 velecsum
= _mm256_add_pd(velecsum
,velec
);
1363 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1365 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
1367 /* Calculate temporary vectorial force */
1368 tx
= _mm256_mul_pd(fscal
,dx22
);
1369 ty
= _mm256_mul_pd(fscal
,dy22
);
1370 tz
= _mm256_mul_pd(fscal
,dz22
);
1372 /* Update vectorial force */
1373 fix2
= _mm256_add_pd(fix2
,tx
);
1374 fiy2
= _mm256_add_pd(fiy2
,ty
);
1375 fiz2
= _mm256_add_pd(fiz2
,tz
);
1377 fjx2
= _mm256_add_pd(fjx2
,tx
);
1378 fjy2
= _mm256_add_pd(fjy2
,ty
);
1379 fjz2
= _mm256_add_pd(fjz2
,tz
);
1383 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1384 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1385 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1386 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1388 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1389 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1391 /* Inner loop uses 423 flops */
1394 /* End of innermost loop */
1396 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1397 f
+i_coord_offset
,fshift
+i_shift_offset
);
1400 /* Update potential energies */
1401 gmx_mm256_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1403 /* Increment number of inner iterations */
1404 inneriter
+= j_index_end
- j_index_start
;
1406 /* Outer loop uses 19 flops */
1409 /* Increment number of outer iterations */
1412 /* Update outer/inner flops */
1414 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3W3_VF
,outeriter
*19 + inneriter
*423);
1417 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double
1418 * Electrostatics interaction: Ewald
1419 * VdW interaction: None
1420 * Geometry: Water3-Water3
1421 * Calculate force/pot: Force
1424 nb_kernel_ElecEwSh_VdwNone_GeomW3W3_F_avx_256_double
1425 (t_nblist
* gmx_restrict nlist
,
1426 rvec
* gmx_restrict xx
,
1427 rvec
* gmx_restrict ff
,
1428 t_forcerec
* gmx_restrict fr
,
1429 t_mdatoms
* gmx_restrict mdatoms
,
1430 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1431 t_nrnb
* gmx_restrict nrnb
)
1433 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1434 * just 0 for non-waters.
1435 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1436 * jnr indices corresponding to data put in the four positions in the SIMD register.
1438 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1439 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1440 int jnrA
,jnrB
,jnrC
,jnrD
;
1441 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1442 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1443 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1444 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1445 real rcutoff_scalar
;
1446 real
*shiftvec
,*fshift
,*x
,*f
;
1447 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1448 real scratch
[4*DIM
];
1449 __m256d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1450 real
* vdwioffsetptr0
;
1451 __m256d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1452 real
* vdwioffsetptr1
;
1453 __m256d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1454 real
* vdwioffsetptr2
;
1455 __m256d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1456 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1457 __m256d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1458 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1459 __m256d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1460 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1461 __m256d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1462 __m256d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1463 __m256d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1464 __m256d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1465 __m256d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1466 __m256d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1467 __m256d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1468 __m256d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1469 __m256d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1470 __m256d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1471 __m256d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1474 __m256d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1475 __m256d beta
,beta2
,beta3
,zeta2
,pmecorrF
,pmecorrV
,rinv3
;
1477 __m256d dummy_mask
,cutoff_mask
;
1478 __m128 tmpmask0
,tmpmask1
;
1479 __m256d signbit
= _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1480 __m256d one
= _mm256_set1_pd(1.0);
1481 __m256d two
= _mm256_set1_pd(2.0);
1487 jindex
= nlist
->jindex
;
1489 shiftidx
= nlist
->shift
;
1491 shiftvec
= fr
->shift_vec
[0];
1492 fshift
= fr
->fshift
[0];
1493 facel
= _mm256_set1_pd(fr
->epsfac
);
1494 charge
= mdatoms
->chargeA
;
1496 sh_ewald
= _mm256_set1_pd(fr
->ic
->sh_ewald
);
1497 beta
= _mm256_set1_pd(fr
->ic
->ewaldcoeff_q
);
1498 beta2
= _mm256_mul_pd(beta
,beta
);
1499 beta3
= _mm256_mul_pd(beta
,beta2
);
1501 ewtab
= fr
->ic
->tabq_coul_F
;
1502 ewtabscale
= _mm256_set1_pd(fr
->ic
->tabq_scale
);
1503 ewtabhalfspace
= _mm256_set1_pd(0.5/fr
->ic
->tabq_scale
);
1505 /* Setup water-specific parameters */
1506 inr
= nlist
->iinr
[0];
1507 iq0
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+0]));
1508 iq1
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+1]));
1509 iq2
= _mm256_mul_pd(facel
,_mm256_set1_pd(charge
[inr
+2]));
1511 jq0
= _mm256_set1_pd(charge
[inr
+0]);
1512 jq1
= _mm256_set1_pd(charge
[inr
+1]);
1513 jq2
= _mm256_set1_pd(charge
[inr
+2]);
1514 qq00
= _mm256_mul_pd(iq0
,jq0
);
1515 qq01
= _mm256_mul_pd(iq0
,jq1
);
1516 qq02
= _mm256_mul_pd(iq0
,jq2
);
1517 qq10
= _mm256_mul_pd(iq1
,jq0
);
1518 qq11
= _mm256_mul_pd(iq1
,jq1
);
1519 qq12
= _mm256_mul_pd(iq1
,jq2
);
1520 qq20
= _mm256_mul_pd(iq2
,jq0
);
1521 qq21
= _mm256_mul_pd(iq2
,jq1
);
1522 qq22
= _mm256_mul_pd(iq2
,jq2
);
1524 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1525 rcutoff_scalar
= fr
->rcoulomb
;
1526 rcutoff
= _mm256_set1_pd(rcutoff_scalar
);
1527 rcutoff2
= _mm256_mul_pd(rcutoff
,rcutoff
);
1529 /* Avoid stupid compiler warnings */
1530 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1531 j_coord_offsetA
= 0;
1532 j_coord_offsetB
= 0;
1533 j_coord_offsetC
= 0;
1534 j_coord_offsetD
= 0;
1539 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1541 scratch
[iidx
] = 0.0;
1544 /* Start outer loop over neighborlists */
1545 for(iidx
=0; iidx
<nri
; iidx
++)
1547 /* Load shift vector for this list */
1548 i_shift_offset
= DIM
*shiftidx
[iidx
];
1550 /* Load limits for loop over neighbors */
1551 j_index_start
= jindex
[iidx
];
1552 j_index_end
= jindex
[iidx
+1];
1554 /* Get outer coordinate index */
1556 i_coord_offset
= DIM
*inr
;
1558 /* Load i particle coords and add shift vector */
1559 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1560 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1562 fix0
= _mm256_setzero_pd();
1563 fiy0
= _mm256_setzero_pd();
1564 fiz0
= _mm256_setzero_pd();
1565 fix1
= _mm256_setzero_pd();
1566 fiy1
= _mm256_setzero_pd();
1567 fiz1
= _mm256_setzero_pd();
1568 fix2
= _mm256_setzero_pd();
1569 fiy2
= _mm256_setzero_pd();
1570 fiz2
= _mm256_setzero_pd();
1572 /* Start inner kernel loop */
1573 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1576 /* Get j neighbor index, and coordinate index */
1578 jnrB
= jjnr
[jidx
+1];
1579 jnrC
= jjnr
[jidx
+2];
1580 jnrD
= jjnr
[jidx
+3];
1581 j_coord_offsetA
= DIM
*jnrA
;
1582 j_coord_offsetB
= DIM
*jnrB
;
1583 j_coord_offsetC
= DIM
*jnrC
;
1584 j_coord_offsetD
= DIM
*jnrD
;
1586 /* load j atom coordinates */
1587 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1588 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1589 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1591 /* Calculate displacement vector */
1592 dx00
= _mm256_sub_pd(ix0
,jx0
);
1593 dy00
= _mm256_sub_pd(iy0
,jy0
);
1594 dz00
= _mm256_sub_pd(iz0
,jz0
);
1595 dx01
= _mm256_sub_pd(ix0
,jx1
);
1596 dy01
= _mm256_sub_pd(iy0
,jy1
);
1597 dz01
= _mm256_sub_pd(iz0
,jz1
);
1598 dx02
= _mm256_sub_pd(ix0
,jx2
);
1599 dy02
= _mm256_sub_pd(iy0
,jy2
);
1600 dz02
= _mm256_sub_pd(iz0
,jz2
);
1601 dx10
= _mm256_sub_pd(ix1
,jx0
);
1602 dy10
= _mm256_sub_pd(iy1
,jy0
);
1603 dz10
= _mm256_sub_pd(iz1
,jz0
);
1604 dx11
= _mm256_sub_pd(ix1
,jx1
);
1605 dy11
= _mm256_sub_pd(iy1
,jy1
);
1606 dz11
= _mm256_sub_pd(iz1
,jz1
);
1607 dx12
= _mm256_sub_pd(ix1
,jx2
);
1608 dy12
= _mm256_sub_pd(iy1
,jy2
);
1609 dz12
= _mm256_sub_pd(iz1
,jz2
);
1610 dx20
= _mm256_sub_pd(ix2
,jx0
);
1611 dy20
= _mm256_sub_pd(iy2
,jy0
);
1612 dz20
= _mm256_sub_pd(iz2
,jz0
);
1613 dx21
= _mm256_sub_pd(ix2
,jx1
);
1614 dy21
= _mm256_sub_pd(iy2
,jy1
);
1615 dz21
= _mm256_sub_pd(iz2
,jz1
);
1616 dx22
= _mm256_sub_pd(ix2
,jx2
);
1617 dy22
= _mm256_sub_pd(iy2
,jy2
);
1618 dz22
= _mm256_sub_pd(iz2
,jz2
);
1620 /* Calculate squared distance and things based on it */
1621 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
1622 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
1623 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
1624 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
1625 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
1626 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
1627 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
1628 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
1629 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
1631 rinv00
= gmx_mm256_invsqrt_pd(rsq00
);
1632 rinv01
= gmx_mm256_invsqrt_pd(rsq01
);
1633 rinv02
= gmx_mm256_invsqrt_pd(rsq02
);
1634 rinv10
= gmx_mm256_invsqrt_pd(rsq10
);
1635 rinv11
= gmx_mm256_invsqrt_pd(rsq11
);
1636 rinv12
= gmx_mm256_invsqrt_pd(rsq12
);
1637 rinv20
= gmx_mm256_invsqrt_pd(rsq20
);
1638 rinv21
= gmx_mm256_invsqrt_pd(rsq21
);
1639 rinv22
= gmx_mm256_invsqrt_pd(rsq22
);
1641 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
1642 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
1643 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
1644 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
1645 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
1646 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
1647 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
1648 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
1649 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
1651 fjx0
= _mm256_setzero_pd();
1652 fjy0
= _mm256_setzero_pd();
1653 fjz0
= _mm256_setzero_pd();
1654 fjx1
= _mm256_setzero_pd();
1655 fjy1
= _mm256_setzero_pd();
1656 fjz1
= _mm256_setzero_pd();
1657 fjx2
= _mm256_setzero_pd();
1658 fjy2
= _mm256_setzero_pd();
1659 fjz2
= _mm256_setzero_pd();
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
1668 r00
= _mm256_mul_pd(rsq00
,rinv00
);
1670 /* EWALD ELECTROSTATICS */
1672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1673 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
1674 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1675 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1676 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1677 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1679 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1680 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
1682 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
1686 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1688 /* Calculate temporary vectorial force */
1689 tx
= _mm256_mul_pd(fscal
,dx00
);
1690 ty
= _mm256_mul_pd(fscal
,dy00
);
1691 tz
= _mm256_mul_pd(fscal
,dz00
);
1693 /* Update vectorial force */
1694 fix0
= _mm256_add_pd(fix0
,tx
);
1695 fiy0
= _mm256_add_pd(fiy0
,ty
);
1696 fiz0
= _mm256_add_pd(fiz0
,tz
);
1698 fjx0
= _mm256_add_pd(fjx0
,tx
);
1699 fjy0
= _mm256_add_pd(fjy0
,ty
);
1700 fjz0
= _mm256_add_pd(fjz0
,tz
);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 if (gmx_mm256_any_lt(rsq01
,rcutoff2
))
1711 r01
= _mm256_mul_pd(rsq01
,rinv01
);
1713 /* EWALD ELECTROSTATICS */
1715 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1716 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
1717 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1718 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1719 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1720 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1722 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1723 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
1725 cutoff_mask
= _mm256_cmp_pd(rsq01
,rcutoff2
,_CMP_LT_OQ
);
1729 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1731 /* Calculate temporary vectorial force */
1732 tx
= _mm256_mul_pd(fscal
,dx01
);
1733 ty
= _mm256_mul_pd(fscal
,dy01
);
1734 tz
= _mm256_mul_pd(fscal
,dz01
);
1736 /* Update vectorial force */
1737 fix0
= _mm256_add_pd(fix0
,tx
);
1738 fiy0
= _mm256_add_pd(fiy0
,ty
);
1739 fiz0
= _mm256_add_pd(fiz0
,tz
);
1741 fjx1
= _mm256_add_pd(fjx1
,tx
);
1742 fjy1
= _mm256_add_pd(fjy1
,ty
);
1743 fjz1
= _mm256_add_pd(fjz1
,tz
);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 if (gmx_mm256_any_lt(rsq02
,rcutoff2
))
1754 r02
= _mm256_mul_pd(rsq02
,rinv02
);
1756 /* EWALD ELECTROSTATICS */
1758 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1759 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
1760 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1761 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1762 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1763 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1765 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1766 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
1768 cutoff_mask
= _mm256_cmp_pd(rsq02
,rcutoff2
,_CMP_LT_OQ
);
1772 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1774 /* Calculate temporary vectorial force */
1775 tx
= _mm256_mul_pd(fscal
,dx02
);
1776 ty
= _mm256_mul_pd(fscal
,dy02
);
1777 tz
= _mm256_mul_pd(fscal
,dz02
);
1779 /* Update vectorial force */
1780 fix0
= _mm256_add_pd(fix0
,tx
);
1781 fiy0
= _mm256_add_pd(fiy0
,ty
);
1782 fiz0
= _mm256_add_pd(fiz0
,tz
);
1784 fjx2
= _mm256_add_pd(fjx2
,tx
);
1785 fjy2
= _mm256_add_pd(fjy2
,ty
);
1786 fjz2
= _mm256_add_pd(fjz2
,tz
);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
1797 r10
= _mm256_mul_pd(rsq10
,rinv10
);
1799 /* EWALD ELECTROSTATICS */
1801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
1803 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1804 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1805 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1806 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1808 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1809 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
1811 cutoff_mask
= _mm256_cmp_pd(rsq10
,rcutoff2
,_CMP_LT_OQ
);
1815 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1817 /* Calculate temporary vectorial force */
1818 tx
= _mm256_mul_pd(fscal
,dx10
);
1819 ty
= _mm256_mul_pd(fscal
,dy10
);
1820 tz
= _mm256_mul_pd(fscal
,dz10
);
1822 /* Update vectorial force */
1823 fix1
= _mm256_add_pd(fix1
,tx
);
1824 fiy1
= _mm256_add_pd(fiy1
,ty
);
1825 fiz1
= _mm256_add_pd(fiz1
,tz
);
1827 fjx0
= _mm256_add_pd(fjx0
,tx
);
1828 fjy0
= _mm256_add_pd(fjy0
,ty
);
1829 fjz0
= _mm256_add_pd(fjz0
,tz
);
1833 /**************************
1834 * CALCULATE INTERACTIONS *
1835 **************************/
1837 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
1840 r11
= _mm256_mul_pd(rsq11
,rinv11
);
1842 /* EWALD ELECTROSTATICS */
1844 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1845 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
1846 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1847 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1848 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1849 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1851 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1852 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
1854 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
1858 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1860 /* Calculate temporary vectorial force */
1861 tx
= _mm256_mul_pd(fscal
,dx11
);
1862 ty
= _mm256_mul_pd(fscal
,dy11
);
1863 tz
= _mm256_mul_pd(fscal
,dz11
);
1865 /* Update vectorial force */
1866 fix1
= _mm256_add_pd(fix1
,tx
);
1867 fiy1
= _mm256_add_pd(fiy1
,ty
);
1868 fiz1
= _mm256_add_pd(fiz1
,tz
);
1870 fjx1
= _mm256_add_pd(fjx1
,tx
);
1871 fjy1
= _mm256_add_pd(fjy1
,ty
);
1872 fjz1
= _mm256_add_pd(fjz1
,tz
);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
1883 r12
= _mm256_mul_pd(rsq12
,rinv12
);
1885 /* EWALD ELECTROSTATICS */
1887 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1888 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
1889 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1890 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1891 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1892 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1894 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1895 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
1897 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
1901 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1903 /* Calculate temporary vectorial force */
1904 tx
= _mm256_mul_pd(fscal
,dx12
);
1905 ty
= _mm256_mul_pd(fscal
,dy12
);
1906 tz
= _mm256_mul_pd(fscal
,dz12
);
1908 /* Update vectorial force */
1909 fix1
= _mm256_add_pd(fix1
,tx
);
1910 fiy1
= _mm256_add_pd(fiy1
,ty
);
1911 fiz1
= _mm256_add_pd(fiz1
,tz
);
1913 fjx2
= _mm256_add_pd(fjx2
,tx
);
1914 fjy2
= _mm256_add_pd(fjy2
,ty
);
1915 fjz2
= _mm256_add_pd(fjz2
,tz
);
1919 /**************************
1920 * CALCULATE INTERACTIONS *
1921 **************************/
1923 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
1926 r20
= _mm256_mul_pd(rsq20
,rinv20
);
1928 /* EWALD ELECTROSTATICS */
1930 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1931 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
1932 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1933 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1934 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1935 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1937 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1938 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
1940 cutoff_mask
= _mm256_cmp_pd(rsq20
,rcutoff2
,_CMP_LT_OQ
);
1944 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1946 /* Calculate temporary vectorial force */
1947 tx
= _mm256_mul_pd(fscal
,dx20
);
1948 ty
= _mm256_mul_pd(fscal
,dy20
);
1949 tz
= _mm256_mul_pd(fscal
,dz20
);
1951 /* Update vectorial force */
1952 fix2
= _mm256_add_pd(fix2
,tx
);
1953 fiy2
= _mm256_add_pd(fiy2
,ty
);
1954 fiz2
= _mm256_add_pd(fiz2
,tz
);
1956 fjx0
= _mm256_add_pd(fjx0
,tx
);
1957 fjy0
= _mm256_add_pd(fjy0
,ty
);
1958 fjz0
= _mm256_add_pd(fjz0
,tz
);
1962 /**************************
1963 * CALCULATE INTERACTIONS *
1964 **************************/
1966 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
1969 r21
= _mm256_mul_pd(rsq21
,rinv21
);
1971 /* EWALD ELECTROSTATICS */
1973 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1974 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
1975 ewitab
= _mm256_cvttpd_epi32(ewrt
);
1976 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
1977 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
1978 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
1980 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
1981 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
1983 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
1987 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
1989 /* Calculate temporary vectorial force */
1990 tx
= _mm256_mul_pd(fscal
,dx21
);
1991 ty
= _mm256_mul_pd(fscal
,dy21
);
1992 tz
= _mm256_mul_pd(fscal
,dz21
);
1994 /* Update vectorial force */
1995 fix2
= _mm256_add_pd(fix2
,tx
);
1996 fiy2
= _mm256_add_pd(fiy2
,ty
);
1997 fiz2
= _mm256_add_pd(fiz2
,tz
);
1999 fjx1
= _mm256_add_pd(fjx1
,tx
);
2000 fjy1
= _mm256_add_pd(fjy1
,ty
);
2001 fjz1
= _mm256_add_pd(fjz1
,tz
);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
2012 r22
= _mm256_mul_pd(rsq22
,rinv22
);
2014 /* EWALD ELECTROSTATICS */
2016 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2017 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
2018 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2019 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2020 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2021 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2023 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2024 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
2026 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
2030 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2032 /* Calculate temporary vectorial force */
2033 tx
= _mm256_mul_pd(fscal
,dx22
);
2034 ty
= _mm256_mul_pd(fscal
,dy22
);
2035 tz
= _mm256_mul_pd(fscal
,dz22
);
2037 /* Update vectorial force */
2038 fix2
= _mm256_add_pd(fix2
,tx
);
2039 fiy2
= _mm256_add_pd(fiy2
,ty
);
2040 fiz2
= _mm256_add_pd(fiz2
,tz
);
2042 fjx2
= _mm256_add_pd(fjx2
,tx
);
2043 fjy2
= _mm256_add_pd(fjy2
,ty
);
2044 fjz2
= _mm256_add_pd(fjz2
,tz
);
2048 fjptrA
= f
+j_coord_offsetA
;
2049 fjptrB
= f
+j_coord_offsetB
;
2050 fjptrC
= f
+j_coord_offsetC
;
2051 fjptrD
= f
+j_coord_offsetD
;
2053 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2054 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2056 /* Inner loop uses 351 flops */
2059 if(jidx
<j_index_end
)
2062 /* Get j neighbor index, and coordinate index */
2063 jnrlistA
= jjnr
[jidx
];
2064 jnrlistB
= jjnr
[jidx
+1];
2065 jnrlistC
= jjnr
[jidx
+2];
2066 jnrlistD
= jjnr
[jidx
+3];
2067 /* Sign of each element will be negative for non-real atoms.
2068 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2069 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
2071 tmpmask0
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
2073 tmpmask1
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(3,3,2,2));
2074 tmpmask0
= _mm_permute_ps(tmpmask0
,_GMX_MM_PERMUTE(1,1,0,0));
2075 dummy_mask
= _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1
,tmpmask0
));
2077 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2078 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2079 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2080 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2081 j_coord_offsetA
= DIM
*jnrA
;
2082 j_coord_offsetB
= DIM
*jnrB
;
2083 j_coord_offsetC
= DIM
*jnrC
;
2084 j_coord_offsetD
= DIM
*jnrD
;
2086 /* load j atom coordinates */
2087 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2088 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2089 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
2091 /* Calculate displacement vector */
2092 dx00
= _mm256_sub_pd(ix0
,jx0
);
2093 dy00
= _mm256_sub_pd(iy0
,jy0
);
2094 dz00
= _mm256_sub_pd(iz0
,jz0
);
2095 dx01
= _mm256_sub_pd(ix0
,jx1
);
2096 dy01
= _mm256_sub_pd(iy0
,jy1
);
2097 dz01
= _mm256_sub_pd(iz0
,jz1
);
2098 dx02
= _mm256_sub_pd(ix0
,jx2
);
2099 dy02
= _mm256_sub_pd(iy0
,jy2
);
2100 dz02
= _mm256_sub_pd(iz0
,jz2
);
2101 dx10
= _mm256_sub_pd(ix1
,jx0
);
2102 dy10
= _mm256_sub_pd(iy1
,jy0
);
2103 dz10
= _mm256_sub_pd(iz1
,jz0
);
2104 dx11
= _mm256_sub_pd(ix1
,jx1
);
2105 dy11
= _mm256_sub_pd(iy1
,jy1
);
2106 dz11
= _mm256_sub_pd(iz1
,jz1
);
2107 dx12
= _mm256_sub_pd(ix1
,jx2
);
2108 dy12
= _mm256_sub_pd(iy1
,jy2
);
2109 dz12
= _mm256_sub_pd(iz1
,jz2
);
2110 dx20
= _mm256_sub_pd(ix2
,jx0
);
2111 dy20
= _mm256_sub_pd(iy2
,jy0
);
2112 dz20
= _mm256_sub_pd(iz2
,jz0
);
2113 dx21
= _mm256_sub_pd(ix2
,jx1
);
2114 dy21
= _mm256_sub_pd(iy2
,jy1
);
2115 dz21
= _mm256_sub_pd(iz2
,jz1
);
2116 dx22
= _mm256_sub_pd(ix2
,jx2
);
2117 dy22
= _mm256_sub_pd(iy2
,jy2
);
2118 dz22
= _mm256_sub_pd(iz2
,jz2
);
2120 /* Calculate squared distance and things based on it */
2121 rsq00
= gmx_mm256_calc_rsq_pd(dx00
,dy00
,dz00
);
2122 rsq01
= gmx_mm256_calc_rsq_pd(dx01
,dy01
,dz01
);
2123 rsq02
= gmx_mm256_calc_rsq_pd(dx02
,dy02
,dz02
);
2124 rsq10
= gmx_mm256_calc_rsq_pd(dx10
,dy10
,dz10
);
2125 rsq11
= gmx_mm256_calc_rsq_pd(dx11
,dy11
,dz11
);
2126 rsq12
= gmx_mm256_calc_rsq_pd(dx12
,dy12
,dz12
);
2127 rsq20
= gmx_mm256_calc_rsq_pd(dx20
,dy20
,dz20
);
2128 rsq21
= gmx_mm256_calc_rsq_pd(dx21
,dy21
,dz21
);
2129 rsq22
= gmx_mm256_calc_rsq_pd(dx22
,dy22
,dz22
);
2131 rinv00
= gmx_mm256_invsqrt_pd(rsq00
);
2132 rinv01
= gmx_mm256_invsqrt_pd(rsq01
);
2133 rinv02
= gmx_mm256_invsqrt_pd(rsq02
);
2134 rinv10
= gmx_mm256_invsqrt_pd(rsq10
);
2135 rinv11
= gmx_mm256_invsqrt_pd(rsq11
);
2136 rinv12
= gmx_mm256_invsqrt_pd(rsq12
);
2137 rinv20
= gmx_mm256_invsqrt_pd(rsq20
);
2138 rinv21
= gmx_mm256_invsqrt_pd(rsq21
);
2139 rinv22
= gmx_mm256_invsqrt_pd(rsq22
);
2141 rinvsq00
= _mm256_mul_pd(rinv00
,rinv00
);
2142 rinvsq01
= _mm256_mul_pd(rinv01
,rinv01
);
2143 rinvsq02
= _mm256_mul_pd(rinv02
,rinv02
);
2144 rinvsq10
= _mm256_mul_pd(rinv10
,rinv10
);
2145 rinvsq11
= _mm256_mul_pd(rinv11
,rinv11
);
2146 rinvsq12
= _mm256_mul_pd(rinv12
,rinv12
);
2147 rinvsq20
= _mm256_mul_pd(rinv20
,rinv20
);
2148 rinvsq21
= _mm256_mul_pd(rinv21
,rinv21
);
2149 rinvsq22
= _mm256_mul_pd(rinv22
,rinv22
);
2151 fjx0
= _mm256_setzero_pd();
2152 fjy0
= _mm256_setzero_pd();
2153 fjz0
= _mm256_setzero_pd();
2154 fjx1
= _mm256_setzero_pd();
2155 fjy1
= _mm256_setzero_pd();
2156 fjz1
= _mm256_setzero_pd();
2157 fjx2
= _mm256_setzero_pd();
2158 fjy2
= _mm256_setzero_pd();
2159 fjz2
= _mm256_setzero_pd();
2161 /**************************
2162 * CALCULATE INTERACTIONS *
2163 **************************/
2165 if (gmx_mm256_any_lt(rsq00
,rcutoff2
))
2168 r00
= _mm256_mul_pd(rsq00
,rinv00
);
2169 r00
= _mm256_andnot_pd(dummy_mask
,r00
);
2171 /* EWALD ELECTROSTATICS */
2173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174 ewrt
= _mm256_mul_pd(r00
,ewtabscale
);
2175 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2176 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2177 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2178 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2180 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2181 felec
= _mm256_mul_pd(_mm256_mul_pd(qq00
,rinv00
),_mm256_sub_pd(rinvsq00
,felec
));
2183 cutoff_mask
= _mm256_cmp_pd(rsq00
,rcutoff2
,_CMP_LT_OQ
);
2187 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2189 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2191 /* Calculate temporary vectorial force */
2192 tx
= _mm256_mul_pd(fscal
,dx00
);
2193 ty
= _mm256_mul_pd(fscal
,dy00
);
2194 tz
= _mm256_mul_pd(fscal
,dz00
);
2196 /* Update vectorial force */
2197 fix0
= _mm256_add_pd(fix0
,tx
);
2198 fiy0
= _mm256_add_pd(fiy0
,ty
);
2199 fiz0
= _mm256_add_pd(fiz0
,tz
);
2201 fjx0
= _mm256_add_pd(fjx0
,tx
);
2202 fjy0
= _mm256_add_pd(fjy0
,ty
);
2203 fjz0
= _mm256_add_pd(fjz0
,tz
);
2207 /**************************
2208 * CALCULATE INTERACTIONS *
2209 **************************/
2211 if (gmx_mm256_any_lt(rsq01
,rcutoff2
))
2214 r01
= _mm256_mul_pd(rsq01
,rinv01
);
2215 r01
= _mm256_andnot_pd(dummy_mask
,r01
);
2217 /* EWALD ELECTROSTATICS */
2219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2220 ewrt
= _mm256_mul_pd(r01
,ewtabscale
);
2221 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2222 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2223 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2224 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2226 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2227 felec
= _mm256_mul_pd(_mm256_mul_pd(qq01
,rinv01
),_mm256_sub_pd(rinvsq01
,felec
));
2229 cutoff_mask
= _mm256_cmp_pd(rsq01
,rcutoff2
,_CMP_LT_OQ
);
2233 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2235 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2237 /* Calculate temporary vectorial force */
2238 tx
= _mm256_mul_pd(fscal
,dx01
);
2239 ty
= _mm256_mul_pd(fscal
,dy01
);
2240 tz
= _mm256_mul_pd(fscal
,dz01
);
2242 /* Update vectorial force */
2243 fix0
= _mm256_add_pd(fix0
,tx
);
2244 fiy0
= _mm256_add_pd(fiy0
,ty
);
2245 fiz0
= _mm256_add_pd(fiz0
,tz
);
2247 fjx1
= _mm256_add_pd(fjx1
,tx
);
2248 fjy1
= _mm256_add_pd(fjy1
,ty
);
2249 fjz1
= _mm256_add_pd(fjz1
,tz
);
2253 /**************************
2254 * CALCULATE INTERACTIONS *
2255 **************************/
2257 if (gmx_mm256_any_lt(rsq02
,rcutoff2
))
2260 r02
= _mm256_mul_pd(rsq02
,rinv02
);
2261 r02
= _mm256_andnot_pd(dummy_mask
,r02
);
2263 /* EWALD ELECTROSTATICS */
2265 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2266 ewrt
= _mm256_mul_pd(r02
,ewtabscale
);
2267 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2268 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2269 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2270 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2272 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2273 felec
= _mm256_mul_pd(_mm256_mul_pd(qq02
,rinv02
),_mm256_sub_pd(rinvsq02
,felec
));
2275 cutoff_mask
= _mm256_cmp_pd(rsq02
,rcutoff2
,_CMP_LT_OQ
);
2279 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2281 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2283 /* Calculate temporary vectorial force */
2284 tx
= _mm256_mul_pd(fscal
,dx02
);
2285 ty
= _mm256_mul_pd(fscal
,dy02
);
2286 tz
= _mm256_mul_pd(fscal
,dz02
);
2288 /* Update vectorial force */
2289 fix0
= _mm256_add_pd(fix0
,tx
);
2290 fiy0
= _mm256_add_pd(fiy0
,ty
);
2291 fiz0
= _mm256_add_pd(fiz0
,tz
);
2293 fjx2
= _mm256_add_pd(fjx2
,tx
);
2294 fjy2
= _mm256_add_pd(fjy2
,ty
);
2295 fjz2
= _mm256_add_pd(fjz2
,tz
);
2299 /**************************
2300 * CALCULATE INTERACTIONS *
2301 **************************/
2303 if (gmx_mm256_any_lt(rsq10
,rcutoff2
))
2306 r10
= _mm256_mul_pd(rsq10
,rinv10
);
2307 r10
= _mm256_andnot_pd(dummy_mask
,r10
);
2309 /* EWALD ELECTROSTATICS */
2311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2312 ewrt
= _mm256_mul_pd(r10
,ewtabscale
);
2313 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2314 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2315 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2316 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2318 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2319 felec
= _mm256_mul_pd(_mm256_mul_pd(qq10
,rinv10
),_mm256_sub_pd(rinvsq10
,felec
));
2321 cutoff_mask
= _mm256_cmp_pd(rsq10
,rcutoff2
,_CMP_LT_OQ
);
2325 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2327 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2329 /* Calculate temporary vectorial force */
2330 tx
= _mm256_mul_pd(fscal
,dx10
);
2331 ty
= _mm256_mul_pd(fscal
,dy10
);
2332 tz
= _mm256_mul_pd(fscal
,dz10
);
2334 /* Update vectorial force */
2335 fix1
= _mm256_add_pd(fix1
,tx
);
2336 fiy1
= _mm256_add_pd(fiy1
,ty
);
2337 fiz1
= _mm256_add_pd(fiz1
,tz
);
2339 fjx0
= _mm256_add_pd(fjx0
,tx
);
2340 fjy0
= _mm256_add_pd(fjy0
,ty
);
2341 fjz0
= _mm256_add_pd(fjz0
,tz
);
2345 /**************************
2346 * CALCULATE INTERACTIONS *
2347 **************************/
2349 if (gmx_mm256_any_lt(rsq11
,rcutoff2
))
2352 r11
= _mm256_mul_pd(rsq11
,rinv11
);
2353 r11
= _mm256_andnot_pd(dummy_mask
,r11
);
2355 /* EWALD ELECTROSTATICS */
2357 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2358 ewrt
= _mm256_mul_pd(r11
,ewtabscale
);
2359 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2360 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2361 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2362 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2364 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2365 felec
= _mm256_mul_pd(_mm256_mul_pd(qq11
,rinv11
),_mm256_sub_pd(rinvsq11
,felec
));
2367 cutoff_mask
= _mm256_cmp_pd(rsq11
,rcutoff2
,_CMP_LT_OQ
);
2371 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2373 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2375 /* Calculate temporary vectorial force */
2376 tx
= _mm256_mul_pd(fscal
,dx11
);
2377 ty
= _mm256_mul_pd(fscal
,dy11
);
2378 tz
= _mm256_mul_pd(fscal
,dz11
);
2380 /* Update vectorial force */
2381 fix1
= _mm256_add_pd(fix1
,tx
);
2382 fiy1
= _mm256_add_pd(fiy1
,ty
);
2383 fiz1
= _mm256_add_pd(fiz1
,tz
);
2385 fjx1
= _mm256_add_pd(fjx1
,tx
);
2386 fjy1
= _mm256_add_pd(fjy1
,ty
);
2387 fjz1
= _mm256_add_pd(fjz1
,tz
);
2391 /**************************
2392 * CALCULATE INTERACTIONS *
2393 **************************/
2395 if (gmx_mm256_any_lt(rsq12
,rcutoff2
))
2398 r12
= _mm256_mul_pd(rsq12
,rinv12
);
2399 r12
= _mm256_andnot_pd(dummy_mask
,r12
);
2401 /* EWALD ELECTROSTATICS */
2403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2404 ewrt
= _mm256_mul_pd(r12
,ewtabscale
);
2405 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2406 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2407 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2408 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2410 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2411 felec
= _mm256_mul_pd(_mm256_mul_pd(qq12
,rinv12
),_mm256_sub_pd(rinvsq12
,felec
));
2413 cutoff_mask
= _mm256_cmp_pd(rsq12
,rcutoff2
,_CMP_LT_OQ
);
2417 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2419 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2421 /* Calculate temporary vectorial force */
2422 tx
= _mm256_mul_pd(fscal
,dx12
);
2423 ty
= _mm256_mul_pd(fscal
,dy12
);
2424 tz
= _mm256_mul_pd(fscal
,dz12
);
2426 /* Update vectorial force */
2427 fix1
= _mm256_add_pd(fix1
,tx
);
2428 fiy1
= _mm256_add_pd(fiy1
,ty
);
2429 fiz1
= _mm256_add_pd(fiz1
,tz
);
2431 fjx2
= _mm256_add_pd(fjx2
,tx
);
2432 fjy2
= _mm256_add_pd(fjy2
,ty
);
2433 fjz2
= _mm256_add_pd(fjz2
,tz
);
2437 /**************************
2438 * CALCULATE INTERACTIONS *
2439 **************************/
2441 if (gmx_mm256_any_lt(rsq20
,rcutoff2
))
2444 r20
= _mm256_mul_pd(rsq20
,rinv20
);
2445 r20
= _mm256_andnot_pd(dummy_mask
,r20
);
2447 /* EWALD ELECTROSTATICS */
2449 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2450 ewrt
= _mm256_mul_pd(r20
,ewtabscale
);
2451 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2452 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2453 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2454 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2456 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2457 felec
= _mm256_mul_pd(_mm256_mul_pd(qq20
,rinv20
),_mm256_sub_pd(rinvsq20
,felec
));
2459 cutoff_mask
= _mm256_cmp_pd(rsq20
,rcutoff2
,_CMP_LT_OQ
);
2463 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2465 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2467 /* Calculate temporary vectorial force */
2468 tx
= _mm256_mul_pd(fscal
,dx20
);
2469 ty
= _mm256_mul_pd(fscal
,dy20
);
2470 tz
= _mm256_mul_pd(fscal
,dz20
);
2472 /* Update vectorial force */
2473 fix2
= _mm256_add_pd(fix2
,tx
);
2474 fiy2
= _mm256_add_pd(fiy2
,ty
);
2475 fiz2
= _mm256_add_pd(fiz2
,tz
);
2477 fjx0
= _mm256_add_pd(fjx0
,tx
);
2478 fjy0
= _mm256_add_pd(fjy0
,ty
);
2479 fjz0
= _mm256_add_pd(fjz0
,tz
);
2483 /**************************
2484 * CALCULATE INTERACTIONS *
2485 **************************/
2487 if (gmx_mm256_any_lt(rsq21
,rcutoff2
))
2490 r21
= _mm256_mul_pd(rsq21
,rinv21
);
2491 r21
= _mm256_andnot_pd(dummy_mask
,r21
);
2493 /* EWALD ELECTROSTATICS */
2495 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2496 ewrt
= _mm256_mul_pd(r21
,ewtabscale
);
2497 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2498 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2499 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2500 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2502 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2503 felec
= _mm256_mul_pd(_mm256_mul_pd(qq21
,rinv21
),_mm256_sub_pd(rinvsq21
,felec
));
2505 cutoff_mask
= _mm256_cmp_pd(rsq21
,rcutoff2
,_CMP_LT_OQ
);
2509 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2511 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2513 /* Calculate temporary vectorial force */
2514 tx
= _mm256_mul_pd(fscal
,dx21
);
2515 ty
= _mm256_mul_pd(fscal
,dy21
);
2516 tz
= _mm256_mul_pd(fscal
,dz21
);
2518 /* Update vectorial force */
2519 fix2
= _mm256_add_pd(fix2
,tx
);
2520 fiy2
= _mm256_add_pd(fiy2
,ty
);
2521 fiz2
= _mm256_add_pd(fiz2
,tz
);
2523 fjx1
= _mm256_add_pd(fjx1
,tx
);
2524 fjy1
= _mm256_add_pd(fjy1
,ty
);
2525 fjz1
= _mm256_add_pd(fjz1
,tz
);
2529 /**************************
2530 * CALCULATE INTERACTIONS *
2531 **************************/
2533 if (gmx_mm256_any_lt(rsq22
,rcutoff2
))
2536 r22
= _mm256_mul_pd(rsq22
,rinv22
);
2537 r22
= _mm256_andnot_pd(dummy_mask
,r22
);
2539 /* EWALD ELECTROSTATICS */
2541 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2542 ewrt
= _mm256_mul_pd(r22
,ewtabscale
);
2543 ewitab
= _mm256_cvttpd_epi32(ewrt
);
2544 eweps
= _mm256_sub_pd(ewrt
,_mm256_round_pd(ewrt
, _MM_FROUND_FLOOR
));
2545 gmx_mm256_load_4pair_swizzle_pd(ewtab
+ _mm_extract_epi32(ewitab
,0),ewtab
+ _mm_extract_epi32(ewitab
,1),
2546 ewtab
+ _mm_extract_epi32(ewitab
,2),ewtab
+ _mm_extract_epi32(ewitab
,3),
2548 felec
= _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one
,eweps
),ewtabF
),_mm256_mul_pd(eweps
,ewtabFn
));
2549 felec
= _mm256_mul_pd(_mm256_mul_pd(qq22
,rinv22
),_mm256_sub_pd(rinvsq22
,felec
));
2551 cutoff_mask
= _mm256_cmp_pd(rsq22
,rcutoff2
,_CMP_LT_OQ
);
2555 fscal
= _mm256_and_pd(fscal
,cutoff_mask
);
2557 fscal
= _mm256_andnot_pd(dummy_mask
,fscal
);
2559 /* Calculate temporary vectorial force */
2560 tx
= _mm256_mul_pd(fscal
,dx22
);
2561 ty
= _mm256_mul_pd(fscal
,dy22
);
2562 tz
= _mm256_mul_pd(fscal
,dz22
);
2564 /* Update vectorial force */
2565 fix2
= _mm256_add_pd(fix2
,tx
);
2566 fiy2
= _mm256_add_pd(fiy2
,ty
);
2567 fiz2
= _mm256_add_pd(fiz2
,tz
);
2569 fjx2
= _mm256_add_pd(fjx2
,tx
);
2570 fjy2
= _mm256_add_pd(fjy2
,ty
);
2571 fjz2
= _mm256_add_pd(fjz2
,tz
);
2575 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2576 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2577 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2578 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2580 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2581 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2583 /* Inner loop uses 360 flops */
2586 /* End of innermost loop */
2588 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2589 f
+i_coord_offset
,fshift
+i_shift_offset
);
2591 /* Increment number of inner iterations */
2592 inneriter
+= j_index_end
- j_index_start
;
2594 /* Outer loop uses 18 flops */
2597 /* Increment number of outer iterations */
2600 /* Update outer/inner flops */
2602 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_W3W3_F
,outeriter
*18 + inneriter
*360);