2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_VF_avx_256_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrE
,jnrF
,jnrG
,jnrH
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
78 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
79 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
80 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
82 real
*shiftvec
,*fshift
,*x
,*f
;
83 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
85 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
86 real
* vdwioffsetptr0
;
87 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
88 real
* vdwioffsetptr1
;
89 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
90 real
* vdwioffsetptr2
;
91 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
92 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
93 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
94 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
95 __m256 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
96 __m256 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
97 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
100 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
103 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
104 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
106 __m128i vfitab_lo
,vfitab_hi
;
107 __m128i ifour
= _mm_set1_epi32(4);
108 __m256 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
110 __m256 dummy_mask
,cutoff_mask
;
111 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
112 __m256 one
= _mm256_set1_ps(1.0);
113 __m256 two
= _mm256_set1_ps(2.0);
119 jindex
= nlist
->jindex
;
121 shiftidx
= nlist
->shift
;
123 shiftvec
= fr
->shift_vec
[0];
124 fshift
= fr
->fshift
[0];
125 facel
= _mm256_set1_ps(fr
->epsfac
);
126 charge
= mdatoms
->chargeA
;
127 nvdwtype
= fr
->ntype
;
129 vdwtype
= mdatoms
->typeA
;
131 vftab
= kernel_data
->table_elec_vdw
->data
;
132 vftabscale
= _mm256_set1_ps(kernel_data
->table_elec_vdw
->scale
);
134 /* Setup water-specific parameters */
135 inr
= nlist
->iinr
[0];
136 iq0
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+0]));
137 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
138 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
139 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
141 /* Avoid stupid compiler warnings */
142 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
155 for(iidx
=0;iidx
<4*DIM
;iidx
++)
160 /* Start outer loop over neighborlists */
161 for(iidx
=0; iidx
<nri
; iidx
++)
163 /* Load shift vector for this list */
164 i_shift_offset
= DIM
*shiftidx
[iidx
];
166 /* Load limits for loop over neighbors */
167 j_index_start
= jindex
[iidx
];
168 j_index_end
= jindex
[iidx
+1];
170 /* Get outer coordinate index */
172 i_coord_offset
= DIM
*inr
;
174 /* Load i particle coords and add shift vector */
175 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
176 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
178 fix0
= _mm256_setzero_ps();
179 fiy0
= _mm256_setzero_ps();
180 fiz0
= _mm256_setzero_ps();
181 fix1
= _mm256_setzero_ps();
182 fiy1
= _mm256_setzero_ps();
183 fiz1
= _mm256_setzero_ps();
184 fix2
= _mm256_setzero_ps();
185 fiy2
= _mm256_setzero_ps();
186 fiz2
= _mm256_setzero_ps();
188 /* Reset potential sums */
189 velecsum
= _mm256_setzero_ps();
190 vvdwsum
= _mm256_setzero_ps();
192 /* Start inner kernel loop */
193 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
196 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA
= DIM
*jnrA
;
206 j_coord_offsetB
= DIM
*jnrB
;
207 j_coord_offsetC
= DIM
*jnrC
;
208 j_coord_offsetD
= DIM
*jnrD
;
209 j_coord_offsetE
= DIM
*jnrE
;
210 j_coord_offsetF
= DIM
*jnrF
;
211 j_coord_offsetG
= DIM
*jnrG
;
212 j_coord_offsetH
= DIM
*jnrH
;
214 /* load j atom coordinates */
215 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
216 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
217 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
218 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
221 /* Calculate displacement vector */
222 dx00
= _mm256_sub_ps(ix0
,jx0
);
223 dy00
= _mm256_sub_ps(iy0
,jy0
);
224 dz00
= _mm256_sub_ps(iz0
,jz0
);
225 dx10
= _mm256_sub_ps(ix1
,jx0
);
226 dy10
= _mm256_sub_ps(iy1
,jy0
);
227 dz10
= _mm256_sub_ps(iz1
,jz0
);
228 dx20
= _mm256_sub_ps(ix2
,jx0
);
229 dy20
= _mm256_sub_ps(iy2
,jy0
);
230 dz20
= _mm256_sub_ps(iz2
,jz0
);
232 /* Calculate squared distance and things based on it */
233 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
234 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
235 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
237 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
238 rinv10
= gmx_mm256_invsqrt_ps(rsq10
);
239 rinv20
= gmx_mm256_invsqrt_ps(rsq20
);
241 /* Load parameters for j particles */
242 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
243 charge
+jnrC
+0,charge
+jnrD
+0,
244 charge
+jnrE
+0,charge
+jnrF
+0,
245 charge
+jnrG
+0,charge
+jnrH
+0);
246 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
247 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
248 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
249 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
250 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
251 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
252 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
253 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
255 fjx0
= _mm256_setzero_ps();
256 fjy0
= _mm256_setzero_ps();
257 fjz0
= _mm256_setzero_ps();
259 /**************************
260 * CALCULATE INTERACTIONS *
261 **************************/
263 r00
= _mm256_mul_ps(rsq00
,rinv00
);
265 /* Compute parameters for interactions between i and j atoms */
266 qq00
= _mm256_mul_ps(iq0
,jq0
);
267 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
268 vdwioffsetptr0
+vdwjidx0B
,
269 vdwioffsetptr0
+vdwjidx0C
,
270 vdwioffsetptr0
+vdwjidx0D
,
271 vdwioffsetptr0
+vdwjidx0E
,
272 vdwioffsetptr0
+vdwjidx0F
,
273 vdwioffsetptr0
+vdwjidx0G
,
274 vdwioffsetptr0
+vdwjidx0H
,
277 /* Calculate table index by multiplying r with table scale and truncate to integer */
278 rt
= _mm256_mul_ps(r00
,vftabscale
);
279 vfitab
= _mm256_cvttps_epi32(rt
);
280 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
281 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
282 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
283 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
284 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
285 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
287 /* CUBIC SPLINE TABLE ELECTROSTATICS */
288 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
289 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
290 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
291 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
292 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
293 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
294 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
295 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
296 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
297 Heps
= _mm256_mul_ps(vfeps
,H
);
298 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
299 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
300 velec
= _mm256_mul_ps(qq00
,VV
);
301 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
302 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq00
,FF
),_mm256_mul_ps(vftabscale
,rinv00
)));
304 /* CUBIC SPLINE TABLE DISPERSION */
305 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
306 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
307 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
308 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
309 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
310 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
311 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
312 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
313 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
314 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
315 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
316 Heps
= _mm256_mul_ps(vfeps
,H
);
317 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
318 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
319 vvdw6
= _mm256_mul_ps(c6_00
,VV
);
320 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
321 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
323 /* CUBIC SPLINE TABLE REPULSION */
324 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
325 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
326 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
327 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
328 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
329 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
330 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
331 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
332 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
333 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
334 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
335 Heps
= _mm256_mul_ps(vfeps
,H
);
336 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
337 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
338 vvdw12
= _mm256_mul_ps(c12_00
,VV
);
339 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
340 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
341 vvdw
= _mm256_add_ps(vvdw12
,vvdw6
);
342 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velecsum
= _mm256_add_ps(velecsum
,velec
);
346 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
348 fscal
= _mm256_add_ps(felec
,fvdw
);
350 /* Calculate temporary vectorial force */
351 tx
= _mm256_mul_ps(fscal
,dx00
);
352 ty
= _mm256_mul_ps(fscal
,dy00
);
353 tz
= _mm256_mul_ps(fscal
,dz00
);
355 /* Update vectorial force */
356 fix0
= _mm256_add_ps(fix0
,tx
);
357 fiy0
= _mm256_add_ps(fiy0
,ty
);
358 fiz0
= _mm256_add_ps(fiz0
,tz
);
360 fjx0
= _mm256_add_ps(fjx0
,tx
);
361 fjy0
= _mm256_add_ps(fjy0
,ty
);
362 fjz0
= _mm256_add_ps(fjz0
,tz
);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 r10
= _mm256_mul_ps(rsq10
,rinv10
);
370 /* Compute parameters for interactions between i and j atoms */
371 qq10
= _mm256_mul_ps(iq1
,jq0
);
373 /* Calculate table index by multiplying r with table scale and truncate to integer */
374 rt
= _mm256_mul_ps(r10
,vftabscale
);
375 vfitab
= _mm256_cvttps_epi32(rt
);
376 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
377 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
378 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
379 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
380 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
381 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
383 /* CUBIC SPLINE TABLE ELECTROSTATICS */
384 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
385 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
386 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
387 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
388 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
389 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
390 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
391 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
392 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
393 Heps
= _mm256_mul_ps(vfeps
,H
);
394 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
395 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
396 velec
= _mm256_mul_ps(qq10
,VV
);
397 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
398 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq10
,FF
),_mm256_mul_ps(vftabscale
,rinv10
)));
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum
= _mm256_add_ps(velecsum
,velec
);
405 /* Calculate temporary vectorial force */
406 tx
= _mm256_mul_ps(fscal
,dx10
);
407 ty
= _mm256_mul_ps(fscal
,dy10
);
408 tz
= _mm256_mul_ps(fscal
,dz10
);
410 /* Update vectorial force */
411 fix1
= _mm256_add_ps(fix1
,tx
);
412 fiy1
= _mm256_add_ps(fiy1
,ty
);
413 fiz1
= _mm256_add_ps(fiz1
,tz
);
415 fjx0
= _mm256_add_ps(fjx0
,tx
);
416 fjy0
= _mm256_add_ps(fjy0
,ty
);
417 fjz0
= _mm256_add_ps(fjz0
,tz
);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 r20
= _mm256_mul_ps(rsq20
,rinv20
);
425 /* Compute parameters for interactions between i and j atoms */
426 qq20
= _mm256_mul_ps(iq2
,jq0
);
428 /* Calculate table index by multiplying r with table scale and truncate to integer */
429 rt
= _mm256_mul_ps(r20
,vftabscale
);
430 vfitab
= _mm256_cvttps_epi32(rt
);
431 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
432 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
433 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
434 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
435 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
436 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
438 /* CUBIC SPLINE TABLE ELECTROSTATICS */
439 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
440 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
441 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
442 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
443 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
444 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
445 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
446 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
447 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
448 Heps
= _mm256_mul_ps(vfeps
,H
);
449 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
450 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
451 velec
= _mm256_mul_ps(qq20
,VV
);
452 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
453 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq20
,FF
),_mm256_mul_ps(vftabscale
,rinv20
)));
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velecsum
= _mm256_add_ps(velecsum
,velec
);
460 /* Calculate temporary vectorial force */
461 tx
= _mm256_mul_ps(fscal
,dx20
);
462 ty
= _mm256_mul_ps(fscal
,dy20
);
463 tz
= _mm256_mul_ps(fscal
,dz20
);
465 /* Update vectorial force */
466 fix2
= _mm256_add_ps(fix2
,tx
);
467 fiy2
= _mm256_add_ps(fiy2
,ty
);
468 fiz2
= _mm256_add_ps(fiz2
,tz
);
470 fjx0
= _mm256_add_ps(fjx0
,tx
);
471 fjy0
= _mm256_add_ps(fjy0
,ty
);
472 fjz0
= _mm256_add_ps(fjz0
,tz
);
474 fjptrA
= f
+j_coord_offsetA
;
475 fjptrB
= f
+j_coord_offsetB
;
476 fjptrC
= f
+j_coord_offsetC
;
477 fjptrD
= f
+j_coord_offsetD
;
478 fjptrE
= f
+j_coord_offsetE
;
479 fjptrF
= f
+j_coord_offsetF
;
480 fjptrG
= f
+j_coord_offsetG
;
481 fjptrH
= f
+j_coord_offsetH
;
483 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
485 /* Inner loop uses 162 flops */
491 /* Get j neighbor index, and coordinate index */
492 jnrlistA
= jjnr
[jidx
];
493 jnrlistB
= jjnr
[jidx
+1];
494 jnrlistC
= jjnr
[jidx
+2];
495 jnrlistD
= jjnr
[jidx
+3];
496 jnrlistE
= jjnr
[jidx
+4];
497 jnrlistF
= jjnr
[jidx
+5];
498 jnrlistG
= jjnr
[jidx
+6];
499 jnrlistH
= jjnr
[jidx
+7];
500 /* Sign of each element will be negative for non-real atoms.
501 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
502 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
504 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
505 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
507 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
508 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
509 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
510 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
511 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
512 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
513 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
514 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
515 j_coord_offsetA
= DIM
*jnrA
;
516 j_coord_offsetB
= DIM
*jnrB
;
517 j_coord_offsetC
= DIM
*jnrC
;
518 j_coord_offsetD
= DIM
*jnrD
;
519 j_coord_offsetE
= DIM
*jnrE
;
520 j_coord_offsetF
= DIM
*jnrF
;
521 j_coord_offsetG
= DIM
*jnrG
;
522 j_coord_offsetH
= DIM
*jnrH
;
524 /* load j atom coordinates */
525 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
526 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
527 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
528 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
531 /* Calculate displacement vector */
532 dx00
= _mm256_sub_ps(ix0
,jx0
);
533 dy00
= _mm256_sub_ps(iy0
,jy0
);
534 dz00
= _mm256_sub_ps(iz0
,jz0
);
535 dx10
= _mm256_sub_ps(ix1
,jx0
);
536 dy10
= _mm256_sub_ps(iy1
,jy0
);
537 dz10
= _mm256_sub_ps(iz1
,jz0
);
538 dx20
= _mm256_sub_ps(ix2
,jx0
);
539 dy20
= _mm256_sub_ps(iy2
,jy0
);
540 dz20
= _mm256_sub_ps(iz2
,jz0
);
542 /* Calculate squared distance and things based on it */
543 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
544 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
545 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
547 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
548 rinv10
= gmx_mm256_invsqrt_ps(rsq10
);
549 rinv20
= gmx_mm256_invsqrt_ps(rsq20
);
551 /* Load parameters for j particles */
552 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
553 charge
+jnrC
+0,charge
+jnrD
+0,
554 charge
+jnrE
+0,charge
+jnrF
+0,
555 charge
+jnrG
+0,charge
+jnrH
+0);
556 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
557 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
558 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
559 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
560 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
561 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
562 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
563 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
565 fjx0
= _mm256_setzero_ps();
566 fjy0
= _mm256_setzero_ps();
567 fjz0
= _mm256_setzero_ps();
569 /**************************
570 * CALCULATE INTERACTIONS *
571 **************************/
573 r00
= _mm256_mul_ps(rsq00
,rinv00
);
574 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
576 /* Compute parameters for interactions between i and j atoms */
577 qq00
= _mm256_mul_ps(iq0
,jq0
);
578 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
579 vdwioffsetptr0
+vdwjidx0B
,
580 vdwioffsetptr0
+vdwjidx0C
,
581 vdwioffsetptr0
+vdwjidx0D
,
582 vdwioffsetptr0
+vdwjidx0E
,
583 vdwioffsetptr0
+vdwjidx0F
,
584 vdwioffsetptr0
+vdwjidx0G
,
585 vdwioffsetptr0
+vdwjidx0H
,
588 /* Calculate table index by multiplying r with table scale and truncate to integer */
589 rt
= _mm256_mul_ps(r00
,vftabscale
);
590 vfitab
= _mm256_cvttps_epi32(rt
);
591 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
592 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
593 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
594 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
595 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
596 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
598 /* CUBIC SPLINE TABLE ELECTROSTATICS */
599 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
600 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
601 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
602 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
603 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
604 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
605 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
606 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
607 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
608 Heps
= _mm256_mul_ps(vfeps
,H
);
609 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
610 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
611 velec
= _mm256_mul_ps(qq00
,VV
);
612 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
613 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq00
,FF
),_mm256_mul_ps(vftabscale
,rinv00
)));
615 /* CUBIC SPLINE TABLE DISPERSION */
616 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
617 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
618 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
619 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
620 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
621 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
622 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
623 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
624 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
625 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
626 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
627 Heps
= _mm256_mul_ps(vfeps
,H
);
628 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
629 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
630 vvdw6
= _mm256_mul_ps(c6_00
,VV
);
631 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
632 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
634 /* CUBIC SPLINE TABLE REPULSION */
635 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
636 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
637 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
638 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
639 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
640 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
641 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
642 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
643 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
644 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
645 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
646 Heps
= _mm256_mul_ps(vfeps
,H
);
647 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
648 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
649 vvdw12
= _mm256_mul_ps(c12_00
,VV
);
650 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
651 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
652 vvdw
= _mm256_add_ps(vvdw12
,vvdw6
);
653 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
655 /* Update potential sum for this i atom from the interaction with this j atom. */
656 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
657 velecsum
= _mm256_add_ps(velecsum
,velec
);
658 vvdw
= _mm256_andnot_ps(dummy_mask
,vvdw
);
659 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
661 fscal
= _mm256_add_ps(felec
,fvdw
);
663 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
665 /* Calculate temporary vectorial force */
666 tx
= _mm256_mul_ps(fscal
,dx00
);
667 ty
= _mm256_mul_ps(fscal
,dy00
);
668 tz
= _mm256_mul_ps(fscal
,dz00
);
670 /* Update vectorial force */
671 fix0
= _mm256_add_ps(fix0
,tx
);
672 fiy0
= _mm256_add_ps(fiy0
,ty
);
673 fiz0
= _mm256_add_ps(fiz0
,tz
);
675 fjx0
= _mm256_add_ps(fjx0
,tx
);
676 fjy0
= _mm256_add_ps(fjy0
,ty
);
677 fjz0
= _mm256_add_ps(fjz0
,tz
);
679 /**************************
680 * CALCULATE INTERACTIONS *
681 **************************/
683 r10
= _mm256_mul_ps(rsq10
,rinv10
);
684 r10
= _mm256_andnot_ps(dummy_mask
,r10
);
686 /* Compute parameters for interactions between i and j atoms */
687 qq10
= _mm256_mul_ps(iq1
,jq0
);
689 /* Calculate table index by multiplying r with table scale and truncate to integer */
690 rt
= _mm256_mul_ps(r10
,vftabscale
);
691 vfitab
= _mm256_cvttps_epi32(rt
);
692 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
693 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
694 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
695 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
696 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
697 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
699 /* CUBIC SPLINE TABLE ELECTROSTATICS */
700 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
701 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
702 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
703 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
704 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
705 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
706 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
707 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
708 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
709 Heps
= _mm256_mul_ps(vfeps
,H
);
710 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
711 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
712 velec
= _mm256_mul_ps(qq10
,VV
);
713 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
714 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq10
,FF
),_mm256_mul_ps(vftabscale
,rinv10
)));
716 /* Update potential sum for this i atom from the interaction with this j atom. */
717 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
718 velecsum
= _mm256_add_ps(velecsum
,velec
);
722 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
724 /* Calculate temporary vectorial force */
725 tx
= _mm256_mul_ps(fscal
,dx10
);
726 ty
= _mm256_mul_ps(fscal
,dy10
);
727 tz
= _mm256_mul_ps(fscal
,dz10
);
729 /* Update vectorial force */
730 fix1
= _mm256_add_ps(fix1
,tx
);
731 fiy1
= _mm256_add_ps(fiy1
,ty
);
732 fiz1
= _mm256_add_ps(fiz1
,tz
);
734 fjx0
= _mm256_add_ps(fjx0
,tx
);
735 fjy0
= _mm256_add_ps(fjy0
,ty
);
736 fjz0
= _mm256_add_ps(fjz0
,tz
);
738 /**************************
739 * CALCULATE INTERACTIONS *
740 **************************/
742 r20
= _mm256_mul_ps(rsq20
,rinv20
);
743 r20
= _mm256_andnot_ps(dummy_mask
,r20
);
745 /* Compute parameters for interactions between i and j atoms */
746 qq20
= _mm256_mul_ps(iq2
,jq0
);
748 /* Calculate table index by multiplying r with table scale and truncate to integer */
749 rt
= _mm256_mul_ps(r20
,vftabscale
);
750 vfitab
= _mm256_cvttps_epi32(rt
);
751 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
752 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
753 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
754 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
755 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
756 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
758 /* CUBIC SPLINE TABLE ELECTROSTATICS */
759 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
760 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
761 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
762 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
763 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
764 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
765 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
766 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
767 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
768 Heps
= _mm256_mul_ps(vfeps
,H
);
769 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
770 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
771 velec
= _mm256_mul_ps(qq20
,VV
);
772 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
773 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq20
,FF
),_mm256_mul_ps(vftabscale
,rinv20
)));
775 /* Update potential sum for this i atom from the interaction with this j atom. */
776 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
777 velecsum
= _mm256_add_ps(velecsum
,velec
);
781 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
783 /* Calculate temporary vectorial force */
784 tx
= _mm256_mul_ps(fscal
,dx20
);
785 ty
= _mm256_mul_ps(fscal
,dy20
);
786 tz
= _mm256_mul_ps(fscal
,dz20
);
788 /* Update vectorial force */
789 fix2
= _mm256_add_ps(fix2
,tx
);
790 fiy2
= _mm256_add_ps(fiy2
,ty
);
791 fiz2
= _mm256_add_ps(fiz2
,tz
);
793 fjx0
= _mm256_add_ps(fjx0
,tx
);
794 fjy0
= _mm256_add_ps(fjy0
,ty
);
795 fjz0
= _mm256_add_ps(fjz0
,tz
);
797 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
798 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
799 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
800 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
801 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
802 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
803 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
804 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
806 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
808 /* Inner loop uses 165 flops */
811 /* End of innermost loop */
813 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
814 f
+i_coord_offset
,fshift
+i_shift_offset
);
817 /* Update potential energies */
818 gmx_mm256_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
819 gmx_mm256_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
821 /* Increment number of inner iterations */
822 inneriter
+= j_index_end
- j_index_start
;
824 /* Outer loop uses 20 flops */
827 /* Increment number of outer iterations */
830 /* Update outer/inner flops */
832 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_VF
,outeriter
*20 + inneriter
*165);
835 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_single
836 * Electrostatics interaction: CubicSplineTable
837 * VdW interaction: CubicSplineTable
838 * Geometry: Water3-Particle
839 * Calculate force/pot: Force
842 nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_F_avx_256_single
843 (t_nblist
* gmx_restrict nlist
,
844 rvec
* gmx_restrict xx
,
845 rvec
* gmx_restrict ff
,
846 t_forcerec
* gmx_restrict fr
,
847 t_mdatoms
* gmx_restrict mdatoms
,
848 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
849 t_nrnb
* gmx_restrict nrnb
)
851 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
852 * just 0 for non-waters.
853 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. to the eight different
854 * jnr indices corresponding to data put in the eight positions in the SIMD register.
856 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
857 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
858 int jnrA
,jnrB
,jnrC
,jnrD
;
859 int jnrE
,jnrF
,jnrG
,jnrH
;
860 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
861 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
862 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
863 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
864 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
866 real
*shiftvec
,*fshift
,*x
,*f
;
867 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
869 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
870 real
* vdwioffsetptr0
;
871 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
872 real
* vdwioffsetptr1
;
873 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
874 real
* vdwioffsetptr2
;
875 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
876 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
877 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
878 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
879 __m256 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
880 __m256 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
881 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
884 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
887 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
888 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
890 __m128i vfitab_lo
,vfitab_hi
;
891 __m128i ifour
= _mm_set1_epi32(4);
892 __m256 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
894 __m256 dummy_mask
,cutoff_mask
;
895 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
896 __m256 one
= _mm256_set1_ps(1.0);
897 __m256 two
= _mm256_set1_ps(2.0);
903 jindex
= nlist
->jindex
;
905 shiftidx
= nlist
->shift
;
907 shiftvec
= fr
->shift_vec
[0];
908 fshift
= fr
->fshift
[0];
909 facel
= _mm256_set1_ps(fr
->epsfac
);
910 charge
= mdatoms
->chargeA
;
911 nvdwtype
= fr
->ntype
;
913 vdwtype
= mdatoms
->typeA
;
915 vftab
= kernel_data
->table_elec_vdw
->data
;
916 vftabscale
= _mm256_set1_ps(kernel_data
->table_elec_vdw
->scale
);
918 /* Setup water-specific parameters */
919 inr
= nlist
->iinr
[0];
920 iq0
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+0]));
921 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
922 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
923 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
925 /* Zero-initialize the j atom indices to avoid spurious compiler warnings */
926 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
939 for(iidx
=0;iidx
<4*DIM
;iidx
++)
944 /* Start outer loop over neighborlists */
945 for(iidx
=0; iidx
<nri
; iidx
++)
947 /* Load shift vector for this list */
948 i_shift_offset
= DIM
*shiftidx
[iidx
];
950 /* Load limits for loop over neighbors */
951 j_index_start
= jindex
[iidx
];
952 j_index_end
= jindex
[iidx
+1];
954 /* Get outer coordinate index */
956 i_coord_offset
= DIM
*inr
;
958 /* Load i particle coords and add shift vector */
959 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
960 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
962 fix0
= _mm256_setzero_ps();
963 fiy0
= _mm256_setzero_ps();
964 fiz0
= _mm256_setzero_ps();
965 fix1
= _mm256_setzero_ps();
966 fiy1
= _mm256_setzero_ps();
967 fiz1
= _mm256_setzero_ps();
968 fix2
= _mm256_setzero_ps();
969 fiy2
= _mm256_setzero_ps();
970 fiz2
= _mm256_setzero_ps();
972 /* Start inner kernel loop */
973 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
976 /* Get j neighbor index, and coordinate index */
985 j_coord_offsetA
= DIM
*jnrA
;
986 j_coord_offsetB
= DIM
*jnrB
;
987 j_coord_offsetC
= DIM
*jnrC
;
988 j_coord_offsetD
= DIM
*jnrD
;
989 j_coord_offsetE
= DIM
*jnrE
;
990 j_coord_offsetF
= DIM
*jnrF
;
991 j_coord_offsetG
= DIM
*jnrG
;
992 j_coord_offsetH
= DIM
*jnrH
;
994 /* load j atom coordinates */
995 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
996 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
997 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
998 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1001 /* Calculate displacement vector */
1002 dx00
= _mm256_sub_ps(ix0
,jx0
);
1003 dy00
= _mm256_sub_ps(iy0
,jy0
);
1004 dz00
= _mm256_sub_ps(iz0
,jz0
);
1005 dx10
= _mm256_sub_ps(ix1
,jx0
);
1006 dy10
= _mm256_sub_ps(iy1
,jy0
);
1007 dz10
= _mm256_sub_ps(iz1
,jz0
);
1008 dx20
= _mm256_sub_ps(ix2
,jx0
);
1009 dy20
= _mm256_sub_ps(iy2
,jy0
);
1010 dz20
= _mm256_sub_ps(iz2
,jz0
);
1012 /* Calculate squared distance and things based on it */
1013 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1014 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
1015 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
1017 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
1018 rinv10
= gmx_mm256_invsqrt_ps(rsq10
);
1019 rinv20
= gmx_mm256_invsqrt_ps(rsq20
);
1021 /* Load parameters for j particles */
1022 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1023 charge
+jnrC
+0,charge
+jnrD
+0,
1024 charge
+jnrE
+0,charge
+jnrF
+0,
1025 charge
+jnrG
+0,charge
+jnrH
+0);
1026 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1027 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
1028 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
1029 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
1030 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
1031 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
1032 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
1033 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
1035 fjx0
= _mm256_setzero_ps();
1036 fjy0
= _mm256_setzero_ps();
1037 fjz0
= _mm256_setzero_ps();
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1045 /* Compute parameters for interactions between i and j atoms */
1046 qq00
= _mm256_mul_ps(iq0
,jq0
);
1047 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
1048 vdwioffsetptr0
+vdwjidx0B
,
1049 vdwioffsetptr0
+vdwjidx0C
,
1050 vdwioffsetptr0
+vdwjidx0D
,
1051 vdwioffsetptr0
+vdwjidx0E
,
1052 vdwioffsetptr0
+vdwjidx0F
,
1053 vdwioffsetptr0
+vdwjidx0G
,
1054 vdwioffsetptr0
+vdwjidx0H
,
1057 /* Calculate table index by multiplying r with table scale and truncate to integer */
1058 rt
= _mm256_mul_ps(r00
,vftabscale
);
1059 vfitab
= _mm256_cvttps_epi32(rt
);
1060 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1061 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1062 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1063 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1064 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1065 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1067 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1068 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1069 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1070 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1071 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1072 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1073 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1074 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1075 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1076 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1077 Heps
= _mm256_mul_ps(vfeps
,H
);
1078 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1079 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1080 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq00
,FF
),_mm256_mul_ps(vftabscale
,rinv00
)));
1082 /* CUBIC SPLINE TABLE DISPERSION */
1083 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1084 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1085 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1086 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1087 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1088 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1089 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1090 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1091 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1092 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1093 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1094 Heps
= _mm256_mul_ps(vfeps
,H
);
1095 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1096 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1097 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
1099 /* CUBIC SPLINE TABLE REPULSION */
1100 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1101 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1102 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1103 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1104 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1105 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1106 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1107 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1108 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1109 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1110 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1111 Heps
= _mm256_mul_ps(vfeps
,H
);
1112 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1113 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1114 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
1115 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
1117 fscal
= _mm256_add_ps(felec
,fvdw
);
1119 /* Calculate temporary vectorial force */
1120 tx
= _mm256_mul_ps(fscal
,dx00
);
1121 ty
= _mm256_mul_ps(fscal
,dy00
);
1122 tz
= _mm256_mul_ps(fscal
,dz00
);
1124 /* Update vectorial force */
1125 fix0
= _mm256_add_ps(fix0
,tx
);
1126 fiy0
= _mm256_add_ps(fiy0
,ty
);
1127 fiz0
= _mm256_add_ps(fiz0
,tz
);
1129 fjx0
= _mm256_add_ps(fjx0
,tx
);
1130 fjy0
= _mm256_add_ps(fjy0
,ty
);
1131 fjz0
= _mm256_add_ps(fjz0
,tz
);
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1137 r10
= _mm256_mul_ps(rsq10
,rinv10
);
1139 /* Compute parameters for interactions between i and j atoms */
1140 qq10
= _mm256_mul_ps(iq1
,jq0
);
1142 /* Calculate table index by multiplying r with table scale and truncate to integer */
1143 rt
= _mm256_mul_ps(r10
,vftabscale
);
1144 vfitab
= _mm256_cvttps_epi32(rt
);
1145 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1146 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1147 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1148 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1149 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1150 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1152 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1153 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1154 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1155 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1156 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1157 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1158 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1159 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1160 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1161 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1162 Heps
= _mm256_mul_ps(vfeps
,H
);
1163 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1164 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1165 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq10
,FF
),_mm256_mul_ps(vftabscale
,rinv10
)));
1169 /* Calculate temporary vectorial force */
1170 tx
= _mm256_mul_ps(fscal
,dx10
);
1171 ty
= _mm256_mul_ps(fscal
,dy10
);
1172 tz
= _mm256_mul_ps(fscal
,dz10
);
1174 /* Update vectorial force */
1175 fix1
= _mm256_add_ps(fix1
,tx
);
1176 fiy1
= _mm256_add_ps(fiy1
,ty
);
1177 fiz1
= _mm256_add_ps(fiz1
,tz
);
1179 fjx0
= _mm256_add_ps(fjx0
,tx
);
1180 fjy0
= _mm256_add_ps(fjy0
,ty
);
1181 fjz0
= _mm256_add_ps(fjz0
,tz
);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 r20
= _mm256_mul_ps(rsq20
,rinv20
);
1189 /* Compute parameters for interactions between i and j atoms */
1190 qq20
= _mm256_mul_ps(iq2
,jq0
);
1192 /* Calculate table index by multiplying r with table scale and truncate to integer */
1193 rt
= _mm256_mul_ps(r20
,vftabscale
);
1194 vfitab
= _mm256_cvttps_epi32(rt
);
1195 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1196 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1197 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1198 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1199 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1200 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1202 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1203 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1204 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1205 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1206 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1207 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1208 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1209 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1210 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1211 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1212 Heps
= _mm256_mul_ps(vfeps
,H
);
1213 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1214 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1215 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq20
,FF
),_mm256_mul_ps(vftabscale
,rinv20
)));
1219 /* Calculate temporary vectorial force */
1220 tx
= _mm256_mul_ps(fscal
,dx20
);
1221 ty
= _mm256_mul_ps(fscal
,dy20
);
1222 tz
= _mm256_mul_ps(fscal
,dz20
);
1224 /* Update vectorial force */
1225 fix2
= _mm256_add_ps(fix2
,tx
);
1226 fiy2
= _mm256_add_ps(fiy2
,ty
);
1227 fiz2
= _mm256_add_ps(fiz2
,tz
);
1229 fjx0
= _mm256_add_ps(fjx0
,tx
);
1230 fjy0
= _mm256_add_ps(fjy0
,ty
);
1231 fjz0
= _mm256_add_ps(fjz0
,tz
);
1233 fjptrA
= f
+j_coord_offsetA
;
1234 fjptrB
= f
+j_coord_offsetB
;
1235 fjptrC
= f
+j_coord_offsetC
;
1236 fjptrD
= f
+j_coord_offsetD
;
1237 fjptrE
= f
+j_coord_offsetE
;
1238 fjptrF
= f
+j_coord_offsetF
;
1239 fjptrG
= f
+j_coord_offsetG
;
1240 fjptrH
= f
+j_coord_offsetH
;
1242 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
1244 /* Inner loop uses 142 flops */
1247 if(jidx
<j_index_end
)
1250 /* Get j neighbor index, and coordinate index */
1251 jnrlistA
= jjnr
[jidx
];
1252 jnrlistB
= jjnr
[jidx
+1];
1253 jnrlistC
= jjnr
[jidx
+2];
1254 jnrlistD
= jjnr
[jidx
+3];
1255 jnrlistE
= jjnr
[jidx
+4];
1256 jnrlistF
= jjnr
[jidx
+5];
1257 jnrlistG
= jjnr
[jidx
+6];
1258 jnrlistH
= jjnr
[jidx
+7];
1259 /* Sign of each element will be negative for non-real atoms.
1260 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1261 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1263 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
1264 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
1266 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1267 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1268 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1269 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1270 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
1271 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
1272 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
1273 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
1274 j_coord_offsetA
= DIM
*jnrA
;
1275 j_coord_offsetB
= DIM
*jnrB
;
1276 j_coord_offsetC
= DIM
*jnrC
;
1277 j_coord_offsetD
= DIM
*jnrD
;
1278 j_coord_offsetE
= DIM
*jnrE
;
1279 j_coord_offsetF
= DIM
*jnrF
;
1280 j_coord_offsetG
= DIM
*jnrG
;
1281 j_coord_offsetH
= DIM
*jnrH
;
1283 /* load j atom coordinates */
1284 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1285 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1286 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1287 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1290 /* Calculate displacement vector */
1291 dx00
= _mm256_sub_ps(ix0
,jx0
);
1292 dy00
= _mm256_sub_ps(iy0
,jy0
);
1293 dz00
= _mm256_sub_ps(iz0
,jz0
);
1294 dx10
= _mm256_sub_ps(ix1
,jx0
);
1295 dy10
= _mm256_sub_ps(iy1
,jy0
);
1296 dz10
= _mm256_sub_ps(iz1
,jz0
);
1297 dx20
= _mm256_sub_ps(ix2
,jx0
);
1298 dy20
= _mm256_sub_ps(iy2
,jy0
);
1299 dz20
= _mm256_sub_ps(iz2
,jz0
);
1301 /* Calculate squared distance and things based on it */
1302 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1303 rsq10
= gmx_mm256_calc_rsq_ps(dx10
,dy10
,dz10
);
1304 rsq20
= gmx_mm256_calc_rsq_ps(dx20
,dy20
,dz20
);
1306 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
1307 rinv10
= gmx_mm256_invsqrt_ps(rsq10
);
1308 rinv20
= gmx_mm256_invsqrt_ps(rsq20
);
1310 /* Load parameters for j particles */
1311 jq0
= gmx_mm256_load_8real_swizzle_ps(charge
+jnrA
+0,charge
+jnrB
+0,
1312 charge
+jnrC
+0,charge
+jnrD
+0,
1313 charge
+jnrE
+0,charge
+jnrF
+0,
1314 charge
+jnrG
+0,charge
+jnrH
+0);
1315 vdwjidx0A
= 2*vdwtype
[jnrA
+0];
1316 vdwjidx0B
= 2*vdwtype
[jnrB
+0];
1317 vdwjidx0C
= 2*vdwtype
[jnrC
+0];
1318 vdwjidx0D
= 2*vdwtype
[jnrD
+0];
1319 vdwjidx0E
= 2*vdwtype
[jnrE
+0];
1320 vdwjidx0F
= 2*vdwtype
[jnrF
+0];
1321 vdwjidx0G
= 2*vdwtype
[jnrG
+0];
1322 vdwjidx0H
= 2*vdwtype
[jnrH
+0];
1324 fjx0
= _mm256_setzero_ps();
1325 fjy0
= _mm256_setzero_ps();
1326 fjz0
= _mm256_setzero_ps();
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1333 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
1335 /* Compute parameters for interactions between i and j atoms */
1336 qq00
= _mm256_mul_ps(iq0
,jq0
);
1337 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0
+vdwjidx0A
,
1338 vdwioffsetptr0
+vdwjidx0B
,
1339 vdwioffsetptr0
+vdwjidx0C
,
1340 vdwioffsetptr0
+vdwjidx0D
,
1341 vdwioffsetptr0
+vdwjidx0E
,
1342 vdwioffsetptr0
+vdwjidx0F
,
1343 vdwioffsetptr0
+vdwjidx0G
,
1344 vdwioffsetptr0
+vdwjidx0H
,
1347 /* Calculate table index by multiplying r with table scale and truncate to integer */
1348 rt
= _mm256_mul_ps(r00
,vftabscale
);
1349 vfitab
= _mm256_cvttps_epi32(rt
);
1350 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1351 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1352 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1353 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1354 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1355 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1357 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1358 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1359 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1360 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1361 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1362 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1363 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1364 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1365 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1366 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1367 Heps
= _mm256_mul_ps(vfeps
,H
);
1368 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1369 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1370 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq00
,FF
),_mm256_mul_ps(vftabscale
,rinv00
)));
1372 /* CUBIC SPLINE TABLE DISPERSION */
1373 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1374 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1375 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1376 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1377 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1378 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1379 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1380 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1381 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1382 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1383 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1384 Heps
= _mm256_mul_ps(vfeps
,H
);
1385 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1386 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1387 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
1389 /* CUBIC SPLINE TABLE REPULSION */
1390 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1391 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1392 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1393 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1394 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1395 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1396 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1397 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1398 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1399 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1400 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1401 Heps
= _mm256_mul_ps(vfeps
,H
);
1402 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1403 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1404 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
1405 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
1407 fscal
= _mm256_add_ps(felec
,fvdw
);
1409 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1411 /* Calculate temporary vectorial force */
1412 tx
= _mm256_mul_ps(fscal
,dx00
);
1413 ty
= _mm256_mul_ps(fscal
,dy00
);
1414 tz
= _mm256_mul_ps(fscal
,dz00
);
1416 /* Update vectorial force */
1417 fix0
= _mm256_add_ps(fix0
,tx
);
1418 fiy0
= _mm256_add_ps(fiy0
,ty
);
1419 fiz0
= _mm256_add_ps(fiz0
,tz
);
1421 fjx0
= _mm256_add_ps(fjx0
,tx
);
1422 fjy0
= _mm256_add_ps(fjy0
,ty
);
1423 fjz0
= _mm256_add_ps(fjz0
,tz
);
1425 /**************************
1426 * CALCULATE INTERACTIONS *
1427 **************************/
1429 r10
= _mm256_mul_ps(rsq10
,rinv10
);
1430 r10
= _mm256_andnot_ps(dummy_mask
,r10
);
1432 /* Compute parameters for interactions between i and j atoms */
1433 qq10
= _mm256_mul_ps(iq1
,jq0
);
1435 /* Calculate table index by multiplying r with table scale and truncate to integer */
1436 rt
= _mm256_mul_ps(r10
,vftabscale
);
1437 vfitab
= _mm256_cvttps_epi32(rt
);
1438 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1439 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1440 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1441 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1442 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1443 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1445 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1446 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1447 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1448 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1449 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1450 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1451 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1452 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1453 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1454 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1455 Heps
= _mm256_mul_ps(vfeps
,H
);
1456 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1457 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1458 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq10
,FF
),_mm256_mul_ps(vftabscale
,rinv10
)));
1462 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1464 /* Calculate temporary vectorial force */
1465 tx
= _mm256_mul_ps(fscal
,dx10
);
1466 ty
= _mm256_mul_ps(fscal
,dy10
);
1467 tz
= _mm256_mul_ps(fscal
,dz10
);
1469 /* Update vectorial force */
1470 fix1
= _mm256_add_ps(fix1
,tx
);
1471 fiy1
= _mm256_add_ps(fiy1
,ty
);
1472 fiz1
= _mm256_add_ps(fiz1
,tz
);
1474 fjx0
= _mm256_add_ps(fjx0
,tx
);
1475 fjy0
= _mm256_add_ps(fjy0
,ty
);
1476 fjz0
= _mm256_add_ps(fjz0
,tz
);
1478 /**************************
1479 * CALCULATE INTERACTIONS *
1480 **************************/
1482 r20
= _mm256_mul_ps(rsq20
,rinv20
);
1483 r20
= _mm256_andnot_ps(dummy_mask
,r20
);
1485 /* Compute parameters for interactions between i and j atoms */
1486 qq20
= _mm256_mul_ps(iq2
,jq0
);
1488 /* Calculate table index by multiplying r with table scale and truncate to integer */
1489 rt
= _mm256_mul_ps(r20
,vftabscale
);
1490 vfitab
= _mm256_cvttps_epi32(rt
);
1491 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1492 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1493 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1494 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1495 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1496 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1498 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1499 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1500 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1501 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1502 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1503 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1504 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1505 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1506 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1507 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1508 Heps
= _mm256_mul_ps(vfeps
,H
);
1509 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1510 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1511 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq20
,FF
),_mm256_mul_ps(vftabscale
,rinv20
)));
1515 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1517 /* Calculate temporary vectorial force */
1518 tx
= _mm256_mul_ps(fscal
,dx20
);
1519 ty
= _mm256_mul_ps(fscal
,dy20
);
1520 tz
= _mm256_mul_ps(fscal
,dz20
);
1522 /* Update vectorial force */
1523 fix2
= _mm256_add_ps(fix2
,tx
);
1524 fiy2
= _mm256_add_ps(fiy2
,ty
);
1525 fiz2
= _mm256_add_ps(fiz2
,tz
);
1527 fjx0
= _mm256_add_ps(fjx0
,tx
);
1528 fjy0
= _mm256_add_ps(fjy0
,ty
);
1529 fjz0
= _mm256_add_ps(fjz0
,tz
);
1531 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1532 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1533 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1534 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1535 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
1536 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
1537 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
1538 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
1540 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,fjx0
,fjy0
,fjz0
);
1542 /* Inner loop uses 145 flops */
1545 /* End of innermost loop */
1547 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1548 f
+i_coord_offset
,fshift
+i_shift_offset
);
1550 /* Increment number of inner iterations */
1551 inneriter
+= j_index_end
- j_index_start
;
1553 /* Outer loop uses 18 flops */
1556 /* Increment number of outer iterations */
1559 /* Update outer/inner flops */
1561 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3_F
,outeriter
*18 + inneriter
*145);