2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int jnrA
,jnrB
,jnrC
,jnrD
;
75 int jnrE
,jnrF
,jnrG
,jnrH
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
78 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
79 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
80 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
82 real
*shiftvec
,*fshift
,*x
,*f
;
83 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
85 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
86 real
* vdwioffsetptr0
;
87 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
88 real
* vdwioffsetptr1
;
89 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
90 real
* vdwioffsetptr2
;
91 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
92 real
* vdwioffsetptr3
;
93 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
94 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
95 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
96 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
,vdwjidx1E
,vdwjidx1F
,vdwjidx1G
,vdwjidx1H
;
97 __m256 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
98 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
,vdwjidx2E
,vdwjidx2F
,vdwjidx2G
,vdwjidx2H
;
99 __m256 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
100 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
,vdwjidx3E
,vdwjidx3F
,vdwjidx3G
,vdwjidx3H
;
101 __m256 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
102 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
103 __m256 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
104 __m256 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
105 __m256 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
106 __m256 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
107 __m256 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
108 __m256 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
109 __m256 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
110 __m256 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
111 __m256 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
112 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
115 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
118 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
119 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
121 __m128i vfitab_lo
,vfitab_hi
;
122 __m128i ifour
= _mm_set1_epi32(4);
123 __m256 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
125 __m256 dummy_mask
,cutoff_mask
;
126 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
127 __m256 one
= _mm256_set1_ps(1.0);
128 __m256 two
= _mm256_set1_ps(2.0);
134 jindex
= nlist
->jindex
;
136 shiftidx
= nlist
->shift
;
138 shiftvec
= fr
->shift_vec
[0];
139 fshift
= fr
->fshift
[0];
140 facel
= _mm256_set1_ps(fr
->epsfac
);
141 charge
= mdatoms
->chargeA
;
142 nvdwtype
= fr
->ntype
;
144 vdwtype
= mdatoms
->typeA
;
146 vftab
= kernel_data
->table_elec_vdw
->data
;
147 vftabscale
= _mm256_set1_ps(kernel_data
->table_elec_vdw
->scale
);
149 /* Setup water-specific parameters */
150 inr
= nlist
->iinr
[0];
151 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
152 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
153 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
154 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
156 jq1
= _mm256_set1_ps(charge
[inr
+1]);
157 jq2
= _mm256_set1_ps(charge
[inr
+2]);
158 jq3
= _mm256_set1_ps(charge
[inr
+3]);
159 vdwjidx0A
= 2*vdwtype
[inr
+0];
160 c6_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
]);
161 c12_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
+1]);
162 qq11
= _mm256_mul_ps(iq1
,jq1
);
163 qq12
= _mm256_mul_ps(iq1
,jq2
);
164 qq13
= _mm256_mul_ps(iq1
,jq3
);
165 qq21
= _mm256_mul_ps(iq2
,jq1
);
166 qq22
= _mm256_mul_ps(iq2
,jq2
);
167 qq23
= _mm256_mul_ps(iq2
,jq3
);
168 qq31
= _mm256_mul_ps(iq3
,jq1
);
169 qq32
= _mm256_mul_ps(iq3
,jq2
);
170 qq33
= _mm256_mul_ps(iq3
,jq3
);
172 /* Avoid stupid compiler warnings */
173 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
186 for(iidx
=0;iidx
<4*DIM
;iidx
++)
191 /* Start outer loop over neighborlists */
192 for(iidx
=0; iidx
<nri
; iidx
++)
194 /* Load shift vector for this list */
195 i_shift_offset
= DIM
*shiftidx
[iidx
];
197 /* Load limits for loop over neighbors */
198 j_index_start
= jindex
[iidx
];
199 j_index_end
= jindex
[iidx
+1];
201 /* Get outer coordinate index */
203 i_coord_offset
= DIM
*inr
;
205 /* Load i particle coords and add shift vector */
206 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
207 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
209 fix0
= _mm256_setzero_ps();
210 fiy0
= _mm256_setzero_ps();
211 fiz0
= _mm256_setzero_ps();
212 fix1
= _mm256_setzero_ps();
213 fiy1
= _mm256_setzero_ps();
214 fiz1
= _mm256_setzero_ps();
215 fix2
= _mm256_setzero_ps();
216 fiy2
= _mm256_setzero_ps();
217 fiz2
= _mm256_setzero_ps();
218 fix3
= _mm256_setzero_ps();
219 fiy3
= _mm256_setzero_ps();
220 fiz3
= _mm256_setzero_ps();
222 /* Reset potential sums */
223 velecsum
= _mm256_setzero_ps();
224 vvdwsum
= _mm256_setzero_ps();
226 /* Start inner kernel loop */
227 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
230 /* Get j neighbor index, and coordinate index */
239 j_coord_offsetA
= DIM
*jnrA
;
240 j_coord_offsetB
= DIM
*jnrB
;
241 j_coord_offsetC
= DIM
*jnrC
;
242 j_coord_offsetD
= DIM
*jnrD
;
243 j_coord_offsetE
= DIM
*jnrE
;
244 j_coord_offsetF
= DIM
*jnrF
;
245 j_coord_offsetG
= DIM
*jnrG
;
246 j_coord_offsetH
= DIM
*jnrH
;
248 /* load j atom coordinates */
249 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
250 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
251 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
252 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
253 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
254 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
256 /* Calculate displacement vector */
257 dx00
= _mm256_sub_ps(ix0
,jx0
);
258 dy00
= _mm256_sub_ps(iy0
,jy0
);
259 dz00
= _mm256_sub_ps(iz0
,jz0
);
260 dx11
= _mm256_sub_ps(ix1
,jx1
);
261 dy11
= _mm256_sub_ps(iy1
,jy1
);
262 dz11
= _mm256_sub_ps(iz1
,jz1
);
263 dx12
= _mm256_sub_ps(ix1
,jx2
);
264 dy12
= _mm256_sub_ps(iy1
,jy2
);
265 dz12
= _mm256_sub_ps(iz1
,jz2
);
266 dx13
= _mm256_sub_ps(ix1
,jx3
);
267 dy13
= _mm256_sub_ps(iy1
,jy3
);
268 dz13
= _mm256_sub_ps(iz1
,jz3
);
269 dx21
= _mm256_sub_ps(ix2
,jx1
);
270 dy21
= _mm256_sub_ps(iy2
,jy1
);
271 dz21
= _mm256_sub_ps(iz2
,jz1
);
272 dx22
= _mm256_sub_ps(ix2
,jx2
);
273 dy22
= _mm256_sub_ps(iy2
,jy2
);
274 dz22
= _mm256_sub_ps(iz2
,jz2
);
275 dx23
= _mm256_sub_ps(ix2
,jx3
);
276 dy23
= _mm256_sub_ps(iy2
,jy3
);
277 dz23
= _mm256_sub_ps(iz2
,jz3
);
278 dx31
= _mm256_sub_ps(ix3
,jx1
);
279 dy31
= _mm256_sub_ps(iy3
,jy1
);
280 dz31
= _mm256_sub_ps(iz3
,jz1
);
281 dx32
= _mm256_sub_ps(ix3
,jx2
);
282 dy32
= _mm256_sub_ps(iy3
,jy2
);
283 dz32
= _mm256_sub_ps(iz3
,jz2
);
284 dx33
= _mm256_sub_ps(ix3
,jx3
);
285 dy33
= _mm256_sub_ps(iy3
,jy3
);
286 dz33
= _mm256_sub_ps(iz3
,jz3
);
288 /* Calculate squared distance and things based on it */
289 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
290 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
291 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
292 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
293 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
294 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
295 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
296 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
297 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
298 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
300 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
301 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
302 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
303 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
304 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
305 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
306 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
307 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
308 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
309 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
311 fjx0
= _mm256_setzero_ps();
312 fjy0
= _mm256_setzero_ps();
313 fjz0
= _mm256_setzero_ps();
314 fjx1
= _mm256_setzero_ps();
315 fjy1
= _mm256_setzero_ps();
316 fjz1
= _mm256_setzero_ps();
317 fjx2
= _mm256_setzero_ps();
318 fjy2
= _mm256_setzero_ps();
319 fjz2
= _mm256_setzero_ps();
320 fjx3
= _mm256_setzero_ps();
321 fjy3
= _mm256_setzero_ps();
322 fjz3
= _mm256_setzero_ps();
324 /**************************
325 * CALCULATE INTERACTIONS *
326 **************************/
328 r00
= _mm256_mul_ps(rsq00
,rinv00
);
330 /* Calculate table index by multiplying r with table scale and truncate to integer */
331 rt
= _mm256_mul_ps(r00
,vftabscale
);
332 vfitab
= _mm256_cvttps_epi32(rt
);
333 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
334 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
335 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
336 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
337 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
338 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
340 /* CUBIC SPLINE TABLE DISPERSION */
341 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
342 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
343 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
344 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
345 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
346 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
347 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
348 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
349 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
350 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
351 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
352 Heps
= _mm256_mul_ps(vfeps
,H
);
353 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
354 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
355 vvdw6
= _mm256_mul_ps(c6_00
,VV
);
356 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
357 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
359 /* CUBIC SPLINE TABLE REPULSION */
360 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
361 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
362 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
363 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
364 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
365 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
366 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
367 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
368 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
369 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
370 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
371 Heps
= _mm256_mul_ps(vfeps
,H
);
372 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
373 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
374 vvdw12
= _mm256_mul_ps(c12_00
,VV
);
375 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
376 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
377 vvdw
= _mm256_add_ps(vvdw12
,vvdw6
);
378 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
385 /* Calculate temporary vectorial force */
386 tx
= _mm256_mul_ps(fscal
,dx00
);
387 ty
= _mm256_mul_ps(fscal
,dy00
);
388 tz
= _mm256_mul_ps(fscal
,dz00
);
390 /* Update vectorial force */
391 fix0
= _mm256_add_ps(fix0
,tx
);
392 fiy0
= _mm256_add_ps(fiy0
,ty
);
393 fiz0
= _mm256_add_ps(fiz0
,tz
);
395 fjx0
= _mm256_add_ps(fjx0
,tx
);
396 fjy0
= _mm256_add_ps(fjy0
,ty
);
397 fjz0
= _mm256_add_ps(fjz0
,tz
);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 r11
= _mm256_mul_ps(rsq11
,rinv11
);
405 /* Calculate table index by multiplying r with table scale and truncate to integer */
406 rt
= _mm256_mul_ps(r11
,vftabscale
);
407 vfitab
= _mm256_cvttps_epi32(rt
);
408 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
409 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
410 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
411 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
412 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
413 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
415 /* CUBIC SPLINE TABLE ELECTROSTATICS */
416 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
417 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
418 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
419 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
420 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
421 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
422 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
423 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
424 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
425 Heps
= _mm256_mul_ps(vfeps
,H
);
426 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
427 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
428 velec
= _mm256_mul_ps(qq11
,VV
);
429 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
430 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq11
,FF
),_mm256_mul_ps(vftabscale
,rinv11
)));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum
= _mm256_add_ps(velecsum
,velec
);
437 /* Calculate temporary vectorial force */
438 tx
= _mm256_mul_ps(fscal
,dx11
);
439 ty
= _mm256_mul_ps(fscal
,dy11
);
440 tz
= _mm256_mul_ps(fscal
,dz11
);
442 /* Update vectorial force */
443 fix1
= _mm256_add_ps(fix1
,tx
);
444 fiy1
= _mm256_add_ps(fiy1
,ty
);
445 fiz1
= _mm256_add_ps(fiz1
,tz
);
447 fjx1
= _mm256_add_ps(fjx1
,tx
);
448 fjy1
= _mm256_add_ps(fjy1
,ty
);
449 fjz1
= _mm256_add_ps(fjz1
,tz
);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 r12
= _mm256_mul_ps(rsq12
,rinv12
);
457 /* Calculate table index by multiplying r with table scale and truncate to integer */
458 rt
= _mm256_mul_ps(r12
,vftabscale
);
459 vfitab
= _mm256_cvttps_epi32(rt
);
460 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
461 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
462 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
463 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
464 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
465 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
467 /* CUBIC SPLINE TABLE ELECTROSTATICS */
468 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
469 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
470 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
471 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
472 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
473 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
474 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
475 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
476 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
477 Heps
= _mm256_mul_ps(vfeps
,H
);
478 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
479 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
480 velec
= _mm256_mul_ps(qq12
,VV
);
481 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
482 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq12
,FF
),_mm256_mul_ps(vftabscale
,rinv12
)));
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum
= _mm256_add_ps(velecsum
,velec
);
489 /* Calculate temporary vectorial force */
490 tx
= _mm256_mul_ps(fscal
,dx12
);
491 ty
= _mm256_mul_ps(fscal
,dy12
);
492 tz
= _mm256_mul_ps(fscal
,dz12
);
494 /* Update vectorial force */
495 fix1
= _mm256_add_ps(fix1
,tx
);
496 fiy1
= _mm256_add_ps(fiy1
,ty
);
497 fiz1
= _mm256_add_ps(fiz1
,tz
);
499 fjx2
= _mm256_add_ps(fjx2
,tx
);
500 fjy2
= _mm256_add_ps(fjy2
,ty
);
501 fjz2
= _mm256_add_ps(fjz2
,tz
);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 r13
= _mm256_mul_ps(rsq13
,rinv13
);
509 /* Calculate table index by multiplying r with table scale and truncate to integer */
510 rt
= _mm256_mul_ps(r13
,vftabscale
);
511 vfitab
= _mm256_cvttps_epi32(rt
);
512 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
513 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
514 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
515 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
516 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
517 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
519 /* CUBIC SPLINE TABLE ELECTROSTATICS */
520 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
521 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
522 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
523 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
524 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
525 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
526 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
527 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
528 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
529 Heps
= _mm256_mul_ps(vfeps
,H
);
530 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
531 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
532 velec
= _mm256_mul_ps(qq13
,VV
);
533 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
534 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq13
,FF
),_mm256_mul_ps(vftabscale
,rinv13
)));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum
= _mm256_add_ps(velecsum
,velec
);
541 /* Calculate temporary vectorial force */
542 tx
= _mm256_mul_ps(fscal
,dx13
);
543 ty
= _mm256_mul_ps(fscal
,dy13
);
544 tz
= _mm256_mul_ps(fscal
,dz13
);
546 /* Update vectorial force */
547 fix1
= _mm256_add_ps(fix1
,tx
);
548 fiy1
= _mm256_add_ps(fiy1
,ty
);
549 fiz1
= _mm256_add_ps(fiz1
,tz
);
551 fjx3
= _mm256_add_ps(fjx3
,tx
);
552 fjy3
= _mm256_add_ps(fjy3
,ty
);
553 fjz3
= _mm256_add_ps(fjz3
,tz
);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r21
= _mm256_mul_ps(rsq21
,rinv21
);
561 /* Calculate table index by multiplying r with table scale and truncate to integer */
562 rt
= _mm256_mul_ps(r21
,vftabscale
);
563 vfitab
= _mm256_cvttps_epi32(rt
);
564 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
565 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
566 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
567 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
568 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
569 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
572 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
573 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
574 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
575 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
576 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
577 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
578 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
579 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
580 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
581 Heps
= _mm256_mul_ps(vfeps
,H
);
582 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
583 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
584 velec
= _mm256_mul_ps(qq21
,VV
);
585 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
586 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq21
,FF
),_mm256_mul_ps(vftabscale
,rinv21
)));
588 /* Update potential sum for this i atom from the interaction with this j atom. */
589 velecsum
= _mm256_add_ps(velecsum
,velec
);
593 /* Calculate temporary vectorial force */
594 tx
= _mm256_mul_ps(fscal
,dx21
);
595 ty
= _mm256_mul_ps(fscal
,dy21
);
596 tz
= _mm256_mul_ps(fscal
,dz21
);
598 /* Update vectorial force */
599 fix2
= _mm256_add_ps(fix2
,tx
);
600 fiy2
= _mm256_add_ps(fiy2
,ty
);
601 fiz2
= _mm256_add_ps(fiz2
,tz
);
603 fjx1
= _mm256_add_ps(fjx1
,tx
);
604 fjy1
= _mm256_add_ps(fjy1
,ty
);
605 fjz1
= _mm256_add_ps(fjz1
,tz
);
607 /**************************
608 * CALCULATE INTERACTIONS *
609 **************************/
611 r22
= _mm256_mul_ps(rsq22
,rinv22
);
613 /* Calculate table index by multiplying r with table scale and truncate to integer */
614 rt
= _mm256_mul_ps(r22
,vftabscale
);
615 vfitab
= _mm256_cvttps_epi32(rt
);
616 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
617 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
618 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
619 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
620 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
621 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
623 /* CUBIC SPLINE TABLE ELECTROSTATICS */
624 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
625 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
626 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
627 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
628 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
629 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
630 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
631 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
632 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
633 Heps
= _mm256_mul_ps(vfeps
,H
);
634 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
635 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
636 velec
= _mm256_mul_ps(qq22
,VV
);
637 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
638 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq22
,FF
),_mm256_mul_ps(vftabscale
,rinv22
)));
640 /* Update potential sum for this i atom from the interaction with this j atom. */
641 velecsum
= _mm256_add_ps(velecsum
,velec
);
645 /* Calculate temporary vectorial force */
646 tx
= _mm256_mul_ps(fscal
,dx22
);
647 ty
= _mm256_mul_ps(fscal
,dy22
);
648 tz
= _mm256_mul_ps(fscal
,dz22
);
650 /* Update vectorial force */
651 fix2
= _mm256_add_ps(fix2
,tx
);
652 fiy2
= _mm256_add_ps(fiy2
,ty
);
653 fiz2
= _mm256_add_ps(fiz2
,tz
);
655 fjx2
= _mm256_add_ps(fjx2
,tx
);
656 fjy2
= _mm256_add_ps(fjy2
,ty
);
657 fjz2
= _mm256_add_ps(fjz2
,tz
);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 r23
= _mm256_mul_ps(rsq23
,rinv23
);
665 /* Calculate table index by multiplying r with table scale and truncate to integer */
666 rt
= _mm256_mul_ps(r23
,vftabscale
);
667 vfitab
= _mm256_cvttps_epi32(rt
);
668 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
669 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
670 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
671 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
672 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
673 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
675 /* CUBIC SPLINE TABLE ELECTROSTATICS */
676 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
677 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
678 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
679 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
680 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
681 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
682 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
683 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
684 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
685 Heps
= _mm256_mul_ps(vfeps
,H
);
686 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
687 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
688 velec
= _mm256_mul_ps(qq23
,VV
);
689 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
690 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq23
,FF
),_mm256_mul_ps(vftabscale
,rinv23
)));
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velecsum
= _mm256_add_ps(velecsum
,velec
);
697 /* Calculate temporary vectorial force */
698 tx
= _mm256_mul_ps(fscal
,dx23
);
699 ty
= _mm256_mul_ps(fscal
,dy23
);
700 tz
= _mm256_mul_ps(fscal
,dz23
);
702 /* Update vectorial force */
703 fix2
= _mm256_add_ps(fix2
,tx
);
704 fiy2
= _mm256_add_ps(fiy2
,ty
);
705 fiz2
= _mm256_add_ps(fiz2
,tz
);
707 fjx3
= _mm256_add_ps(fjx3
,tx
);
708 fjy3
= _mm256_add_ps(fjy3
,ty
);
709 fjz3
= _mm256_add_ps(fjz3
,tz
);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 r31
= _mm256_mul_ps(rsq31
,rinv31
);
717 /* Calculate table index by multiplying r with table scale and truncate to integer */
718 rt
= _mm256_mul_ps(r31
,vftabscale
);
719 vfitab
= _mm256_cvttps_epi32(rt
);
720 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
721 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
722 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
723 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
724 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
725 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
727 /* CUBIC SPLINE TABLE ELECTROSTATICS */
728 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
729 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
730 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
731 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
732 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
733 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
734 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
735 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
736 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
737 Heps
= _mm256_mul_ps(vfeps
,H
);
738 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
739 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
740 velec
= _mm256_mul_ps(qq31
,VV
);
741 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
742 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq31
,FF
),_mm256_mul_ps(vftabscale
,rinv31
)));
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velecsum
= _mm256_add_ps(velecsum
,velec
);
749 /* Calculate temporary vectorial force */
750 tx
= _mm256_mul_ps(fscal
,dx31
);
751 ty
= _mm256_mul_ps(fscal
,dy31
);
752 tz
= _mm256_mul_ps(fscal
,dz31
);
754 /* Update vectorial force */
755 fix3
= _mm256_add_ps(fix3
,tx
);
756 fiy3
= _mm256_add_ps(fiy3
,ty
);
757 fiz3
= _mm256_add_ps(fiz3
,tz
);
759 fjx1
= _mm256_add_ps(fjx1
,tx
);
760 fjy1
= _mm256_add_ps(fjy1
,ty
);
761 fjz1
= _mm256_add_ps(fjz1
,tz
);
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 r32
= _mm256_mul_ps(rsq32
,rinv32
);
769 /* Calculate table index by multiplying r with table scale and truncate to integer */
770 rt
= _mm256_mul_ps(r32
,vftabscale
);
771 vfitab
= _mm256_cvttps_epi32(rt
);
772 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
773 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
774 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
775 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
776 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
777 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
780 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
781 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
782 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
783 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
784 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
785 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
786 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
787 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
788 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
789 Heps
= _mm256_mul_ps(vfeps
,H
);
790 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
791 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
792 velec
= _mm256_mul_ps(qq32
,VV
);
793 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
794 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq32
,FF
),_mm256_mul_ps(vftabscale
,rinv32
)));
796 /* Update potential sum for this i atom from the interaction with this j atom. */
797 velecsum
= _mm256_add_ps(velecsum
,velec
);
801 /* Calculate temporary vectorial force */
802 tx
= _mm256_mul_ps(fscal
,dx32
);
803 ty
= _mm256_mul_ps(fscal
,dy32
);
804 tz
= _mm256_mul_ps(fscal
,dz32
);
806 /* Update vectorial force */
807 fix3
= _mm256_add_ps(fix3
,tx
);
808 fiy3
= _mm256_add_ps(fiy3
,ty
);
809 fiz3
= _mm256_add_ps(fiz3
,tz
);
811 fjx2
= _mm256_add_ps(fjx2
,tx
);
812 fjy2
= _mm256_add_ps(fjy2
,ty
);
813 fjz2
= _mm256_add_ps(fjz2
,tz
);
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 r33
= _mm256_mul_ps(rsq33
,rinv33
);
821 /* Calculate table index by multiplying r with table scale and truncate to integer */
822 rt
= _mm256_mul_ps(r33
,vftabscale
);
823 vfitab
= _mm256_cvttps_epi32(rt
);
824 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
825 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
826 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
827 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
828 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
829 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
831 /* CUBIC SPLINE TABLE ELECTROSTATICS */
832 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
833 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
834 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
835 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
836 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
837 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
838 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
839 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
840 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
841 Heps
= _mm256_mul_ps(vfeps
,H
);
842 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
843 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
844 velec
= _mm256_mul_ps(qq33
,VV
);
845 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
846 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq33
,FF
),_mm256_mul_ps(vftabscale
,rinv33
)));
848 /* Update potential sum for this i atom from the interaction with this j atom. */
849 velecsum
= _mm256_add_ps(velecsum
,velec
);
853 /* Calculate temporary vectorial force */
854 tx
= _mm256_mul_ps(fscal
,dx33
);
855 ty
= _mm256_mul_ps(fscal
,dy33
);
856 tz
= _mm256_mul_ps(fscal
,dz33
);
858 /* Update vectorial force */
859 fix3
= _mm256_add_ps(fix3
,tx
);
860 fiy3
= _mm256_add_ps(fiy3
,ty
);
861 fiz3
= _mm256_add_ps(fiz3
,tz
);
863 fjx3
= _mm256_add_ps(fjx3
,tx
);
864 fjy3
= _mm256_add_ps(fjy3
,ty
);
865 fjz3
= _mm256_add_ps(fjz3
,tz
);
867 fjptrA
= f
+j_coord_offsetA
;
868 fjptrB
= f
+j_coord_offsetB
;
869 fjptrC
= f
+j_coord_offsetC
;
870 fjptrD
= f
+j_coord_offsetD
;
871 fjptrE
= f
+j_coord_offsetE
;
872 fjptrF
= f
+j_coord_offsetF
;
873 fjptrG
= f
+j_coord_offsetG
;
874 fjptrH
= f
+j_coord_offsetH
;
876 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
877 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
878 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
880 /* Inner loop uses 446 flops */
886 /* Get j neighbor index, and coordinate index */
887 jnrlistA
= jjnr
[jidx
];
888 jnrlistB
= jjnr
[jidx
+1];
889 jnrlistC
= jjnr
[jidx
+2];
890 jnrlistD
= jjnr
[jidx
+3];
891 jnrlistE
= jjnr
[jidx
+4];
892 jnrlistF
= jjnr
[jidx
+5];
893 jnrlistG
= jjnr
[jidx
+6];
894 jnrlistH
= jjnr
[jidx
+7];
895 /* Sign of each element will be negative for non-real atoms.
896 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
897 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries. */
899 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
900 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
902 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
903 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
904 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
905 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
906 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
907 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
908 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
909 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
910 j_coord_offsetA
= DIM
*jnrA
;
911 j_coord_offsetB
= DIM
*jnrB
;
912 j_coord_offsetC
= DIM
*jnrC
;
913 j_coord_offsetD
= DIM
*jnrD
;
914 j_coord_offsetE
= DIM
*jnrE
;
915 j_coord_offsetF
= DIM
*jnrF
;
916 j_coord_offsetG
= DIM
*jnrG
;
917 j_coord_offsetH
= DIM
*jnrH
;
919 /* load j atom coordinates */
920 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
921 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
922 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
923 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
924 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
925 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
927 /* Calculate displacement vector */
928 dx00
= _mm256_sub_ps(ix0
,jx0
);
929 dy00
= _mm256_sub_ps(iy0
,jy0
);
930 dz00
= _mm256_sub_ps(iz0
,jz0
);
931 dx11
= _mm256_sub_ps(ix1
,jx1
);
932 dy11
= _mm256_sub_ps(iy1
,jy1
);
933 dz11
= _mm256_sub_ps(iz1
,jz1
);
934 dx12
= _mm256_sub_ps(ix1
,jx2
);
935 dy12
= _mm256_sub_ps(iy1
,jy2
);
936 dz12
= _mm256_sub_ps(iz1
,jz2
);
937 dx13
= _mm256_sub_ps(ix1
,jx3
);
938 dy13
= _mm256_sub_ps(iy1
,jy3
);
939 dz13
= _mm256_sub_ps(iz1
,jz3
);
940 dx21
= _mm256_sub_ps(ix2
,jx1
);
941 dy21
= _mm256_sub_ps(iy2
,jy1
);
942 dz21
= _mm256_sub_ps(iz2
,jz1
);
943 dx22
= _mm256_sub_ps(ix2
,jx2
);
944 dy22
= _mm256_sub_ps(iy2
,jy2
);
945 dz22
= _mm256_sub_ps(iz2
,jz2
);
946 dx23
= _mm256_sub_ps(ix2
,jx3
);
947 dy23
= _mm256_sub_ps(iy2
,jy3
);
948 dz23
= _mm256_sub_ps(iz2
,jz3
);
949 dx31
= _mm256_sub_ps(ix3
,jx1
);
950 dy31
= _mm256_sub_ps(iy3
,jy1
);
951 dz31
= _mm256_sub_ps(iz3
,jz1
);
952 dx32
= _mm256_sub_ps(ix3
,jx2
);
953 dy32
= _mm256_sub_ps(iy3
,jy2
);
954 dz32
= _mm256_sub_ps(iz3
,jz2
);
955 dx33
= _mm256_sub_ps(ix3
,jx3
);
956 dy33
= _mm256_sub_ps(iy3
,jy3
);
957 dz33
= _mm256_sub_ps(iz3
,jz3
);
959 /* Calculate squared distance and things based on it */
960 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
961 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
962 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
963 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
964 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
965 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
966 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
967 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
968 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
969 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
971 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
972 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
973 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
974 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
975 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
976 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
977 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
978 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
979 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
980 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
982 fjx0
= _mm256_setzero_ps();
983 fjy0
= _mm256_setzero_ps();
984 fjz0
= _mm256_setzero_ps();
985 fjx1
= _mm256_setzero_ps();
986 fjy1
= _mm256_setzero_ps();
987 fjz1
= _mm256_setzero_ps();
988 fjx2
= _mm256_setzero_ps();
989 fjy2
= _mm256_setzero_ps();
990 fjz2
= _mm256_setzero_ps();
991 fjx3
= _mm256_setzero_ps();
992 fjy3
= _mm256_setzero_ps();
993 fjz3
= _mm256_setzero_ps();
995 /**************************
996 * CALCULATE INTERACTIONS *
997 **************************/
999 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1000 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
1002 /* Calculate table index by multiplying r with table scale and truncate to integer */
1003 rt
= _mm256_mul_ps(r00
,vftabscale
);
1004 vfitab
= _mm256_cvttps_epi32(rt
);
1005 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1006 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1007 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1008 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1009 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1010 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1012 /* CUBIC SPLINE TABLE DISPERSION */
1013 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1014 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1015 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1016 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1017 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1018 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1019 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1020 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1021 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1022 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1023 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1024 Heps
= _mm256_mul_ps(vfeps
,H
);
1025 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1026 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1027 vvdw6
= _mm256_mul_ps(c6_00
,VV
);
1028 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1029 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
1031 /* CUBIC SPLINE TABLE REPULSION */
1032 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1033 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1034 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1035 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1036 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1037 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1038 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1039 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1040 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1041 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1042 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1043 Heps
= _mm256_mul_ps(vfeps
,H
);
1044 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1045 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1046 vvdw12
= _mm256_mul_ps(c12_00
,VV
);
1047 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1048 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
1049 vvdw
= _mm256_add_ps(vvdw12
,vvdw6
);
1050 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
1052 /* Update potential sum for this i atom from the interaction with this j atom. */
1053 vvdw
= _mm256_andnot_ps(dummy_mask
,vvdw
);
1054 vvdwsum
= _mm256_add_ps(vvdwsum
,vvdw
);
1058 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1060 /* Calculate temporary vectorial force */
1061 tx
= _mm256_mul_ps(fscal
,dx00
);
1062 ty
= _mm256_mul_ps(fscal
,dy00
);
1063 tz
= _mm256_mul_ps(fscal
,dz00
);
1065 /* Update vectorial force */
1066 fix0
= _mm256_add_ps(fix0
,tx
);
1067 fiy0
= _mm256_add_ps(fiy0
,ty
);
1068 fiz0
= _mm256_add_ps(fiz0
,tz
);
1070 fjx0
= _mm256_add_ps(fjx0
,tx
);
1071 fjy0
= _mm256_add_ps(fjy0
,ty
);
1072 fjz0
= _mm256_add_ps(fjz0
,tz
);
1074 /**************************
1075 * CALCULATE INTERACTIONS *
1076 **************************/
1078 r11
= _mm256_mul_ps(rsq11
,rinv11
);
1079 r11
= _mm256_andnot_ps(dummy_mask
,r11
);
1081 /* Calculate table index by multiplying r with table scale and truncate to integer */
1082 rt
= _mm256_mul_ps(r11
,vftabscale
);
1083 vfitab
= _mm256_cvttps_epi32(rt
);
1084 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1085 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1086 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1087 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1088 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1089 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1091 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1092 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1093 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1094 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1095 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1096 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1097 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1098 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1099 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1100 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1101 Heps
= _mm256_mul_ps(vfeps
,H
);
1102 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1103 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1104 velec
= _mm256_mul_ps(qq11
,VV
);
1105 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1106 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq11
,FF
),_mm256_mul_ps(vftabscale
,rinv11
)));
1108 /* Update potential sum for this i atom from the interaction with this j atom. */
1109 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1110 velecsum
= _mm256_add_ps(velecsum
,velec
);
1114 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1116 /* Calculate temporary vectorial force */
1117 tx
= _mm256_mul_ps(fscal
,dx11
);
1118 ty
= _mm256_mul_ps(fscal
,dy11
);
1119 tz
= _mm256_mul_ps(fscal
,dz11
);
1121 /* Update vectorial force */
1122 fix1
= _mm256_add_ps(fix1
,tx
);
1123 fiy1
= _mm256_add_ps(fiy1
,ty
);
1124 fiz1
= _mm256_add_ps(fiz1
,tz
);
1126 fjx1
= _mm256_add_ps(fjx1
,tx
);
1127 fjy1
= _mm256_add_ps(fjy1
,ty
);
1128 fjz1
= _mm256_add_ps(fjz1
,tz
);
1130 /**************************
1131 * CALCULATE INTERACTIONS *
1132 **************************/
1134 r12
= _mm256_mul_ps(rsq12
,rinv12
);
1135 r12
= _mm256_andnot_ps(dummy_mask
,r12
);
1137 /* Calculate table index by multiplying r with table scale and truncate to integer */
1138 rt
= _mm256_mul_ps(r12
,vftabscale
);
1139 vfitab
= _mm256_cvttps_epi32(rt
);
1140 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1141 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1142 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1143 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1144 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1145 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1147 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1149 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1150 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1151 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1152 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1153 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1154 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1155 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1156 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1157 Heps
= _mm256_mul_ps(vfeps
,H
);
1158 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1159 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1160 velec
= _mm256_mul_ps(qq12
,VV
);
1161 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1162 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq12
,FF
),_mm256_mul_ps(vftabscale
,rinv12
)));
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1166 velecsum
= _mm256_add_ps(velecsum
,velec
);
1170 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1172 /* Calculate temporary vectorial force */
1173 tx
= _mm256_mul_ps(fscal
,dx12
);
1174 ty
= _mm256_mul_ps(fscal
,dy12
);
1175 tz
= _mm256_mul_ps(fscal
,dz12
);
1177 /* Update vectorial force */
1178 fix1
= _mm256_add_ps(fix1
,tx
);
1179 fiy1
= _mm256_add_ps(fiy1
,ty
);
1180 fiz1
= _mm256_add_ps(fiz1
,tz
);
1182 fjx2
= _mm256_add_ps(fjx2
,tx
);
1183 fjy2
= _mm256_add_ps(fjy2
,ty
);
1184 fjz2
= _mm256_add_ps(fjz2
,tz
);
1186 /**************************
1187 * CALCULATE INTERACTIONS *
1188 **************************/
1190 r13
= _mm256_mul_ps(rsq13
,rinv13
);
1191 r13
= _mm256_andnot_ps(dummy_mask
,r13
);
1193 /* Calculate table index by multiplying r with table scale and truncate to integer */
1194 rt
= _mm256_mul_ps(r13
,vftabscale
);
1195 vfitab
= _mm256_cvttps_epi32(rt
);
1196 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1197 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1198 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1199 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1200 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1201 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1203 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1205 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1206 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1207 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1208 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1209 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1210 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1211 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1212 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1213 Heps
= _mm256_mul_ps(vfeps
,H
);
1214 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1215 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1216 velec
= _mm256_mul_ps(qq13
,VV
);
1217 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1218 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq13
,FF
),_mm256_mul_ps(vftabscale
,rinv13
)));
1220 /* Update potential sum for this i atom from the interaction with this j atom. */
1221 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1222 velecsum
= _mm256_add_ps(velecsum
,velec
);
1226 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1228 /* Calculate temporary vectorial force */
1229 tx
= _mm256_mul_ps(fscal
,dx13
);
1230 ty
= _mm256_mul_ps(fscal
,dy13
);
1231 tz
= _mm256_mul_ps(fscal
,dz13
);
1233 /* Update vectorial force */
1234 fix1
= _mm256_add_ps(fix1
,tx
);
1235 fiy1
= _mm256_add_ps(fiy1
,ty
);
1236 fiz1
= _mm256_add_ps(fiz1
,tz
);
1238 fjx3
= _mm256_add_ps(fjx3
,tx
);
1239 fjy3
= _mm256_add_ps(fjy3
,ty
);
1240 fjz3
= _mm256_add_ps(fjz3
,tz
);
1242 /**************************
1243 * CALCULATE INTERACTIONS *
1244 **************************/
1246 r21
= _mm256_mul_ps(rsq21
,rinv21
);
1247 r21
= _mm256_andnot_ps(dummy_mask
,r21
);
1249 /* Calculate table index by multiplying r with table scale and truncate to integer */
1250 rt
= _mm256_mul_ps(r21
,vftabscale
);
1251 vfitab
= _mm256_cvttps_epi32(rt
);
1252 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1253 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1254 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1255 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1256 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1257 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1259 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1260 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1261 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1262 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1263 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1264 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1265 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1266 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1267 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1268 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1269 Heps
= _mm256_mul_ps(vfeps
,H
);
1270 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1271 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1272 velec
= _mm256_mul_ps(qq21
,VV
);
1273 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1274 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq21
,FF
),_mm256_mul_ps(vftabscale
,rinv21
)));
1276 /* Update potential sum for this i atom from the interaction with this j atom. */
1277 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1278 velecsum
= _mm256_add_ps(velecsum
,velec
);
1282 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1284 /* Calculate temporary vectorial force */
1285 tx
= _mm256_mul_ps(fscal
,dx21
);
1286 ty
= _mm256_mul_ps(fscal
,dy21
);
1287 tz
= _mm256_mul_ps(fscal
,dz21
);
1289 /* Update vectorial force */
1290 fix2
= _mm256_add_ps(fix2
,tx
);
1291 fiy2
= _mm256_add_ps(fiy2
,ty
);
1292 fiz2
= _mm256_add_ps(fiz2
,tz
);
1294 fjx1
= _mm256_add_ps(fjx1
,tx
);
1295 fjy1
= _mm256_add_ps(fjy1
,ty
);
1296 fjz1
= _mm256_add_ps(fjz1
,tz
);
1298 /**************************
1299 * CALCULATE INTERACTIONS *
1300 **************************/
1302 r22
= _mm256_mul_ps(rsq22
,rinv22
);
1303 r22
= _mm256_andnot_ps(dummy_mask
,r22
);
1305 /* Calculate table index by multiplying r with table scale and truncate to integer */
1306 rt
= _mm256_mul_ps(r22
,vftabscale
);
1307 vfitab
= _mm256_cvttps_epi32(rt
);
1308 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1309 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1310 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1311 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1312 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1313 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1315 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1316 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1317 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1318 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1319 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1320 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1321 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1322 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1323 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1324 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1325 Heps
= _mm256_mul_ps(vfeps
,H
);
1326 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1327 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1328 velec
= _mm256_mul_ps(qq22
,VV
);
1329 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1330 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq22
,FF
),_mm256_mul_ps(vftabscale
,rinv22
)));
1332 /* Update potential sum for this i atom from the interaction with this j atom. */
1333 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1334 velecsum
= _mm256_add_ps(velecsum
,velec
);
1338 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1340 /* Calculate temporary vectorial force */
1341 tx
= _mm256_mul_ps(fscal
,dx22
);
1342 ty
= _mm256_mul_ps(fscal
,dy22
);
1343 tz
= _mm256_mul_ps(fscal
,dz22
);
1345 /* Update vectorial force */
1346 fix2
= _mm256_add_ps(fix2
,tx
);
1347 fiy2
= _mm256_add_ps(fiy2
,ty
);
1348 fiz2
= _mm256_add_ps(fiz2
,tz
);
1350 fjx2
= _mm256_add_ps(fjx2
,tx
);
1351 fjy2
= _mm256_add_ps(fjy2
,ty
);
1352 fjz2
= _mm256_add_ps(fjz2
,tz
);
1354 /**************************
1355 * CALCULATE INTERACTIONS *
1356 **************************/
1358 r23
= _mm256_mul_ps(rsq23
,rinv23
);
1359 r23
= _mm256_andnot_ps(dummy_mask
,r23
);
1361 /* Calculate table index by multiplying r with table scale and truncate to integer */
1362 rt
= _mm256_mul_ps(r23
,vftabscale
);
1363 vfitab
= _mm256_cvttps_epi32(rt
);
1364 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1365 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1366 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1367 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1368 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1369 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1371 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1372 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1373 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1374 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1375 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1376 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1377 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1378 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1379 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1380 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1381 Heps
= _mm256_mul_ps(vfeps
,H
);
1382 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1383 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1384 velec
= _mm256_mul_ps(qq23
,VV
);
1385 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1386 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq23
,FF
),_mm256_mul_ps(vftabscale
,rinv23
)));
1388 /* Update potential sum for this i atom from the interaction with this j atom. */
1389 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1390 velecsum
= _mm256_add_ps(velecsum
,velec
);
1394 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1396 /* Calculate temporary vectorial force */
1397 tx
= _mm256_mul_ps(fscal
,dx23
);
1398 ty
= _mm256_mul_ps(fscal
,dy23
);
1399 tz
= _mm256_mul_ps(fscal
,dz23
);
1401 /* Update vectorial force */
1402 fix2
= _mm256_add_ps(fix2
,tx
);
1403 fiy2
= _mm256_add_ps(fiy2
,ty
);
1404 fiz2
= _mm256_add_ps(fiz2
,tz
);
1406 fjx3
= _mm256_add_ps(fjx3
,tx
);
1407 fjy3
= _mm256_add_ps(fjy3
,ty
);
1408 fjz3
= _mm256_add_ps(fjz3
,tz
);
1410 /**************************
1411 * CALCULATE INTERACTIONS *
1412 **************************/
1414 r31
= _mm256_mul_ps(rsq31
,rinv31
);
1415 r31
= _mm256_andnot_ps(dummy_mask
,r31
);
1417 /* Calculate table index by multiplying r with table scale and truncate to integer */
1418 rt
= _mm256_mul_ps(r31
,vftabscale
);
1419 vfitab
= _mm256_cvttps_epi32(rt
);
1420 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1421 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1422 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1423 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1424 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1425 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1427 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1428 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1429 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1430 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1431 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1432 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1433 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1434 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1435 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1436 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1437 Heps
= _mm256_mul_ps(vfeps
,H
);
1438 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1439 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1440 velec
= _mm256_mul_ps(qq31
,VV
);
1441 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1442 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq31
,FF
),_mm256_mul_ps(vftabscale
,rinv31
)));
1444 /* Update potential sum for this i atom from the interaction with this j atom. */
1445 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1446 velecsum
= _mm256_add_ps(velecsum
,velec
);
1450 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1452 /* Calculate temporary vectorial force */
1453 tx
= _mm256_mul_ps(fscal
,dx31
);
1454 ty
= _mm256_mul_ps(fscal
,dy31
);
1455 tz
= _mm256_mul_ps(fscal
,dz31
);
1457 /* Update vectorial force */
1458 fix3
= _mm256_add_ps(fix3
,tx
);
1459 fiy3
= _mm256_add_ps(fiy3
,ty
);
1460 fiz3
= _mm256_add_ps(fiz3
,tz
);
1462 fjx1
= _mm256_add_ps(fjx1
,tx
);
1463 fjy1
= _mm256_add_ps(fjy1
,ty
);
1464 fjz1
= _mm256_add_ps(fjz1
,tz
);
1466 /**************************
1467 * CALCULATE INTERACTIONS *
1468 **************************/
1470 r32
= _mm256_mul_ps(rsq32
,rinv32
);
1471 r32
= _mm256_andnot_ps(dummy_mask
,r32
);
1473 /* Calculate table index by multiplying r with table scale and truncate to integer */
1474 rt
= _mm256_mul_ps(r32
,vftabscale
);
1475 vfitab
= _mm256_cvttps_epi32(rt
);
1476 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1477 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1478 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1479 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1480 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1481 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1483 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1484 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1485 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1486 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1487 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1488 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1489 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1490 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1491 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1492 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1493 Heps
= _mm256_mul_ps(vfeps
,H
);
1494 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1495 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1496 velec
= _mm256_mul_ps(qq32
,VV
);
1497 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1498 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq32
,FF
),_mm256_mul_ps(vftabscale
,rinv32
)));
1500 /* Update potential sum for this i atom from the interaction with this j atom. */
1501 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1502 velecsum
= _mm256_add_ps(velecsum
,velec
);
1506 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1508 /* Calculate temporary vectorial force */
1509 tx
= _mm256_mul_ps(fscal
,dx32
);
1510 ty
= _mm256_mul_ps(fscal
,dy32
);
1511 tz
= _mm256_mul_ps(fscal
,dz32
);
1513 /* Update vectorial force */
1514 fix3
= _mm256_add_ps(fix3
,tx
);
1515 fiy3
= _mm256_add_ps(fiy3
,ty
);
1516 fiz3
= _mm256_add_ps(fiz3
,tz
);
1518 fjx2
= _mm256_add_ps(fjx2
,tx
);
1519 fjy2
= _mm256_add_ps(fjy2
,ty
);
1520 fjz2
= _mm256_add_ps(fjz2
,tz
);
1522 /**************************
1523 * CALCULATE INTERACTIONS *
1524 **************************/
1526 r33
= _mm256_mul_ps(rsq33
,rinv33
);
1527 r33
= _mm256_andnot_ps(dummy_mask
,r33
);
1529 /* Calculate table index by multiplying r with table scale and truncate to integer */
1530 rt
= _mm256_mul_ps(r33
,vftabscale
);
1531 vfitab
= _mm256_cvttps_epi32(rt
);
1532 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1533 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1534 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1535 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1536 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1537 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1539 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1540 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1541 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1542 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1543 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1544 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1545 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1546 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1547 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1548 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1549 Heps
= _mm256_mul_ps(vfeps
,H
);
1550 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1551 VV
= _mm256_add_ps(Y
,_mm256_mul_ps(vfeps
,Fp
));
1552 velec
= _mm256_mul_ps(qq33
,VV
);
1553 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1554 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq33
,FF
),_mm256_mul_ps(vftabscale
,rinv33
)));
1556 /* Update potential sum for this i atom from the interaction with this j atom. */
1557 velec
= _mm256_andnot_ps(dummy_mask
,velec
);
1558 velecsum
= _mm256_add_ps(velecsum
,velec
);
1562 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
1564 /* Calculate temporary vectorial force */
1565 tx
= _mm256_mul_ps(fscal
,dx33
);
1566 ty
= _mm256_mul_ps(fscal
,dy33
);
1567 tz
= _mm256_mul_ps(fscal
,dz33
);
1569 /* Update vectorial force */
1570 fix3
= _mm256_add_ps(fix3
,tx
);
1571 fiy3
= _mm256_add_ps(fiy3
,ty
);
1572 fiz3
= _mm256_add_ps(fiz3
,tz
);
1574 fjx3
= _mm256_add_ps(fjx3
,tx
);
1575 fjy3
= _mm256_add_ps(fjy3
,ty
);
1576 fjz3
= _mm256_add_ps(fjz3
,tz
);
1578 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1579 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1580 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1581 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1582 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
1583 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
1584 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
1585 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
1587 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
1588 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1589 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1591 /* Inner loop uses 456 flops */
1594 /* End of innermost loop */
1596 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1597 f
+i_coord_offset
,fshift
+i_shift_offset
);
1600 /* Update potential energies */
1601 gmx_mm256_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1602 gmx_mm256_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1604 /* Increment number of inner iterations */
1605 inneriter
+= j_index_end
- j_index_start
;
1607 /* Outer loop uses 26 flops */
1610 /* Increment number of outer iterations */
1613 /* Update outer/inner flops */
1615 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*456);
1618 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1619 * Electrostatics interaction: CubicSplineTable
1620 * VdW interaction: CubicSplineTable
1621 * Geometry: Water4-Water4
1622 * Calculate force/pot: Force
1625 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1626 (t_nblist
* gmx_restrict nlist
,
1627 rvec
* gmx_restrict xx
,
1628 rvec
* gmx_restrict ff
,
1629 t_forcerec
* gmx_restrict fr
,
1630 t_mdatoms
* gmx_restrict mdatoms
,
1631 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1632 t_nrnb
* gmx_restrict nrnb
)
1634 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1635 * just 0 for non-waters.
1636 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1637 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1639 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1640 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1641 int jnrA
,jnrB
,jnrC
,jnrD
;
1642 int jnrE
,jnrF
,jnrG
,jnrH
;
1643 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1644 int jnrlistE
,jnrlistF
,jnrlistG
,jnrlistH
;
1645 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1646 int j_coord_offsetE
,j_coord_offsetF
,j_coord_offsetG
,j_coord_offsetH
;
1647 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1648 real rcutoff_scalar
;
1649 real
*shiftvec
,*fshift
,*x
,*f
;
1650 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
,*fjptrE
,*fjptrF
,*fjptrG
,*fjptrH
;
1651 real scratch
[4*DIM
];
1652 __m256 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1653 real
* vdwioffsetptr0
;
1654 __m256 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1655 real
* vdwioffsetptr1
;
1656 __m256 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1657 real
* vdwioffsetptr2
;
1658 __m256 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1659 real
* vdwioffsetptr3
;
1660 __m256 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1661 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
,vdwjidx0E
,vdwjidx0F
,vdwjidx0G
,vdwjidx0H
;
1662 __m256 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1663 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
,vdwjidx1E
,vdwjidx1F
,vdwjidx1G
,vdwjidx1H
;
1664 __m256 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1665 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
,vdwjidx2E
,vdwjidx2F
,vdwjidx2G
,vdwjidx2H
;
1666 __m256 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1667 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
,vdwjidx3E
,vdwjidx3F
,vdwjidx3G
,vdwjidx3H
;
1668 __m256 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1669 __m256 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1670 __m256 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1671 __m256 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1672 __m256 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1673 __m256 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1674 __m256 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1675 __m256 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1676 __m256 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1677 __m256 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1678 __m256 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1679 __m256 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1682 __m256 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1685 __m256 one_sixth
= _mm256_set1_ps(1.0/6.0);
1686 __m256 one_twelfth
= _mm256_set1_ps(1.0/12.0);
1688 __m128i vfitab_lo
,vfitab_hi
;
1689 __m128i ifour
= _mm_set1_epi32(4);
1690 __m256 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1692 __m256 dummy_mask
,cutoff_mask
;
1693 __m256 signbit
= _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1694 __m256 one
= _mm256_set1_ps(1.0);
1695 __m256 two
= _mm256_set1_ps(2.0);
1701 jindex
= nlist
->jindex
;
1703 shiftidx
= nlist
->shift
;
1705 shiftvec
= fr
->shift_vec
[0];
1706 fshift
= fr
->fshift
[0];
1707 facel
= _mm256_set1_ps(fr
->epsfac
);
1708 charge
= mdatoms
->chargeA
;
1709 nvdwtype
= fr
->ntype
;
1710 vdwparam
= fr
->nbfp
;
1711 vdwtype
= mdatoms
->typeA
;
1713 vftab
= kernel_data
->table_elec_vdw
->data
;
1714 vftabscale
= _mm256_set1_ps(kernel_data
->table_elec_vdw
->scale
);
1716 /* Setup water-specific parameters */
1717 inr
= nlist
->iinr
[0];
1718 iq1
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+1]));
1719 iq2
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+2]));
1720 iq3
= _mm256_mul_ps(facel
,_mm256_set1_ps(charge
[inr
+3]));
1721 vdwioffsetptr0
= vdwparam
+2*nvdwtype
*vdwtype
[inr
+0];
1723 jq1
= _mm256_set1_ps(charge
[inr
+1]);
1724 jq2
= _mm256_set1_ps(charge
[inr
+2]);
1725 jq3
= _mm256_set1_ps(charge
[inr
+3]);
1726 vdwjidx0A
= 2*vdwtype
[inr
+0];
1727 c6_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
]);
1728 c12_00
= _mm256_set1_ps(vdwioffsetptr0
[vdwjidx0A
+1]);
1729 qq11
= _mm256_mul_ps(iq1
,jq1
);
1730 qq12
= _mm256_mul_ps(iq1
,jq2
);
1731 qq13
= _mm256_mul_ps(iq1
,jq3
);
1732 qq21
= _mm256_mul_ps(iq2
,jq1
);
1733 qq22
= _mm256_mul_ps(iq2
,jq2
);
1734 qq23
= _mm256_mul_ps(iq2
,jq3
);
1735 qq31
= _mm256_mul_ps(iq3
,jq1
);
1736 qq32
= _mm256_mul_ps(iq3
,jq2
);
1737 qq33
= _mm256_mul_ps(iq3
,jq3
);
1739 /* Avoid stupid compiler warnings */
1740 jnrA
= jnrB
= jnrC
= jnrD
= jnrE
= jnrF
= jnrG
= jnrH
= 0;
1741 j_coord_offsetA
= 0;
1742 j_coord_offsetB
= 0;
1743 j_coord_offsetC
= 0;
1744 j_coord_offsetD
= 0;
1745 j_coord_offsetE
= 0;
1746 j_coord_offsetF
= 0;
1747 j_coord_offsetG
= 0;
1748 j_coord_offsetH
= 0;
1753 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1755 scratch
[iidx
] = 0.0;
1758 /* Start outer loop over neighborlists */
1759 for(iidx
=0; iidx
<nri
; iidx
++)
1761 /* Load shift vector for this list */
1762 i_shift_offset
= DIM
*shiftidx
[iidx
];
1764 /* Load limits for loop over neighbors */
1765 j_index_start
= jindex
[iidx
];
1766 j_index_end
= jindex
[iidx
+1];
1768 /* Get outer coordinate index */
1770 i_coord_offset
= DIM
*inr
;
1772 /* Load i particle coords and add shift vector */
1773 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1774 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1776 fix0
= _mm256_setzero_ps();
1777 fiy0
= _mm256_setzero_ps();
1778 fiz0
= _mm256_setzero_ps();
1779 fix1
= _mm256_setzero_ps();
1780 fiy1
= _mm256_setzero_ps();
1781 fiz1
= _mm256_setzero_ps();
1782 fix2
= _mm256_setzero_ps();
1783 fiy2
= _mm256_setzero_ps();
1784 fiz2
= _mm256_setzero_ps();
1785 fix3
= _mm256_setzero_ps();
1786 fiy3
= _mm256_setzero_ps();
1787 fiz3
= _mm256_setzero_ps();
1789 /* Start inner kernel loop */
1790 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+7]>=0; jidx
+=8)
1793 /* Get j neighbor index, and coordinate index */
1795 jnrB
= jjnr
[jidx
+1];
1796 jnrC
= jjnr
[jidx
+2];
1797 jnrD
= jjnr
[jidx
+3];
1798 jnrE
= jjnr
[jidx
+4];
1799 jnrF
= jjnr
[jidx
+5];
1800 jnrG
= jjnr
[jidx
+6];
1801 jnrH
= jjnr
[jidx
+7];
1802 j_coord_offsetA
= DIM
*jnrA
;
1803 j_coord_offsetB
= DIM
*jnrB
;
1804 j_coord_offsetC
= DIM
*jnrC
;
1805 j_coord_offsetD
= DIM
*jnrD
;
1806 j_coord_offsetE
= DIM
*jnrE
;
1807 j_coord_offsetF
= DIM
*jnrF
;
1808 j_coord_offsetG
= DIM
*jnrG
;
1809 j_coord_offsetH
= DIM
*jnrH
;
1811 /* load j atom coordinates */
1812 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1813 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1814 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
1815 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
1816 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1817 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1819 /* Calculate displacement vector */
1820 dx00
= _mm256_sub_ps(ix0
,jx0
);
1821 dy00
= _mm256_sub_ps(iy0
,jy0
);
1822 dz00
= _mm256_sub_ps(iz0
,jz0
);
1823 dx11
= _mm256_sub_ps(ix1
,jx1
);
1824 dy11
= _mm256_sub_ps(iy1
,jy1
);
1825 dz11
= _mm256_sub_ps(iz1
,jz1
);
1826 dx12
= _mm256_sub_ps(ix1
,jx2
);
1827 dy12
= _mm256_sub_ps(iy1
,jy2
);
1828 dz12
= _mm256_sub_ps(iz1
,jz2
);
1829 dx13
= _mm256_sub_ps(ix1
,jx3
);
1830 dy13
= _mm256_sub_ps(iy1
,jy3
);
1831 dz13
= _mm256_sub_ps(iz1
,jz3
);
1832 dx21
= _mm256_sub_ps(ix2
,jx1
);
1833 dy21
= _mm256_sub_ps(iy2
,jy1
);
1834 dz21
= _mm256_sub_ps(iz2
,jz1
);
1835 dx22
= _mm256_sub_ps(ix2
,jx2
);
1836 dy22
= _mm256_sub_ps(iy2
,jy2
);
1837 dz22
= _mm256_sub_ps(iz2
,jz2
);
1838 dx23
= _mm256_sub_ps(ix2
,jx3
);
1839 dy23
= _mm256_sub_ps(iy2
,jy3
);
1840 dz23
= _mm256_sub_ps(iz2
,jz3
);
1841 dx31
= _mm256_sub_ps(ix3
,jx1
);
1842 dy31
= _mm256_sub_ps(iy3
,jy1
);
1843 dz31
= _mm256_sub_ps(iz3
,jz1
);
1844 dx32
= _mm256_sub_ps(ix3
,jx2
);
1845 dy32
= _mm256_sub_ps(iy3
,jy2
);
1846 dz32
= _mm256_sub_ps(iz3
,jz2
);
1847 dx33
= _mm256_sub_ps(ix3
,jx3
);
1848 dy33
= _mm256_sub_ps(iy3
,jy3
);
1849 dz33
= _mm256_sub_ps(iz3
,jz3
);
1851 /* Calculate squared distance and things based on it */
1852 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
1853 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
1854 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
1855 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
1856 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
1857 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
1858 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
1859 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
1860 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
1861 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
1863 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
1864 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
1865 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
1866 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
1867 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
1868 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
1869 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
1870 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
1871 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
1872 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
1874 fjx0
= _mm256_setzero_ps();
1875 fjy0
= _mm256_setzero_ps();
1876 fjz0
= _mm256_setzero_ps();
1877 fjx1
= _mm256_setzero_ps();
1878 fjy1
= _mm256_setzero_ps();
1879 fjz1
= _mm256_setzero_ps();
1880 fjx2
= _mm256_setzero_ps();
1881 fjy2
= _mm256_setzero_ps();
1882 fjz2
= _mm256_setzero_ps();
1883 fjx3
= _mm256_setzero_ps();
1884 fjy3
= _mm256_setzero_ps();
1885 fjz3
= _mm256_setzero_ps();
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 r00
= _mm256_mul_ps(rsq00
,rinv00
);
1893 /* Calculate table index by multiplying r with table scale and truncate to integer */
1894 rt
= _mm256_mul_ps(r00
,vftabscale
);
1895 vfitab
= _mm256_cvttps_epi32(rt
);
1896 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1897 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1898 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1899 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1900 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1901 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1903 /* CUBIC SPLINE TABLE DISPERSION */
1904 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1905 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1906 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1907 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1908 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1909 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1910 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1911 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1912 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1913 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1914 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1915 Heps
= _mm256_mul_ps(vfeps
,H
);
1916 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1917 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1918 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
1920 /* CUBIC SPLINE TABLE REPULSION */
1921 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
1922 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
1923 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1924 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1925 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1926 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1927 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1928 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1929 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1930 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1931 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1932 Heps
= _mm256_mul_ps(vfeps
,H
);
1933 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1934 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1935 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
1936 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
1940 /* Calculate temporary vectorial force */
1941 tx
= _mm256_mul_ps(fscal
,dx00
);
1942 ty
= _mm256_mul_ps(fscal
,dy00
);
1943 tz
= _mm256_mul_ps(fscal
,dz00
);
1945 /* Update vectorial force */
1946 fix0
= _mm256_add_ps(fix0
,tx
);
1947 fiy0
= _mm256_add_ps(fiy0
,ty
);
1948 fiz0
= _mm256_add_ps(fiz0
,tz
);
1950 fjx0
= _mm256_add_ps(fjx0
,tx
);
1951 fjy0
= _mm256_add_ps(fjy0
,ty
);
1952 fjz0
= _mm256_add_ps(fjz0
,tz
);
1954 /**************************
1955 * CALCULATE INTERACTIONS *
1956 **************************/
1958 r11
= _mm256_mul_ps(rsq11
,rinv11
);
1960 /* Calculate table index by multiplying r with table scale and truncate to integer */
1961 rt
= _mm256_mul_ps(r11
,vftabscale
);
1962 vfitab
= _mm256_cvttps_epi32(rt
);
1963 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
1964 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1965 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
1966 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
1967 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
1968 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
1970 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1971 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
1972 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
1973 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
1974 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
1975 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
1976 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
1977 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
1978 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
1979 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
1980 Heps
= _mm256_mul_ps(vfeps
,H
);
1981 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
1982 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
1983 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq11
,FF
),_mm256_mul_ps(vftabscale
,rinv11
)));
1987 /* Calculate temporary vectorial force */
1988 tx
= _mm256_mul_ps(fscal
,dx11
);
1989 ty
= _mm256_mul_ps(fscal
,dy11
);
1990 tz
= _mm256_mul_ps(fscal
,dz11
);
1992 /* Update vectorial force */
1993 fix1
= _mm256_add_ps(fix1
,tx
);
1994 fiy1
= _mm256_add_ps(fiy1
,ty
);
1995 fiz1
= _mm256_add_ps(fiz1
,tz
);
1997 fjx1
= _mm256_add_ps(fjx1
,tx
);
1998 fjy1
= _mm256_add_ps(fjy1
,ty
);
1999 fjz1
= _mm256_add_ps(fjz1
,tz
);
2001 /**************************
2002 * CALCULATE INTERACTIONS *
2003 **************************/
2005 r12
= _mm256_mul_ps(rsq12
,rinv12
);
2007 /* Calculate table index by multiplying r with table scale and truncate to integer */
2008 rt
= _mm256_mul_ps(r12
,vftabscale
);
2009 vfitab
= _mm256_cvttps_epi32(rt
);
2010 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2011 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2012 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2013 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2014 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2015 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2017 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2018 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2019 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2020 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2021 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2022 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2023 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2024 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2025 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2026 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2027 Heps
= _mm256_mul_ps(vfeps
,H
);
2028 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2029 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2030 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq12
,FF
),_mm256_mul_ps(vftabscale
,rinv12
)));
2034 /* Calculate temporary vectorial force */
2035 tx
= _mm256_mul_ps(fscal
,dx12
);
2036 ty
= _mm256_mul_ps(fscal
,dy12
);
2037 tz
= _mm256_mul_ps(fscal
,dz12
);
2039 /* Update vectorial force */
2040 fix1
= _mm256_add_ps(fix1
,tx
);
2041 fiy1
= _mm256_add_ps(fiy1
,ty
);
2042 fiz1
= _mm256_add_ps(fiz1
,tz
);
2044 fjx2
= _mm256_add_ps(fjx2
,tx
);
2045 fjy2
= _mm256_add_ps(fjy2
,ty
);
2046 fjz2
= _mm256_add_ps(fjz2
,tz
);
2048 /**************************
2049 * CALCULATE INTERACTIONS *
2050 **************************/
2052 r13
= _mm256_mul_ps(rsq13
,rinv13
);
2054 /* Calculate table index by multiplying r with table scale and truncate to integer */
2055 rt
= _mm256_mul_ps(r13
,vftabscale
);
2056 vfitab
= _mm256_cvttps_epi32(rt
);
2057 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2058 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2059 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2060 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2061 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2062 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2064 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2065 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2066 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2067 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2068 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2069 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2070 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2071 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2072 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2073 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2074 Heps
= _mm256_mul_ps(vfeps
,H
);
2075 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2076 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2077 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq13
,FF
),_mm256_mul_ps(vftabscale
,rinv13
)));
2081 /* Calculate temporary vectorial force */
2082 tx
= _mm256_mul_ps(fscal
,dx13
);
2083 ty
= _mm256_mul_ps(fscal
,dy13
);
2084 tz
= _mm256_mul_ps(fscal
,dz13
);
2086 /* Update vectorial force */
2087 fix1
= _mm256_add_ps(fix1
,tx
);
2088 fiy1
= _mm256_add_ps(fiy1
,ty
);
2089 fiz1
= _mm256_add_ps(fiz1
,tz
);
2091 fjx3
= _mm256_add_ps(fjx3
,tx
);
2092 fjy3
= _mm256_add_ps(fjy3
,ty
);
2093 fjz3
= _mm256_add_ps(fjz3
,tz
);
2095 /**************************
2096 * CALCULATE INTERACTIONS *
2097 **************************/
2099 r21
= _mm256_mul_ps(rsq21
,rinv21
);
2101 /* Calculate table index by multiplying r with table scale and truncate to integer */
2102 rt
= _mm256_mul_ps(r21
,vftabscale
);
2103 vfitab
= _mm256_cvttps_epi32(rt
);
2104 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2105 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2106 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2107 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2108 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2109 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2111 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2112 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2113 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2114 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2115 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2116 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2117 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2118 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2119 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2120 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2121 Heps
= _mm256_mul_ps(vfeps
,H
);
2122 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2123 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2124 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq21
,FF
),_mm256_mul_ps(vftabscale
,rinv21
)));
2128 /* Calculate temporary vectorial force */
2129 tx
= _mm256_mul_ps(fscal
,dx21
);
2130 ty
= _mm256_mul_ps(fscal
,dy21
);
2131 tz
= _mm256_mul_ps(fscal
,dz21
);
2133 /* Update vectorial force */
2134 fix2
= _mm256_add_ps(fix2
,tx
);
2135 fiy2
= _mm256_add_ps(fiy2
,ty
);
2136 fiz2
= _mm256_add_ps(fiz2
,tz
);
2138 fjx1
= _mm256_add_ps(fjx1
,tx
);
2139 fjy1
= _mm256_add_ps(fjy1
,ty
);
2140 fjz1
= _mm256_add_ps(fjz1
,tz
);
2142 /**************************
2143 * CALCULATE INTERACTIONS *
2144 **************************/
2146 r22
= _mm256_mul_ps(rsq22
,rinv22
);
2148 /* Calculate table index by multiplying r with table scale and truncate to integer */
2149 rt
= _mm256_mul_ps(r22
,vftabscale
);
2150 vfitab
= _mm256_cvttps_epi32(rt
);
2151 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2152 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2153 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2154 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2155 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2156 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2158 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2159 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2160 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2161 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2162 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2163 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2164 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2165 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2166 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2167 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2168 Heps
= _mm256_mul_ps(vfeps
,H
);
2169 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2170 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2171 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq22
,FF
),_mm256_mul_ps(vftabscale
,rinv22
)));
2175 /* Calculate temporary vectorial force */
2176 tx
= _mm256_mul_ps(fscal
,dx22
);
2177 ty
= _mm256_mul_ps(fscal
,dy22
);
2178 tz
= _mm256_mul_ps(fscal
,dz22
);
2180 /* Update vectorial force */
2181 fix2
= _mm256_add_ps(fix2
,tx
);
2182 fiy2
= _mm256_add_ps(fiy2
,ty
);
2183 fiz2
= _mm256_add_ps(fiz2
,tz
);
2185 fjx2
= _mm256_add_ps(fjx2
,tx
);
2186 fjy2
= _mm256_add_ps(fjy2
,ty
);
2187 fjz2
= _mm256_add_ps(fjz2
,tz
);
2189 /**************************
2190 * CALCULATE INTERACTIONS *
2191 **************************/
2193 r23
= _mm256_mul_ps(rsq23
,rinv23
);
2195 /* Calculate table index by multiplying r with table scale and truncate to integer */
2196 rt
= _mm256_mul_ps(r23
,vftabscale
);
2197 vfitab
= _mm256_cvttps_epi32(rt
);
2198 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2199 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2200 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2201 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2202 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2203 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2206 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2207 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2208 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2209 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2210 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2211 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2212 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2213 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2214 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2215 Heps
= _mm256_mul_ps(vfeps
,H
);
2216 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2217 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2218 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq23
,FF
),_mm256_mul_ps(vftabscale
,rinv23
)));
2222 /* Calculate temporary vectorial force */
2223 tx
= _mm256_mul_ps(fscal
,dx23
);
2224 ty
= _mm256_mul_ps(fscal
,dy23
);
2225 tz
= _mm256_mul_ps(fscal
,dz23
);
2227 /* Update vectorial force */
2228 fix2
= _mm256_add_ps(fix2
,tx
);
2229 fiy2
= _mm256_add_ps(fiy2
,ty
);
2230 fiz2
= _mm256_add_ps(fiz2
,tz
);
2232 fjx3
= _mm256_add_ps(fjx3
,tx
);
2233 fjy3
= _mm256_add_ps(fjy3
,ty
);
2234 fjz3
= _mm256_add_ps(fjz3
,tz
);
2236 /**************************
2237 * CALCULATE INTERACTIONS *
2238 **************************/
2240 r31
= _mm256_mul_ps(rsq31
,rinv31
);
2242 /* Calculate table index by multiplying r with table scale and truncate to integer */
2243 rt
= _mm256_mul_ps(r31
,vftabscale
);
2244 vfitab
= _mm256_cvttps_epi32(rt
);
2245 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2246 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2247 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2248 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2249 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2250 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2252 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2253 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2254 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2255 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2256 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2257 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2258 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2259 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2260 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2261 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2262 Heps
= _mm256_mul_ps(vfeps
,H
);
2263 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2264 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2265 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq31
,FF
),_mm256_mul_ps(vftabscale
,rinv31
)));
2269 /* Calculate temporary vectorial force */
2270 tx
= _mm256_mul_ps(fscal
,dx31
);
2271 ty
= _mm256_mul_ps(fscal
,dy31
);
2272 tz
= _mm256_mul_ps(fscal
,dz31
);
2274 /* Update vectorial force */
2275 fix3
= _mm256_add_ps(fix3
,tx
);
2276 fiy3
= _mm256_add_ps(fiy3
,ty
);
2277 fiz3
= _mm256_add_ps(fiz3
,tz
);
2279 fjx1
= _mm256_add_ps(fjx1
,tx
);
2280 fjy1
= _mm256_add_ps(fjy1
,ty
);
2281 fjz1
= _mm256_add_ps(fjz1
,tz
);
2283 /**************************
2284 * CALCULATE INTERACTIONS *
2285 **************************/
2287 r32
= _mm256_mul_ps(rsq32
,rinv32
);
2289 /* Calculate table index by multiplying r with table scale and truncate to integer */
2290 rt
= _mm256_mul_ps(r32
,vftabscale
);
2291 vfitab
= _mm256_cvttps_epi32(rt
);
2292 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2293 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2294 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2295 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2296 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2297 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2299 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2300 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2301 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2302 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2303 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2304 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2305 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2306 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2307 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2308 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2309 Heps
= _mm256_mul_ps(vfeps
,H
);
2310 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2311 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2312 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq32
,FF
),_mm256_mul_ps(vftabscale
,rinv32
)));
2316 /* Calculate temporary vectorial force */
2317 tx
= _mm256_mul_ps(fscal
,dx32
);
2318 ty
= _mm256_mul_ps(fscal
,dy32
);
2319 tz
= _mm256_mul_ps(fscal
,dz32
);
2321 /* Update vectorial force */
2322 fix3
= _mm256_add_ps(fix3
,tx
);
2323 fiy3
= _mm256_add_ps(fiy3
,ty
);
2324 fiz3
= _mm256_add_ps(fiz3
,tz
);
2326 fjx2
= _mm256_add_ps(fjx2
,tx
);
2327 fjy2
= _mm256_add_ps(fjy2
,ty
);
2328 fjz2
= _mm256_add_ps(fjz2
,tz
);
2330 /**************************
2331 * CALCULATE INTERACTIONS *
2332 **************************/
2334 r33
= _mm256_mul_ps(rsq33
,rinv33
);
2336 /* Calculate table index by multiplying r with table scale and truncate to integer */
2337 rt
= _mm256_mul_ps(r33
,vftabscale
);
2338 vfitab
= _mm256_cvttps_epi32(rt
);
2339 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2340 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2341 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2342 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2343 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2344 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2346 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2347 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2348 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2349 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2350 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2351 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2352 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2353 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2354 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2355 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2356 Heps
= _mm256_mul_ps(vfeps
,H
);
2357 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2358 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2359 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq33
,FF
),_mm256_mul_ps(vftabscale
,rinv33
)));
2363 /* Calculate temporary vectorial force */
2364 tx
= _mm256_mul_ps(fscal
,dx33
);
2365 ty
= _mm256_mul_ps(fscal
,dy33
);
2366 tz
= _mm256_mul_ps(fscal
,dz33
);
2368 /* Update vectorial force */
2369 fix3
= _mm256_add_ps(fix3
,tx
);
2370 fiy3
= _mm256_add_ps(fiy3
,ty
);
2371 fiz3
= _mm256_add_ps(fiz3
,tz
);
2373 fjx3
= _mm256_add_ps(fjx3
,tx
);
2374 fjy3
= _mm256_add_ps(fjy3
,ty
);
2375 fjz3
= _mm256_add_ps(fjz3
,tz
);
2377 fjptrA
= f
+j_coord_offsetA
;
2378 fjptrB
= f
+j_coord_offsetB
;
2379 fjptrC
= f
+j_coord_offsetC
;
2380 fjptrD
= f
+j_coord_offsetD
;
2381 fjptrE
= f
+j_coord_offsetE
;
2382 fjptrF
= f
+j_coord_offsetF
;
2383 fjptrG
= f
+j_coord_offsetG
;
2384 fjptrH
= f
+j_coord_offsetH
;
2386 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
2387 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2388 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2390 /* Inner loop uses 402 flops */
2393 if(jidx
<j_index_end
)
2396 /* Get j neighbor index, and coordinate index */
2397 jnrlistA
= jjnr
[jidx
];
2398 jnrlistB
= jjnr
[jidx
+1];
2399 jnrlistC
= jjnr
[jidx
+2];
2400 jnrlistD
= jjnr
[jidx
+3];
2401 jnrlistE
= jjnr
[jidx
+4];
2402 jnrlistF
= jjnr
[jidx
+5];
2403 jnrlistG
= jjnr
[jidx
+6];
2404 jnrlistH
= jjnr
[jidx
+7];
2405 /* Sign of each element will be negative for non-real atoms.
2406 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2407 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2409 dummy_mask
= gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
+4)),_mm_setzero_si128())),
2410 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128())));
2412 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2413 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2414 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2415 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2416 jnrE
= (jnrlistE
>=0) ? jnrlistE
: 0;
2417 jnrF
= (jnrlistF
>=0) ? jnrlistF
: 0;
2418 jnrG
= (jnrlistG
>=0) ? jnrlistG
: 0;
2419 jnrH
= (jnrlistH
>=0) ? jnrlistH
: 0;
2420 j_coord_offsetA
= DIM
*jnrA
;
2421 j_coord_offsetB
= DIM
*jnrB
;
2422 j_coord_offsetC
= DIM
*jnrC
;
2423 j_coord_offsetD
= DIM
*jnrD
;
2424 j_coord_offsetE
= DIM
*jnrE
;
2425 j_coord_offsetF
= DIM
*jnrF
;
2426 j_coord_offsetG
= DIM
*jnrG
;
2427 j_coord_offsetH
= DIM
*jnrH
;
2429 /* load j atom coordinates */
2430 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2431 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2432 x
+j_coord_offsetE
,x
+j_coord_offsetF
,
2433 x
+j_coord_offsetG
,x
+j_coord_offsetH
,
2434 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2435 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2437 /* Calculate displacement vector */
2438 dx00
= _mm256_sub_ps(ix0
,jx0
);
2439 dy00
= _mm256_sub_ps(iy0
,jy0
);
2440 dz00
= _mm256_sub_ps(iz0
,jz0
);
2441 dx11
= _mm256_sub_ps(ix1
,jx1
);
2442 dy11
= _mm256_sub_ps(iy1
,jy1
);
2443 dz11
= _mm256_sub_ps(iz1
,jz1
);
2444 dx12
= _mm256_sub_ps(ix1
,jx2
);
2445 dy12
= _mm256_sub_ps(iy1
,jy2
);
2446 dz12
= _mm256_sub_ps(iz1
,jz2
);
2447 dx13
= _mm256_sub_ps(ix1
,jx3
);
2448 dy13
= _mm256_sub_ps(iy1
,jy3
);
2449 dz13
= _mm256_sub_ps(iz1
,jz3
);
2450 dx21
= _mm256_sub_ps(ix2
,jx1
);
2451 dy21
= _mm256_sub_ps(iy2
,jy1
);
2452 dz21
= _mm256_sub_ps(iz2
,jz1
);
2453 dx22
= _mm256_sub_ps(ix2
,jx2
);
2454 dy22
= _mm256_sub_ps(iy2
,jy2
);
2455 dz22
= _mm256_sub_ps(iz2
,jz2
);
2456 dx23
= _mm256_sub_ps(ix2
,jx3
);
2457 dy23
= _mm256_sub_ps(iy2
,jy3
);
2458 dz23
= _mm256_sub_ps(iz2
,jz3
);
2459 dx31
= _mm256_sub_ps(ix3
,jx1
);
2460 dy31
= _mm256_sub_ps(iy3
,jy1
);
2461 dz31
= _mm256_sub_ps(iz3
,jz1
);
2462 dx32
= _mm256_sub_ps(ix3
,jx2
);
2463 dy32
= _mm256_sub_ps(iy3
,jy2
);
2464 dz32
= _mm256_sub_ps(iz3
,jz2
);
2465 dx33
= _mm256_sub_ps(ix3
,jx3
);
2466 dy33
= _mm256_sub_ps(iy3
,jy3
);
2467 dz33
= _mm256_sub_ps(iz3
,jz3
);
2469 /* Calculate squared distance and things based on it */
2470 rsq00
= gmx_mm256_calc_rsq_ps(dx00
,dy00
,dz00
);
2471 rsq11
= gmx_mm256_calc_rsq_ps(dx11
,dy11
,dz11
);
2472 rsq12
= gmx_mm256_calc_rsq_ps(dx12
,dy12
,dz12
);
2473 rsq13
= gmx_mm256_calc_rsq_ps(dx13
,dy13
,dz13
);
2474 rsq21
= gmx_mm256_calc_rsq_ps(dx21
,dy21
,dz21
);
2475 rsq22
= gmx_mm256_calc_rsq_ps(dx22
,dy22
,dz22
);
2476 rsq23
= gmx_mm256_calc_rsq_ps(dx23
,dy23
,dz23
);
2477 rsq31
= gmx_mm256_calc_rsq_ps(dx31
,dy31
,dz31
);
2478 rsq32
= gmx_mm256_calc_rsq_ps(dx32
,dy32
,dz32
);
2479 rsq33
= gmx_mm256_calc_rsq_ps(dx33
,dy33
,dz33
);
2481 rinv00
= gmx_mm256_invsqrt_ps(rsq00
);
2482 rinv11
= gmx_mm256_invsqrt_ps(rsq11
);
2483 rinv12
= gmx_mm256_invsqrt_ps(rsq12
);
2484 rinv13
= gmx_mm256_invsqrt_ps(rsq13
);
2485 rinv21
= gmx_mm256_invsqrt_ps(rsq21
);
2486 rinv22
= gmx_mm256_invsqrt_ps(rsq22
);
2487 rinv23
= gmx_mm256_invsqrt_ps(rsq23
);
2488 rinv31
= gmx_mm256_invsqrt_ps(rsq31
);
2489 rinv32
= gmx_mm256_invsqrt_ps(rsq32
);
2490 rinv33
= gmx_mm256_invsqrt_ps(rsq33
);
2492 fjx0
= _mm256_setzero_ps();
2493 fjy0
= _mm256_setzero_ps();
2494 fjz0
= _mm256_setzero_ps();
2495 fjx1
= _mm256_setzero_ps();
2496 fjy1
= _mm256_setzero_ps();
2497 fjz1
= _mm256_setzero_ps();
2498 fjx2
= _mm256_setzero_ps();
2499 fjy2
= _mm256_setzero_ps();
2500 fjz2
= _mm256_setzero_ps();
2501 fjx3
= _mm256_setzero_ps();
2502 fjy3
= _mm256_setzero_ps();
2503 fjz3
= _mm256_setzero_ps();
2505 /**************************
2506 * CALCULATE INTERACTIONS *
2507 **************************/
2509 r00
= _mm256_mul_ps(rsq00
,rinv00
);
2510 r00
= _mm256_andnot_ps(dummy_mask
,r00
);
2512 /* Calculate table index by multiplying r with table scale and truncate to integer */
2513 rt
= _mm256_mul_ps(r00
,vftabscale
);
2514 vfitab
= _mm256_cvttps_epi32(rt
);
2515 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2516 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2517 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2518 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2519 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2520 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2522 /* CUBIC SPLINE TABLE DISPERSION */
2523 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
2524 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
2525 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2526 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2527 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2528 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2529 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2530 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2531 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2532 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2533 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2534 Heps
= _mm256_mul_ps(vfeps
,H
);
2535 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2536 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2537 fvdw6
= _mm256_mul_ps(c6_00
,FF
);
2539 /* CUBIC SPLINE TABLE REPULSION */
2540 vfitab_lo
= _mm_add_epi32(vfitab_lo
,ifour
);
2541 vfitab_hi
= _mm_add_epi32(vfitab_hi
,ifour
);
2542 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2543 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2544 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2545 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2546 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2547 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2548 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2549 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2550 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2551 Heps
= _mm256_mul_ps(vfeps
,H
);
2552 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2553 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2554 fvdw12
= _mm256_mul_ps(c12_00
,FF
);
2555 fvdw
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_add_ps(fvdw6
,fvdw12
),_mm256_mul_ps(vftabscale
,rinv00
)));
2559 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2561 /* Calculate temporary vectorial force */
2562 tx
= _mm256_mul_ps(fscal
,dx00
);
2563 ty
= _mm256_mul_ps(fscal
,dy00
);
2564 tz
= _mm256_mul_ps(fscal
,dz00
);
2566 /* Update vectorial force */
2567 fix0
= _mm256_add_ps(fix0
,tx
);
2568 fiy0
= _mm256_add_ps(fiy0
,ty
);
2569 fiz0
= _mm256_add_ps(fiz0
,tz
);
2571 fjx0
= _mm256_add_ps(fjx0
,tx
);
2572 fjy0
= _mm256_add_ps(fjy0
,ty
);
2573 fjz0
= _mm256_add_ps(fjz0
,tz
);
2575 /**************************
2576 * CALCULATE INTERACTIONS *
2577 **************************/
2579 r11
= _mm256_mul_ps(rsq11
,rinv11
);
2580 r11
= _mm256_andnot_ps(dummy_mask
,r11
);
2582 /* Calculate table index by multiplying r with table scale and truncate to integer */
2583 rt
= _mm256_mul_ps(r11
,vftabscale
);
2584 vfitab
= _mm256_cvttps_epi32(rt
);
2585 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2586 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2587 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2588 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2589 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2590 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2592 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2593 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2594 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2595 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2596 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2597 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2598 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2599 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2600 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2601 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2602 Heps
= _mm256_mul_ps(vfeps
,H
);
2603 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2604 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2605 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq11
,FF
),_mm256_mul_ps(vftabscale
,rinv11
)));
2609 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2611 /* Calculate temporary vectorial force */
2612 tx
= _mm256_mul_ps(fscal
,dx11
);
2613 ty
= _mm256_mul_ps(fscal
,dy11
);
2614 tz
= _mm256_mul_ps(fscal
,dz11
);
2616 /* Update vectorial force */
2617 fix1
= _mm256_add_ps(fix1
,tx
);
2618 fiy1
= _mm256_add_ps(fiy1
,ty
);
2619 fiz1
= _mm256_add_ps(fiz1
,tz
);
2621 fjx1
= _mm256_add_ps(fjx1
,tx
);
2622 fjy1
= _mm256_add_ps(fjy1
,ty
);
2623 fjz1
= _mm256_add_ps(fjz1
,tz
);
2625 /**************************
2626 * CALCULATE INTERACTIONS *
2627 **************************/
2629 r12
= _mm256_mul_ps(rsq12
,rinv12
);
2630 r12
= _mm256_andnot_ps(dummy_mask
,r12
);
2632 /* Calculate table index by multiplying r with table scale and truncate to integer */
2633 rt
= _mm256_mul_ps(r12
,vftabscale
);
2634 vfitab
= _mm256_cvttps_epi32(rt
);
2635 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2636 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2637 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2638 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2639 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2640 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2642 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2643 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2644 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2645 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2646 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2647 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2648 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2649 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2650 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2651 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2652 Heps
= _mm256_mul_ps(vfeps
,H
);
2653 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2654 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2655 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq12
,FF
),_mm256_mul_ps(vftabscale
,rinv12
)));
2659 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2661 /* Calculate temporary vectorial force */
2662 tx
= _mm256_mul_ps(fscal
,dx12
);
2663 ty
= _mm256_mul_ps(fscal
,dy12
);
2664 tz
= _mm256_mul_ps(fscal
,dz12
);
2666 /* Update vectorial force */
2667 fix1
= _mm256_add_ps(fix1
,tx
);
2668 fiy1
= _mm256_add_ps(fiy1
,ty
);
2669 fiz1
= _mm256_add_ps(fiz1
,tz
);
2671 fjx2
= _mm256_add_ps(fjx2
,tx
);
2672 fjy2
= _mm256_add_ps(fjy2
,ty
);
2673 fjz2
= _mm256_add_ps(fjz2
,tz
);
2675 /**************************
2676 * CALCULATE INTERACTIONS *
2677 **************************/
2679 r13
= _mm256_mul_ps(rsq13
,rinv13
);
2680 r13
= _mm256_andnot_ps(dummy_mask
,r13
);
2682 /* Calculate table index by multiplying r with table scale and truncate to integer */
2683 rt
= _mm256_mul_ps(r13
,vftabscale
);
2684 vfitab
= _mm256_cvttps_epi32(rt
);
2685 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2686 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2687 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2688 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2689 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2690 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2692 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2693 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2694 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2695 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2696 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2697 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2698 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2699 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2700 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2701 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2702 Heps
= _mm256_mul_ps(vfeps
,H
);
2703 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2704 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2705 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq13
,FF
),_mm256_mul_ps(vftabscale
,rinv13
)));
2709 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2711 /* Calculate temporary vectorial force */
2712 tx
= _mm256_mul_ps(fscal
,dx13
);
2713 ty
= _mm256_mul_ps(fscal
,dy13
);
2714 tz
= _mm256_mul_ps(fscal
,dz13
);
2716 /* Update vectorial force */
2717 fix1
= _mm256_add_ps(fix1
,tx
);
2718 fiy1
= _mm256_add_ps(fiy1
,ty
);
2719 fiz1
= _mm256_add_ps(fiz1
,tz
);
2721 fjx3
= _mm256_add_ps(fjx3
,tx
);
2722 fjy3
= _mm256_add_ps(fjy3
,ty
);
2723 fjz3
= _mm256_add_ps(fjz3
,tz
);
2725 /**************************
2726 * CALCULATE INTERACTIONS *
2727 **************************/
2729 r21
= _mm256_mul_ps(rsq21
,rinv21
);
2730 r21
= _mm256_andnot_ps(dummy_mask
,r21
);
2732 /* Calculate table index by multiplying r with table scale and truncate to integer */
2733 rt
= _mm256_mul_ps(r21
,vftabscale
);
2734 vfitab
= _mm256_cvttps_epi32(rt
);
2735 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2736 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2737 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2738 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2739 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2740 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2742 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2743 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2744 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2745 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2746 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2747 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2748 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2749 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2750 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2751 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2752 Heps
= _mm256_mul_ps(vfeps
,H
);
2753 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2754 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2755 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq21
,FF
),_mm256_mul_ps(vftabscale
,rinv21
)));
2759 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2761 /* Calculate temporary vectorial force */
2762 tx
= _mm256_mul_ps(fscal
,dx21
);
2763 ty
= _mm256_mul_ps(fscal
,dy21
);
2764 tz
= _mm256_mul_ps(fscal
,dz21
);
2766 /* Update vectorial force */
2767 fix2
= _mm256_add_ps(fix2
,tx
);
2768 fiy2
= _mm256_add_ps(fiy2
,ty
);
2769 fiz2
= _mm256_add_ps(fiz2
,tz
);
2771 fjx1
= _mm256_add_ps(fjx1
,tx
);
2772 fjy1
= _mm256_add_ps(fjy1
,ty
);
2773 fjz1
= _mm256_add_ps(fjz1
,tz
);
2775 /**************************
2776 * CALCULATE INTERACTIONS *
2777 **************************/
2779 r22
= _mm256_mul_ps(rsq22
,rinv22
);
2780 r22
= _mm256_andnot_ps(dummy_mask
,r22
);
2782 /* Calculate table index by multiplying r with table scale and truncate to integer */
2783 rt
= _mm256_mul_ps(r22
,vftabscale
);
2784 vfitab
= _mm256_cvttps_epi32(rt
);
2785 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2786 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2787 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2788 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2789 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2790 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2792 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2793 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2794 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2795 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2796 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2797 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2798 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2799 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2800 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2801 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2802 Heps
= _mm256_mul_ps(vfeps
,H
);
2803 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2804 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2805 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq22
,FF
),_mm256_mul_ps(vftabscale
,rinv22
)));
2809 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2811 /* Calculate temporary vectorial force */
2812 tx
= _mm256_mul_ps(fscal
,dx22
);
2813 ty
= _mm256_mul_ps(fscal
,dy22
);
2814 tz
= _mm256_mul_ps(fscal
,dz22
);
2816 /* Update vectorial force */
2817 fix2
= _mm256_add_ps(fix2
,tx
);
2818 fiy2
= _mm256_add_ps(fiy2
,ty
);
2819 fiz2
= _mm256_add_ps(fiz2
,tz
);
2821 fjx2
= _mm256_add_ps(fjx2
,tx
);
2822 fjy2
= _mm256_add_ps(fjy2
,ty
);
2823 fjz2
= _mm256_add_ps(fjz2
,tz
);
2825 /**************************
2826 * CALCULATE INTERACTIONS *
2827 **************************/
2829 r23
= _mm256_mul_ps(rsq23
,rinv23
);
2830 r23
= _mm256_andnot_ps(dummy_mask
,r23
);
2832 /* Calculate table index by multiplying r with table scale and truncate to integer */
2833 rt
= _mm256_mul_ps(r23
,vftabscale
);
2834 vfitab
= _mm256_cvttps_epi32(rt
);
2835 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2836 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2837 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2838 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2839 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2840 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2842 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2843 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2844 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2845 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2846 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2847 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2848 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2849 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2850 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2851 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2852 Heps
= _mm256_mul_ps(vfeps
,H
);
2853 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2854 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2855 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq23
,FF
),_mm256_mul_ps(vftabscale
,rinv23
)));
2859 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2861 /* Calculate temporary vectorial force */
2862 tx
= _mm256_mul_ps(fscal
,dx23
);
2863 ty
= _mm256_mul_ps(fscal
,dy23
);
2864 tz
= _mm256_mul_ps(fscal
,dz23
);
2866 /* Update vectorial force */
2867 fix2
= _mm256_add_ps(fix2
,tx
);
2868 fiy2
= _mm256_add_ps(fiy2
,ty
);
2869 fiz2
= _mm256_add_ps(fiz2
,tz
);
2871 fjx3
= _mm256_add_ps(fjx3
,tx
);
2872 fjy3
= _mm256_add_ps(fjy3
,ty
);
2873 fjz3
= _mm256_add_ps(fjz3
,tz
);
2875 /**************************
2876 * CALCULATE INTERACTIONS *
2877 **************************/
2879 r31
= _mm256_mul_ps(rsq31
,rinv31
);
2880 r31
= _mm256_andnot_ps(dummy_mask
,r31
);
2882 /* Calculate table index by multiplying r with table scale and truncate to integer */
2883 rt
= _mm256_mul_ps(r31
,vftabscale
);
2884 vfitab
= _mm256_cvttps_epi32(rt
);
2885 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2886 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2887 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2888 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2889 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2890 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2892 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2893 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2894 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2895 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2896 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2897 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2898 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2899 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2900 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2901 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2902 Heps
= _mm256_mul_ps(vfeps
,H
);
2903 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2904 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2905 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq31
,FF
),_mm256_mul_ps(vftabscale
,rinv31
)));
2909 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2911 /* Calculate temporary vectorial force */
2912 tx
= _mm256_mul_ps(fscal
,dx31
);
2913 ty
= _mm256_mul_ps(fscal
,dy31
);
2914 tz
= _mm256_mul_ps(fscal
,dz31
);
2916 /* Update vectorial force */
2917 fix3
= _mm256_add_ps(fix3
,tx
);
2918 fiy3
= _mm256_add_ps(fiy3
,ty
);
2919 fiz3
= _mm256_add_ps(fiz3
,tz
);
2921 fjx1
= _mm256_add_ps(fjx1
,tx
);
2922 fjy1
= _mm256_add_ps(fjy1
,ty
);
2923 fjz1
= _mm256_add_ps(fjz1
,tz
);
2925 /**************************
2926 * CALCULATE INTERACTIONS *
2927 **************************/
2929 r32
= _mm256_mul_ps(rsq32
,rinv32
);
2930 r32
= _mm256_andnot_ps(dummy_mask
,r32
);
2932 /* Calculate table index by multiplying r with table scale and truncate to integer */
2933 rt
= _mm256_mul_ps(r32
,vftabscale
);
2934 vfitab
= _mm256_cvttps_epi32(rt
);
2935 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2936 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2937 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2938 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2939 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2940 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2942 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2943 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2944 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2945 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2946 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2947 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2948 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2949 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
2950 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
2951 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
2952 Heps
= _mm256_mul_ps(vfeps
,H
);
2953 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
2954 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
2955 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq32
,FF
),_mm256_mul_ps(vftabscale
,rinv32
)));
2959 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
2961 /* Calculate temporary vectorial force */
2962 tx
= _mm256_mul_ps(fscal
,dx32
);
2963 ty
= _mm256_mul_ps(fscal
,dy32
);
2964 tz
= _mm256_mul_ps(fscal
,dz32
);
2966 /* Update vectorial force */
2967 fix3
= _mm256_add_ps(fix3
,tx
);
2968 fiy3
= _mm256_add_ps(fiy3
,ty
);
2969 fiz3
= _mm256_add_ps(fiz3
,tz
);
2971 fjx2
= _mm256_add_ps(fjx2
,tx
);
2972 fjy2
= _mm256_add_ps(fjy2
,ty
);
2973 fjz2
= _mm256_add_ps(fjz2
,tz
);
2975 /**************************
2976 * CALCULATE INTERACTIONS *
2977 **************************/
2979 r33
= _mm256_mul_ps(rsq33
,rinv33
);
2980 r33
= _mm256_andnot_ps(dummy_mask
,r33
);
2982 /* Calculate table index by multiplying r with table scale and truncate to integer */
2983 rt
= _mm256_mul_ps(r33
,vftabscale
);
2984 vfitab
= _mm256_cvttps_epi32(rt
);
2985 vfeps
= _mm256_sub_ps(rt
,_mm256_round_ps(rt
, _MM_FROUND_FLOOR
));
2986 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2987 vfitab_lo
= _mm256_extractf128_si256(vfitab
,0x0);
2988 vfitab_hi
= _mm256_extractf128_si256(vfitab
,0x1);
2989 vfitab_lo
= _mm_slli_epi32(_mm_add_epi32(vfitab_lo
,_mm_slli_epi32(vfitab_lo
,1)),2);
2990 vfitab_hi
= _mm_slli_epi32(_mm_add_epi32(vfitab_hi
,_mm_slli_epi32(vfitab_hi
,1)),2);
2992 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2993 Y
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,0)),
2994 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,0)));
2995 F
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,1)),
2996 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,1)));
2997 G
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,2)),
2998 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,2)));
2999 H
= gmx_mm256_set_m128(_mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_hi
,3)),
3000 _mm_load_ps(vftab
+ _mm_extract_epi32(vfitab_lo
,3)));
3001 GMX_MM256_HALFTRANSPOSE4_PS(Y
,F
,G
,H
);
3002 Heps
= _mm256_mul_ps(vfeps
,H
);
3003 Fp
= _mm256_add_ps(F
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,Heps
)));
3004 FF
= _mm256_add_ps(Fp
,_mm256_mul_ps(vfeps
,_mm256_add_ps(G
,_mm256_add_ps(Heps
,Heps
))));
3005 felec
= _mm256_xor_ps(signbit
,_mm256_mul_ps(_mm256_mul_ps(qq33
,FF
),_mm256_mul_ps(vftabscale
,rinv33
)));
3009 fscal
= _mm256_andnot_ps(dummy_mask
,fscal
);
3011 /* Calculate temporary vectorial force */
3012 tx
= _mm256_mul_ps(fscal
,dx33
);
3013 ty
= _mm256_mul_ps(fscal
,dy33
);
3014 tz
= _mm256_mul_ps(fscal
,dz33
);
3016 /* Update vectorial force */
3017 fix3
= _mm256_add_ps(fix3
,tx
);
3018 fiy3
= _mm256_add_ps(fiy3
,ty
);
3019 fiz3
= _mm256_add_ps(fiz3
,tz
);
3021 fjx3
= _mm256_add_ps(fjx3
,tx
);
3022 fjy3
= _mm256_add_ps(fjy3
,ty
);
3023 fjz3
= _mm256_add_ps(fjz3
,tz
);
3025 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
3026 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
3027 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
3028 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
3029 fjptrE
= (jnrlistE
>=0) ? f
+j_coord_offsetE
: scratch
;
3030 fjptrF
= (jnrlistF
>=0) ? f
+j_coord_offsetF
: scratch
;
3031 fjptrG
= (jnrlistG
>=0) ? f
+j_coord_offsetG
: scratch
;
3032 fjptrH
= (jnrlistH
>=0) ? f
+j_coord_offsetH
: scratch
;
3034 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,fjptrE
,fjptrF
,fjptrG
,fjptrH
,
3035 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
3036 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
3038 /* Inner loop uses 412 flops */
3041 /* End of innermost loop */
3043 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
3044 f
+i_coord_offset
,fshift
+i_shift_offset
);
3046 /* Increment number of inner iterations */
3047 inneriter
+= j_index_end
- j_index_start
;
3049 /* Outer loop uses 24 flops */
3052 /* Increment number of outer iterations */
3055 /* Update outer/inner flops */
3057 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*412);