2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
97 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
98 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
105 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
106 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
107 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
108 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
111 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
114 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
117 __m128i ifour
= _mm_set1_epi32(4);
118 __m128 rt
,vfeps
,twovfeps
,vftabscale
,Y
,F
,G
,H
,Fp
,VV
,FF
;
120 __m128 dummy_mask
,cutoff_mask
;
121 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one
= _mm_set1_ps(1.0);
123 __m128 two
= _mm_set1_ps(2.0);
129 jindex
= nlist
->jindex
;
131 shiftidx
= nlist
->shift
;
133 shiftvec
= fr
->shift_vec
[0];
134 fshift
= fr
->fshift
[0];
135 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
136 charge
= mdatoms
->chargeA
;
137 nvdwtype
= fr
->ntype
;
139 vdwtype
= mdatoms
->typeA
;
141 vftab
= kernel_data
->table_elec_vdw
->data
;
142 vftabscale
= _mm_set1_ps(kernel_data
->table_elec_vdw
->scale
);
144 /* Setup water-specific parameters */
145 inr
= nlist
->iinr
[0];
146 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
147 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
148 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
149 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
151 jq1
= _mm_set1_ps(charge
[inr
+1]);
152 jq2
= _mm_set1_ps(charge
[inr
+2]);
153 jq3
= _mm_set1_ps(charge
[inr
+3]);
154 vdwjidx0A
= 2*vdwtype
[inr
+0];
155 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
156 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
157 qq11
= _mm_mul_ps(iq1
,jq1
);
158 qq12
= _mm_mul_ps(iq1
,jq2
);
159 qq13
= _mm_mul_ps(iq1
,jq3
);
160 qq21
= _mm_mul_ps(iq2
,jq1
);
161 qq22
= _mm_mul_ps(iq2
,jq2
);
162 qq23
= _mm_mul_ps(iq2
,jq3
);
163 qq31
= _mm_mul_ps(iq3
,jq1
);
164 qq32
= _mm_mul_ps(iq3
,jq2
);
165 qq33
= _mm_mul_ps(iq3
,jq3
);
167 /* Avoid stupid compiler warnings */
168 jnrA
= jnrB
= jnrC
= jnrD
= 0;
177 for(iidx
=0;iidx
<4*DIM
;iidx
++)
182 /* Start outer loop over neighborlists */
183 for(iidx
=0; iidx
<nri
; iidx
++)
185 /* Load shift vector for this list */
186 i_shift_offset
= DIM
*shiftidx
[iidx
];
188 /* Load limits for loop over neighbors */
189 j_index_start
= jindex
[iidx
];
190 j_index_end
= jindex
[iidx
+1];
192 /* Get outer coordinate index */
194 i_coord_offset
= DIM
*inr
;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
198 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
200 fix0
= _mm_setzero_ps();
201 fiy0
= _mm_setzero_ps();
202 fiz0
= _mm_setzero_ps();
203 fix1
= _mm_setzero_ps();
204 fiy1
= _mm_setzero_ps();
205 fiz1
= _mm_setzero_ps();
206 fix2
= _mm_setzero_ps();
207 fiy2
= _mm_setzero_ps();
208 fiz2
= _mm_setzero_ps();
209 fix3
= _mm_setzero_ps();
210 fiy3
= _mm_setzero_ps();
211 fiz3
= _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum
= _mm_setzero_ps();
215 vvdwsum
= _mm_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
221 /* Get j neighbor index, and coordinate index */
226 j_coord_offsetA
= DIM
*jnrA
;
227 j_coord_offsetB
= DIM
*jnrB
;
228 j_coord_offsetC
= DIM
*jnrC
;
229 j_coord_offsetD
= DIM
*jnrD
;
231 /* load j atom coordinates */
232 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
233 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
234 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
235 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
237 /* Calculate displacement vector */
238 dx00
= _mm_sub_ps(ix0
,jx0
);
239 dy00
= _mm_sub_ps(iy0
,jy0
);
240 dz00
= _mm_sub_ps(iz0
,jz0
);
241 dx11
= _mm_sub_ps(ix1
,jx1
);
242 dy11
= _mm_sub_ps(iy1
,jy1
);
243 dz11
= _mm_sub_ps(iz1
,jz1
);
244 dx12
= _mm_sub_ps(ix1
,jx2
);
245 dy12
= _mm_sub_ps(iy1
,jy2
);
246 dz12
= _mm_sub_ps(iz1
,jz2
);
247 dx13
= _mm_sub_ps(ix1
,jx3
);
248 dy13
= _mm_sub_ps(iy1
,jy3
);
249 dz13
= _mm_sub_ps(iz1
,jz3
);
250 dx21
= _mm_sub_ps(ix2
,jx1
);
251 dy21
= _mm_sub_ps(iy2
,jy1
);
252 dz21
= _mm_sub_ps(iz2
,jz1
);
253 dx22
= _mm_sub_ps(ix2
,jx2
);
254 dy22
= _mm_sub_ps(iy2
,jy2
);
255 dz22
= _mm_sub_ps(iz2
,jz2
);
256 dx23
= _mm_sub_ps(ix2
,jx3
);
257 dy23
= _mm_sub_ps(iy2
,jy3
);
258 dz23
= _mm_sub_ps(iz2
,jz3
);
259 dx31
= _mm_sub_ps(ix3
,jx1
);
260 dy31
= _mm_sub_ps(iy3
,jy1
);
261 dz31
= _mm_sub_ps(iz3
,jz1
);
262 dx32
= _mm_sub_ps(ix3
,jx2
);
263 dy32
= _mm_sub_ps(iy3
,jy2
);
264 dz32
= _mm_sub_ps(iz3
,jz2
);
265 dx33
= _mm_sub_ps(ix3
,jx3
);
266 dy33
= _mm_sub_ps(iy3
,jy3
);
267 dz33
= _mm_sub_ps(iz3
,jz3
);
269 /* Calculate squared distance and things based on it */
270 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
271 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
272 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
273 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
274 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
275 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
276 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
277 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
278 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
279 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
281 rinv00
= avx128fma_invsqrt_f(rsq00
);
282 rinv11
= avx128fma_invsqrt_f(rsq11
);
283 rinv12
= avx128fma_invsqrt_f(rsq12
);
284 rinv13
= avx128fma_invsqrt_f(rsq13
);
285 rinv21
= avx128fma_invsqrt_f(rsq21
);
286 rinv22
= avx128fma_invsqrt_f(rsq22
);
287 rinv23
= avx128fma_invsqrt_f(rsq23
);
288 rinv31
= avx128fma_invsqrt_f(rsq31
);
289 rinv32
= avx128fma_invsqrt_f(rsq32
);
290 rinv33
= avx128fma_invsqrt_f(rsq33
);
292 fjx0
= _mm_setzero_ps();
293 fjy0
= _mm_setzero_ps();
294 fjz0
= _mm_setzero_ps();
295 fjx1
= _mm_setzero_ps();
296 fjy1
= _mm_setzero_ps();
297 fjz1
= _mm_setzero_ps();
298 fjx2
= _mm_setzero_ps();
299 fjy2
= _mm_setzero_ps();
300 fjz2
= _mm_setzero_ps();
301 fjx3
= _mm_setzero_ps();
302 fjy3
= _mm_setzero_ps();
303 fjz3
= _mm_setzero_ps();
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 r00
= _mm_mul_ps(rsq00
,rinv00
);
311 /* Calculate table index by multiplying r with table scale and truncate to integer */
312 rt
= _mm_mul_ps(r00
,vftabscale
);
313 vfitab
= _mm_cvttps_epi32(rt
);
315 vfeps
= _mm_frcz_ps(rt
);
317 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
319 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
320 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
322 /* CUBIC SPLINE TABLE DISPERSION */
323 vfitab
= _mm_add_epi32(vfitab
,ifour
);
324 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
325 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
326 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
327 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
328 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
329 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
330 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
331 vvdw6
= _mm_mul_ps(c6_00
,VV
);
332 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
333 fvdw6
= _mm_mul_ps(c6_00
,FF
);
335 /* CUBIC SPLINE TABLE REPULSION */
336 vfitab
= _mm_add_epi32(vfitab
,ifour
);
337 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
338 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
339 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
340 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
341 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
342 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
343 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
344 vvdw12
= _mm_mul_ps(c12_00
,VV
);
345 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
346 fvdw12
= _mm_mul_ps(c12_00
,FF
);
347 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
348 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
355 /* Update vectorial force */
356 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
357 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
358 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
360 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
361 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
362 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 r11
= _mm_mul_ps(rsq11
,rinv11
);
370 /* Calculate table index by multiplying r with table scale and truncate to integer */
371 rt
= _mm_mul_ps(r11
,vftabscale
);
372 vfitab
= _mm_cvttps_epi32(rt
);
374 vfeps
= _mm_frcz_ps(rt
);
376 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
378 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
379 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
381 /* CUBIC SPLINE TABLE ELECTROSTATICS */
382 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
383 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
384 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
385 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
386 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
387 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
388 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
389 velec
= _mm_mul_ps(qq11
,VV
);
390 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
391 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum
= _mm_add_ps(velecsum
,velec
);
398 /* Update vectorial force */
399 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
400 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
401 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
403 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
404 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
405 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 r12
= _mm_mul_ps(rsq12
,rinv12
);
413 /* Calculate table index by multiplying r with table scale and truncate to integer */
414 rt
= _mm_mul_ps(r12
,vftabscale
);
415 vfitab
= _mm_cvttps_epi32(rt
);
417 vfeps
= _mm_frcz_ps(rt
);
419 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
421 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
422 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
426 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
427 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
428 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
429 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
430 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
431 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
432 velec
= _mm_mul_ps(qq12
,VV
);
433 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
434 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velecsum
= _mm_add_ps(velecsum
,velec
);
441 /* Update vectorial force */
442 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
443 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
444 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
446 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
447 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
448 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 r13
= _mm_mul_ps(rsq13
,rinv13
);
456 /* Calculate table index by multiplying r with table scale and truncate to integer */
457 rt
= _mm_mul_ps(r13
,vftabscale
);
458 vfitab
= _mm_cvttps_epi32(rt
);
460 vfeps
= _mm_frcz_ps(rt
);
462 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
464 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
465 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
467 /* CUBIC SPLINE TABLE ELECTROSTATICS */
468 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
469 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
470 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
471 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
472 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
473 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
474 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
475 velec
= _mm_mul_ps(qq13
,VV
);
476 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
477 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
479 /* Update potential sum for this i atom from the interaction with this j atom. */
480 velecsum
= _mm_add_ps(velecsum
,velec
);
484 /* Update vectorial force */
485 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
486 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
487 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
489 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
490 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
491 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 r21
= _mm_mul_ps(rsq21
,rinv21
);
499 /* Calculate table index by multiplying r with table scale and truncate to integer */
500 rt
= _mm_mul_ps(r21
,vftabscale
);
501 vfitab
= _mm_cvttps_epi32(rt
);
503 vfeps
= _mm_frcz_ps(rt
);
505 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
507 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
508 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
510 /* CUBIC SPLINE TABLE ELECTROSTATICS */
511 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
512 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
513 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
514 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
515 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
516 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
517 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
518 velec
= _mm_mul_ps(qq21
,VV
);
519 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
520 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum
= _mm_add_ps(velecsum
,velec
);
527 /* Update vectorial force */
528 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
529 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
530 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
532 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
533 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
534 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 r22
= _mm_mul_ps(rsq22
,rinv22
);
542 /* Calculate table index by multiplying r with table scale and truncate to integer */
543 rt
= _mm_mul_ps(r22
,vftabscale
);
544 vfitab
= _mm_cvttps_epi32(rt
);
546 vfeps
= _mm_frcz_ps(rt
);
548 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
550 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
551 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
553 /* CUBIC SPLINE TABLE ELECTROSTATICS */
554 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
555 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
556 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
557 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
558 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
559 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
560 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
561 velec
= _mm_mul_ps(qq22
,VV
);
562 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
563 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum
= _mm_add_ps(velecsum
,velec
);
570 /* Update vectorial force */
571 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
572 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
573 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
575 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
576 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
577 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r23
= _mm_mul_ps(rsq23
,rinv23
);
585 /* Calculate table index by multiplying r with table scale and truncate to integer */
586 rt
= _mm_mul_ps(r23
,vftabscale
);
587 vfitab
= _mm_cvttps_epi32(rt
);
589 vfeps
= _mm_frcz_ps(rt
);
591 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
593 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
594 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
596 /* CUBIC SPLINE TABLE ELECTROSTATICS */
597 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
598 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
599 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
600 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
601 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
602 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
603 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
604 velec
= _mm_mul_ps(qq23
,VV
);
605 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
606 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velecsum
= _mm_add_ps(velecsum
,velec
);
613 /* Update vectorial force */
614 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
615 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
616 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
618 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
619 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
620 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 r31
= _mm_mul_ps(rsq31
,rinv31
);
628 /* Calculate table index by multiplying r with table scale and truncate to integer */
629 rt
= _mm_mul_ps(r31
,vftabscale
);
630 vfitab
= _mm_cvttps_epi32(rt
);
632 vfeps
= _mm_frcz_ps(rt
);
634 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
636 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
637 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
639 /* CUBIC SPLINE TABLE ELECTROSTATICS */
640 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
641 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
642 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
643 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
644 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
645 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
646 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
647 velec
= _mm_mul_ps(qq31
,VV
);
648 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
649 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum
= _mm_add_ps(velecsum
,velec
);
656 /* Update vectorial force */
657 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
658 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
659 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
661 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
662 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
663 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
665 /**************************
666 * CALCULATE INTERACTIONS *
667 **************************/
669 r32
= _mm_mul_ps(rsq32
,rinv32
);
671 /* Calculate table index by multiplying r with table scale and truncate to integer */
672 rt
= _mm_mul_ps(r32
,vftabscale
);
673 vfitab
= _mm_cvttps_epi32(rt
);
675 vfeps
= _mm_frcz_ps(rt
);
677 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
679 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
680 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
682 /* CUBIC SPLINE TABLE ELECTROSTATICS */
683 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
684 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
685 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
686 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
687 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
688 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
689 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
690 velec
= _mm_mul_ps(qq32
,VV
);
691 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
692 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum
= _mm_add_ps(velecsum
,velec
);
699 /* Update vectorial force */
700 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
701 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
702 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
704 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
705 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
706 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 r33
= _mm_mul_ps(rsq33
,rinv33
);
714 /* Calculate table index by multiplying r with table scale and truncate to integer */
715 rt
= _mm_mul_ps(r33
,vftabscale
);
716 vfitab
= _mm_cvttps_epi32(rt
);
718 vfeps
= _mm_frcz_ps(rt
);
720 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
722 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
723 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
725 /* CUBIC SPLINE TABLE ELECTROSTATICS */
726 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
727 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
728 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
729 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
730 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
731 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
732 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
733 velec
= _mm_mul_ps(qq33
,VV
);
734 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
735 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velecsum
= _mm_add_ps(velecsum
,velec
);
742 /* Update vectorial force */
743 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
744 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
745 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
747 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
748 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
749 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
751 fjptrA
= f
+j_coord_offsetA
;
752 fjptrB
= f
+j_coord_offsetB
;
753 fjptrC
= f
+j_coord_offsetC
;
754 fjptrD
= f
+j_coord_offsetD
;
756 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
757 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
758 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
760 /* Inner loop uses 476 flops */
766 /* Get j neighbor index, and coordinate index */
767 jnrlistA
= jjnr
[jidx
];
768 jnrlistB
= jjnr
[jidx
+1];
769 jnrlistC
= jjnr
[jidx
+2];
770 jnrlistD
= jjnr
[jidx
+3];
771 /* Sign of each element will be negative for non-real atoms.
772 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
773 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
775 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
776 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
777 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
778 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
779 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
780 j_coord_offsetA
= DIM
*jnrA
;
781 j_coord_offsetB
= DIM
*jnrB
;
782 j_coord_offsetC
= DIM
*jnrC
;
783 j_coord_offsetD
= DIM
*jnrD
;
785 /* load j atom coordinates */
786 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
787 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
788 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
789 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
791 /* Calculate displacement vector */
792 dx00
= _mm_sub_ps(ix0
,jx0
);
793 dy00
= _mm_sub_ps(iy0
,jy0
);
794 dz00
= _mm_sub_ps(iz0
,jz0
);
795 dx11
= _mm_sub_ps(ix1
,jx1
);
796 dy11
= _mm_sub_ps(iy1
,jy1
);
797 dz11
= _mm_sub_ps(iz1
,jz1
);
798 dx12
= _mm_sub_ps(ix1
,jx2
);
799 dy12
= _mm_sub_ps(iy1
,jy2
);
800 dz12
= _mm_sub_ps(iz1
,jz2
);
801 dx13
= _mm_sub_ps(ix1
,jx3
);
802 dy13
= _mm_sub_ps(iy1
,jy3
);
803 dz13
= _mm_sub_ps(iz1
,jz3
);
804 dx21
= _mm_sub_ps(ix2
,jx1
);
805 dy21
= _mm_sub_ps(iy2
,jy1
);
806 dz21
= _mm_sub_ps(iz2
,jz1
);
807 dx22
= _mm_sub_ps(ix2
,jx2
);
808 dy22
= _mm_sub_ps(iy2
,jy2
);
809 dz22
= _mm_sub_ps(iz2
,jz2
);
810 dx23
= _mm_sub_ps(ix2
,jx3
);
811 dy23
= _mm_sub_ps(iy2
,jy3
);
812 dz23
= _mm_sub_ps(iz2
,jz3
);
813 dx31
= _mm_sub_ps(ix3
,jx1
);
814 dy31
= _mm_sub_ps(iy3
,jy1
);
815 dz31
= _mm_sub_ps(iz3
,jz1
);
816 dx32
= _mm_sub_ps(ix3
,jx2
);
817 dy32
= _mm_sub_ps(iy3
,jy2
);
818 dz32
= _mm_sub_ps(iz3
,jz2
);
819 dx33
= _mm_sub_ps(ix3
,jx3
);
820 dy33
= _mm_sub_ps(iy3
,jy3
);
821 dz33
= _mm_sub_ps(iz3
,jz3
);
823 /* Calculate squared distance and things based on it */
824 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
825 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
826 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
827 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
828 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
829 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
830 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
831 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
832 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
833 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
835 rinv00
= avx128fma_invsqrt_f(rsq00
);
836 rinv11
= avx128fma_invsqrt_f(rsq11
);
837 rinv12
= avx128fma_invsqrt_f(rsq12
);
838 rinv13
= avx128fma_invsqrt_f(rsq13
);
839 rinv21
= avx128fma_invsqrt_f(rsq21
);
840 rinv22
= avx128fma_invsqrt_f(rsq22
);
841 rinv23
= avx128fma_invsqrt_f(rsq23
);
842 rinv31
= avx128fma_invsqrt_f(rsq31
);
843 rinv32
= avx128fma_invsqrt_f(rsq32
);
844 rinv33
= avx128fma_invsqrt_f(rsq33
);
846 fjx0
= _mm_setzero_ps();
847 fjy0
= _mm_setzero_ps();
848 fjz0
= _mm_setzero_ps();
849 fjx1
= _mm_setzero_ps();
850 fjy1
= _mm_setzero_ps();
851 fjz1
= _mm_setzero_ps();
852 fjx2
= _mm_setzero_ps();
853 fjy2
= _mm_setzero_ps();
854 fjz2
= _mm_setzero_ps();
855 fjx3
= _mm_setzero_ps();
856 fjy3
= _mm_setzero_ps();
857 fjz3
= _mm_setzero_ps();
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
863 r00
= _mm_mul_ps(rsq00
,rinv00
);
864 r00
= _mm_andnot_ps(dummy_mask
,r00
);
866 /* Calculate table index by multiplying r with table scale and truncate to integer */
867 rt
= _mm_mul_ps(r00
,vftabscale
);
868 vfitab
= _mm_cvttps_epi32(rt
);
870 vfeps
= _mm_frcz_ps(rt
);
872 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
874 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
875 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
877 /* CUBIC SPLINE TABLE DISPERSION */
878 vfitab
= _mm_add_epi32(vfitab
,ifour
);
879 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
880 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
881 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
882 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
883 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
884 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
885 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
886 vvdw6
= _mm_mul_ps(c6_00
,VV
);
887 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
888 fvdw6
= _mm_mul_ps(c6_00
,FF
);
890 /* CUBIC SPLINE TABLE REPULSION */
891 vfitab
= _mm_add_epi32(vfitab
,ifour
);
892 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
893 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
894 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
895 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
896 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
897 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
898 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
899 vvdw12
= _mm_mul_ps(c12_00
,VV
);
900 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
901 fvdw12
= _mm_mul_ps(c12_00
,FF
);
902 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
903 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
905 /* Update potential sum for this i atom from the interaction with this j atom. */
906 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
907 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
911 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
913 /* Update vectorial force */
914 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
915 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
916 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
918 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
919 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
920 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 r11
= _mm_mul_ps(rsq11
,rinv11
);
927 r11
= _mm_andnot_ps(dummy_mask
,r11
);
929 /* Calculate table index by multiplying r with table scale and truncate to integer */
930 rt
= _mm_mul_ps(r11
,vftabscale
);
931 vfitab
= _mm_cvttps_epi32(rt
);
933 vfeps
= _mm_frcz_ps(rt
);
935 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
937 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
938 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
940 /* CUBIC SPLINE TABLE ELECTROSTATICS */
941 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
942 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
943 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
944 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
945 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
946 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
947 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
948 velec
= _mm_mul_ps(qq11
,VV
);
949 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
950 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec
= _mm_andnot_ps(dummy_mask
,velec
);
954 velecsum
= _mm_add_ps(velecsum
,velec
);
958 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
960 /* Update vectorial force */
961 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
962 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
963 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
965 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
966 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
967 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 r12
= _mm_mul_ps(rsq12
,rinv12
);
974 r12
= _mm_andnot_ps(dummy_mask
,r12
);
976 /* Calculate table index by multiplying r with table scale and truncate to integer */
977 rt
= _mm_mul_ps(r12
,vftabscale
);
978 vfitab
= _mm_cvttps_epi32(rt
);
980 vfeps
= _mm_frcz_ps(rt
);
982 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
984 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
985 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
987 /* CUBIC SPLINE TABLE ELECTROSTATICS */
988 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
989 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
990 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
991 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
992 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
993 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
994 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
995 velec
= _mm_mul_ps(qq12
,VV
);
996 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
997 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1001 velecsum
= _mm_add_ps(velecsum
,velec
);
1005 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1007 /* Update vectorial force */
1008 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1009 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1010 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1012 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1013 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1014 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 r13
= _mm_mul_ps(rsq13
,rinv13
);
1021 r13
= _mm_andnot_ps(dummy_mask
,r13
);
1023 /* Calculate table index by multiplying r with table scale and truncate to integer */
1024 rt
= _mm_mul_ps(r13
,vftabscale
);
1025 vfitab
= _mm_cvttps_epi32(rt
);
1027 vfeps
= _mm_frcz_ps(rt
);
1029 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1031 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1032 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1034 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1035 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1036 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1037 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1038 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1039 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1040 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1041 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1042 velec
= _mm_mul_ps(qq13
,VV
);
1043 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1044 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
1046 /* Update potential sum for this i atom from the interaction with this j atom. */
1047 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1048 velecsum
= _mm_add_ps(velecsum
,velec
);
1052 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1054 /* Update vectorial force */
1055 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
1056 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
1057 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
1059 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
1060 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
1061 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 r21
= _mm_mul_ps(rsq21
,rinv21
);
1068 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1070 /* Calculate table index by multiplying r with table scale and truncate to integer */
1071 rt
= _mm_mul_ps(r21
,vftabscale
);
1072 vfitab
= _mm_cvttps_epi32(rt
);
1074 vfeps
= _mm_frcz_ps(rt
);
1076 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1078 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1079 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1081 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1082 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1083 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1084 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1085 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1086 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1087 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1088 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1089 velec
= _mm_mul_ps(qq21
,VV
);
1090 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1091 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1095 velecsum
= _mm_add_ps(velecsum
,velec
);
1099 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1101 /* Update vectorial force */
1102 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1103 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1104 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1106 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1107 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1108 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r22
= _mm_mul_ps(rsq22
,rinv22
);
1115 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1117 /* Calculate table index by multiplying r with table scale and truncate to integer */
1118 rt
= _mm_mul_ps(r22
,vftabscale
);
1119 vfitab
= _mm_cvttps_epi32(rt
);
1121 vfeps
= _mm_frcz_ps(rt
);
1123 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1125 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1126 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1129 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1130 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1131 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1132 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1133 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1134 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1135 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1136 velec
= _mm_mul_ps(qq22
,VV
);
1137 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1138 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1142 velecsum
= _mm_add_ps(velecsum
,velec
);
1146 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1148 /* Update vectorial force */
1149 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1150 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1151 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1153 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1154 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1155 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1157 /**************************
1158 * CALCULATE INTERACTIONS *
1159 **************************/
1161 r23
= _mm_mul_ps(rsq23
,rinv23
);
1162 r23
= _mm_andnot_ps(dummy_mask
,r23
);
1164 /* Calculate table index by multiplying r with table scale and truncate to integer */
1165 rt
= _mm_mul_ps(r23
,vftabscale
);
1166 vfitab
= _mm_cvttps_epi32(rt
);
1168 vfeps
= _mm_frcz_ps(rt
);
1170 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1172 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1173 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1175 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1176 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1177 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1178 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1179 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1180 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1181 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1182 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1183 velec
= _mm_mul_ps(qq23
,VV
);
1184 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1185 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
1187 /* Update potential sum for this i atom from the interaction with this j atom. */
1188 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1189 velecsum
= _mm_add_ps(velecsum
,velec
);
1193 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1195 /* Update vectorial force */
1196 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
1197 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
1198 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
1200 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
1201 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
1202 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 r31
= _mm_mul_ps(rsq31
,rinv31
);
1209 r31
= _mm_andnot_ps(dummy_mask
,r31
);
1211 /* Calculate table index by multiplying r with table scale and truncate to integer */
1212 rt
= _mm_mul_ps(r31
,vftabscale
);
1213 vfitab
= _mm_cvttps_epi32(rt
);
1215 vfeps
= _mm_frcz_ps(rt
);
1217 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1219 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1220 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1222 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1223 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1224 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1225 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1226 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1227 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1228 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1229 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1230 velec
= _mm_mul_ps(qq31
,VV
);
1231 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1232 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
1234 /* Update potential sum for this i atom from the interaction with this j atom. */
1235 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1236 velecsum
= _mm_add_ps(velecsum
,velec
);
1240 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1242 /* Update vectorial force */
1243 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
1244 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
1245 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
1247 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
1248 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
1249 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
1251 /**************************
1252 * CALCULATE INTERACTIONS *
1253 **************************/
1255 r32
= _mm_mul_ps(rsq32
,rinv32
);
1256 r32
= _mm_andnot_ps(dummy_mask
,r32
);
1258 /* Calculate table index by multiplying r with table scale and truncate to integer */
1259 rt
= _mm_mul_ps(r32
,vftabscale
);
1260 vfitab
= _mm_cvttps_epi32(rt
);
1262 vfeps
= _mm_frcz_ps(rt
);
1264 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1266 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1267 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1269 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1270 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1271 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1272 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1273 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1274 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1275 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1276 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1277 velec
= _mm_mul_ps(qq32
,VV
);
1278 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1279 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
1281 /* Update potential sum for this i atom from the interaction with this j atom. */
1282 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1283 velecsum
= _mm_add_ps(velecsum
,velec
);
1287 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1289 /* Update vectorial force */
1290 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
1291 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
1292 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
1294 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
1295 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
1296 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
1298 /**************************
1299 * CALCULATE INTERACTIONS *
1300 **************************/
1302 r33
= _mm_mul_ps(rsq33
,rinv33
);
1303 r33
= _mm_andnot_ps(dummy_mask
,r33
);
1305 /* Calculate table index by multiplying r with table scale and truncate to integer */
1306 rt
= _mm_mul_ps(r33
,vftabscale
);
1307 vfitab
= _mm_cvttps_epi32(rt
);
1309 vfeps
= _mm_frcz_ps(rt
);
1311 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1313 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1314 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1317 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1318 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1319 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1320 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1321 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1322 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1323 VV
= _mm_macc_ps(vfeps
,Fp
,Y
);
1324 velec
= _mm_mul_ps(qq33
,VV
);
1325 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1326 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1330 velecsum
= _mm_add_ps(velecsum
,velec
);
1334 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1336 /* Update vectorial force */
1337 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
1338 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
1339 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
1341 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
1342 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
1343 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
1345 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1346 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1347 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1348 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1350 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1351 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1352 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1354 /* Inner loop uses 486 flops */
1357 /* End of innermost loop */
1359 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1360 f
+i_coord_offset
,fshift
+i_shift_offset
);
1363 /* Update potential energies */
1364 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1365 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1367 /* Increment number of inner iterations */
1368 inneriter
+= j_index_end
- j_index_start
;
1370 /* Outer loop uses 26 flops */
1373 /* Increment number of outer iterations */
1376 /* Update outer/inner flops */
1378 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*486);
1381 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1382 * Electrostatics interaction: CubicSplineTable
1383 * VdW interaction: CubicSplineTable
1384 * Geometry: Water4-Water4
1385 * Calculate force/pot: Force
1388 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1389 (t_nblist
* gmx_restrict nlist
,
1390 rvec
* gmx_restrict xx
,
1391 rvec
* gmx_restrict ff
,
1392 struct t_forcerec
* gmx_restrict fr
,
1393 t_mdatoms
* gmx_restrict mdatoms
,
1394 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1395 t_nrnb
* gmx_restrict nrnb
)
1397 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1398 * just 0 for non-waters.
1399 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1400 * jnr indices corresponding to data put in the four positions in the SIMD register.
1402 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1403 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1404 int jnrA
,jnrB
,jnrC
,jnrD
;
1405 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1406 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1407 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1408 real rcutoff_scalar
;
1409 real
*shiftvec
,*fshift
,*x
,*f
;
1410 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1411 real scratch
[4*DIM
];
1412 __m128 fscal
,rcutoff
,rcutoff2
,jidxall
;
1414 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1416 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1418 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1420 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1421 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1422 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1423 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1424 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1425 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1426 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1427 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1428 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1429 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1430 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1431 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1432 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1433 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1434 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1435 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1436 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1437 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1438 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1439 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1442 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1445 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1446 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1448 __m128i ifour
= _mm_set1_epi32(4);
1449 __m128 rt
,vfeps
,twovfeps
,vftabscale
,Y
,F
,G
,H
,Fp
,VV
,FF
;
1451 __m128 dummy_mask
,cutoff_mask
;
1452 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1453 __m128 one
= _mm_set1_ps(1.0);
1454 __m128 two
= _mm_set1_ps(2.0);
1460 jindex
= nlist
->jindex
;
1462 shiftidx
= nlist
->shift
;
1464 shiftvec
= fr
->shift_vec
[0];
1465 fshift
= fr
->fshift
[0];
1466 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1467 charge
= mdatoms
->chargeA
;
1468 nvdwtype
= fr
->ntype
;
1469 vdwparam
= fr
->nbfp
;
1470 vdwtype
= mdatoms
->typeA
;
1472 vftab
= kernel_data
->table_elec_vdw
->data
;
1473 vftabscale
= _mm_set1_ps(kernel_data
->table_elec_vdw
->scale
);
1475 /* Setup water-specific parameters */
1476 inr
= nlist
->iinr
[0];
1477 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1478 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1479 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1480 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1482 jq1
= _mm_set1_ps(charge
[inr
+1]);
1483 jq2
= _mm_set1_ps(charge
[inr
+2]);
1484 jq3
= _mm_set1_ps(charge
[inr
+3]);
1485 vdwjidx0A
= 2*vdwtype
[inr
+0];
1486 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1487 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1488 qq11
= _mm_mul_ps(iq1
,jq1
);
1489 qq12
= _mm_mul_ps(iq1
,jq2
);
1490 qq13
= _mm_mul_ps(iq1
,jq3
);
1491 qq21
= _mm_mul_ps(iq2
,jq1
);
1492 qq22
= _mm_mul_ps(iq2
,jq2
);
1493 qq23
= _mm_mul_ps(iq2
,jq3
);
1494 qq31
= _mm_mul_ps(iq3
,jq1
);
1495 qq32
= _mm_mul_ps(iq3
,jq2
);
1496 qq33
= _mm_mul_ps(iq3
,jq3
);
1498 /* Avoid stupid compiler warnings */
1499 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1500 j_coord_offsetA
= 0;
1501 j_coord_offsetB
= 0;
1502 j_coord_offsetC
= 0;
1503 j_coord_offsetD
= 0;
1508 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1510 scratch
[iidx
] = 0.0;
1513 /* Start outer loop over neighborlists */
1514 for(iidx
=0; iidx
<nri
; iidx
++)
1516 /* Load shift vector for this list */
1517 i_shift_offset
= DIM
*shiftidx
[iidx
];
1519 /* Load limits for loop over neighbors */
1520 j_index_start
= jindex
[iidx
];
1521 j_index_end
= jindex
[iidx
+1];
1523 /* Get outer coordinate index */
1525 i_coord_offset
= DIM
*inr
;
1527 /* Load i particle coords and add shift vector */
1528 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1529 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1531 fix0
= _mm_setzero_ps();
1532 fiy0
= _mm_setzero_ps();
1533 fiz0
= _mm_setzero_ps();
1534 fix1
= _mm_setzero_ps();
1535 fiy1
= _mm_setzero_ps();
1536 fiz1
= _mm_setzero_ps();
1537 fix2
= _mm_setzero_ps();
1538 fiy2
= _mm_setzero_ps();
1539 fiz2
= _mm_setzero_ps();
1540 fix3
= _mm_setzero_ps();
1541 fiy3
= _mm_setzero_ps();
1542 fiz3
= _mm_setzero_ps();
1544 /* Start inner kernel loop */
1545 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1548 /* Get j neighbor index, and coordinate index */
1550 jnrB
= jjnr
[jidx
+1];
1551 jnrC
= jjnr
[jidx
+2];
1552 jnrD
= jjnr
[jidx
+3];
1553 j_coord_offsetA
= DIM
*jnrA
;
1554 j_coord_offsetB
= DIM
*jnrB
;
1555 j_coord_offsetC
= DIM
*jnrC
;
1556 j_coord_offsetD
= DIM
*jnrD
;
1558 /* load j atom coordinates */
1559 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1560 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1561 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1562 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1564 /* Calculate displacement vector */
1565 dx00
= _mm_sub_ps(ix0
,jx0
);
1566 dy00
= _mm_sub_ps(iy0
,jy0
);
1567 dz00
= _mm_sub_ps(iz0
,jz0
);
1568 dx11
= _mm_sub_ps(ix1
,jx1
);
1569 dy11
= _mm_sub_ps(iy1
,jy1
);
1570 dz11
= _mm_sub_ps(iz1
,jz1
);
1571 dx12
= _mm_sub_ps(ix1
,jx2
);
1572 dy12
= _mm_sub_ps(iy1
,jy2
);
1573 dz12
= _mm_sub_ps(iz1
,jz2
);
1574 dx13
= _mm_sub_ps(ix1
,jx3
);
1575 dy13
= _mm_sub_ps(iy1
,jy3
);
1576 dz13
= _mm_sub_ps(iz1
,jz3
);
1577 dx21
= _mm_sub_ps(ix2
,jx1
);
1578 dy21
= _mm_sub_ps(iy2
,jy1
);
1579 dz21
= _mm_sub_ps(iz2
,jz1
);
1580 dx22
= _mm_sub_ps(ix2
,jx2
);
1581 dy22
= _mm_sub_ps(iy2
,jy2
);
1582 dz22
= _mm_sub_ps(iz2
,jz2
);
1583 dx23
= _mm_sub_ps(ix2
,jx3
);
1584 dy23
= _mm_sub_ps(iy2
,jy3
);
1585 dz23
= _mm_sub_ps(iz2
,jz3
);
1586 dx31
= _mm_sub_ps(ix3
,jx1
);
1587 dy31
= _mm_sub_ps(iy3
,jy1
);
1588 dz31
= _mm_sub_ps(iz3
,jz1
);
1589 dx32
= _mm_sub_ps(ix3
,jx2
);
1590 dy32
= _mm_sub_ps(iy3
,jy2
);
1591 dz32
= _mm_sub_ps(iz3
,jz2
);
1592 dx33
= _mm_sub_ps(ix3
,jx3
);
1593 dy33
= _mm_sub_ps(iy3
,jy3
);
1594 dz33
= _mm_sub_ps(iz3
,jz3
);
1596 /* Calculate squared distance and things based on it */
1597 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1598 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1599 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1600 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1601 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1602 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1603 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1604 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1605 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1606 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1608 rinv00
= avx128fma_invsqrt_f(rsq00
);
1609 rinv11
= avx128fma_invsqrt_f(rsq11
);
1610 rinv12
= avx128fma_invsqrt_f(rsq12
);
1611 rinv13
= avx128fma_invsqrt_f(rsq13
);
1612 rinv21
= avx128fma_invsqrt_f(rsq21
);
1613 rinv22
= avx128fma_invsqrt_f(rsq22
);
1614 rinv23
= avx128fma_invsqrt_f(rsq23
);
1615 rinv31
= avx128fma_invsqrt_f(rsq31
);
1616 rinv32
= avx128fma_invsqrt_f(rsq32
);
1617 rinv33
= avx128fma_invsqrt_f(rsq33
);
1619 fjx0
= _mm_setzero_ps();
1620 fjy0
= _mm_setzero_ps();
1621 fjz0
= _mm_setzero_ps();
1622 fjx1
= _mm_setzero_ps();
1623 fjy1
= _mm_setzero_ps();
1624 fjz1
= _mm_setzero_ps();
1625 fjx2
= _mm_setzero_ps();
1626 fjy2
= _mm_setzero_ps();
1627 fjz2
= _mm_setzero_ps();
1628 fjx3
= _mm_setzero_ps();
1629 fjy3
= _mm_setzero_ps();
1630 fjz3
= _mm_setzero_ps();
1632 /**************************
1633 * CALCULATE INTERACTIONS *
1634 **************************/
1636 r00
= _mm_mul_ps(rsq00
,rinv00
);
1638 /* Calculate table index by multiplying r with table scale and truncate to integer */
1639 rt
= _mm_mul_ps(r00
,vftabscale
);
1640 vfitab
= _mm_cvttps_epi32(rt
);
1642 vfeps
= _mm_frcz_ps(rt
);
1644 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1646 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1647 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1649 /* CUBIC SPLINE TABLE DISPERSION */
1650 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1651 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1652 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1653 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1654 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1655 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1656 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1657 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1658 fvdw6
= _mm_mul_ps(c6_00
,FF
);
1660 /* CUBIC SPLINE TABLE REPULSION */
1661 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1662 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1663 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1664 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1665 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1666 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1667 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1668 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1669 fvdw12
= _mm_mul_ps(c12_00
,FF
);
1670 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
1674 /* Update vectorial force */
1675 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
1676 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
1677 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
1679 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
1680 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
1681 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 r11
= _mm_mul_ps(rsq11
,rinv11
);
1689 /* Calculate table index by multiplying r with table scale and truncate to integer */
1690 rt
= _mm_mul_ps(r11
,vftabscale
);
1691 vfitab
= _mm_cvttps_epi32(rt
);
1693 vfeps
= _mm_frcz_ps(rt
);
1695 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1697 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1698 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1702 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1703 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1704 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1705 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1706 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1707 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1708 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
1712 /* Update vectorial force */
1713 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
1714 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
1715 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
1717 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
1718 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
1719 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
1721 /**************************
1722 * CALCULATE INTERACTIONS *
1723 **************************/
1725 r12
= _mm_mul_ps(rsq12
,rinv12
);
1727 /* Calculate table index by multiplying r with table scale and truncate to integer */
1728 rt
= _mm_mul_ps(r12
,vftabscale
);
1729 vfitab
= _mm_cvttps_epi32(rt
);
1731 vfeps
= _mm_frcz_ps(rt
);
1733 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1735 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1736 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1738 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1739 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1740 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1741 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1742 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1743 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1744 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1745 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1746 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
1750 /* Update vectorial force */
1751 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
1752 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
1753 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
1755 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
1756 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
1757 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
1759 /**************************
1760 * CALCULATE INTERACTIONS *
1761 **************************/
1763 r13
= _mm_mul_ps(rsq13
,rinv13
);
1765 /* Calculate table index by multiplying r with table scale and truncate to integer */
1766 rt
= _mm_mul_ps(r13
,vftabscale
);
1767 vfitab
= _mm_cvttps_epi32(rt
);
1769 vfeps
= _mm_frcz_ps(rt
);
1771 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1773 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1774 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1776 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1777 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1778 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1779 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1780 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1781 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1782 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1783 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1784 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
1788 /* Update vectorial force */
1789 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
1790 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
1791 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
1793 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
1794 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
1795 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
1797 /**************************
1798 * CALCULATE INTERACTIONS *
1799 **************************/
1801 r21
= _mm_mul_ps(rsq21
,rinv21
);
1803 /* Calculate table index by multiplying r with table scale and truncate to integer */
1804 rt
= _mm_mul_ps(r21
,vftabscale
);
1805 vfitab
= _mm_cvttps_epi32(rt
);
1807 vfeps
= _mm_frcz_ps(rt
);
1809 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1811 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1812 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1814 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1815 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1816 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1817 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1818 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1819 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1820 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1821 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1822 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1826 /* Update vectorial force */
1827 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
1828 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
1829 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
1831 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
1832 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
1833 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 r22
= _mm_mul_ps(rsq22
,rinv22
);
1841 /* Calculate table index by multiplying r with table scale and truncate to integer */
1842 rt
= _mm_mul_ps(r22
,vftabscale
);
1843 vfitab
= _mm_cvttps_epi32(rt
);
1845 vfeps
= _mm_frcz_ps(rt
);
1847 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1849 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1850 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1852 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1853 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1854 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1855 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1856 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1857 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1858 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1859 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1860 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1864 /* Update vectorial force */
1865 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
1866 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
1867 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
1869 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
1870 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
1871 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 r23
= _mm_mul_ps(rsq23
,rinv23
);
1879 /* Calculate table index by multiplying r with table scale and truncate to integer */
1880 rt
= _mm_mul_ps(r23
,vftabscale
);
1881 vfitab
= _mm_cvttps_epi32(rt
);
1883 vfeps
= _mm_frcz_ps(rt
);
1885 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1887 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1888 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1890 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1891 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1892 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1893 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1894 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1895 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1896 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1897 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1898 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
1902 /* Update vectorial force */
1903 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
1904 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
1905 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
1907 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
1908 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
1909 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 r31
= _mm_mul_ps(rsq31
,rinv31
);
1917 /* Calculate table index by multiplying r with table scale and truncate to integer */
1918 rt
= _mm_mul_ps(r31
,vftabscale
);
1919 vfitab
= _mm_cvttps_epi32(rt
);
1921 vfeps
= _mm_frcz_ps(rt
);
1923 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1925 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1926 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1928 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1929 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1930 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1931 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1932 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1933 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1934 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1935 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1936 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
1940 /* Update vectorial force */
1941 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
1942 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
1943 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
1945 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
1946 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
1947 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r32
= _mm_mul_ps(rsq32
,rinv32
);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt
= _mm_mul_ps(r32
,vftabscale
);
1957 vfitab
= _mm_cvttps_epi32(rt
);
1959 vfeps
= _mm_frcz_ps(rt
);
1961 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
1963 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
1964 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1966 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1967 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
1968 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
1969 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
1970 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
1971 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1972 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
1973 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
1974 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
1978 /* Update vectorial force */
1979 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
1980 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
1981 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
1983 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
1984 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
1985 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r33
= _mm_mul_ps(rsq33
,rinv33
);
1993 /* Calculate table index by multiplying r with table scale and truncate to integer */
1994 rt
= _mm_mul_ps(r33
,vftabscale
);
1995 vfitab
= _mm_cvttps_epi32(rt
);
1997 vfeps
= _mm_frcz_ps(rt
);
1999 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2001 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2002 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2006 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2007 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2008 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2009 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2010 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2011 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2012 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
2016 /* Update vectorial force */
2017 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
2018 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
2019 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
2021 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
2022 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
2023 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
2025 fjptrA
= f
+j_coord_offsetA
;
2026 fjptrB
= f
+j_coord_offsetB
;
2027 fjptrC
= f
+j_coord_offsetC
;
2028 fjptrD
= f
+j_coord_offsetD
;
2030 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2031 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2032 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2034 /* Inner loop uses 432 flops */
2037 if(jidx
<j_index_end
)
2040 /* Get j neighbor index, and coordinate index */
2041 jnrlistA
= jjnr
[jidx
];
2042 jnrlistB
= jjnr
[jidx
+1];
2043 jnrlistC
= jjnr
[jidx
+2];
2044 jnrlistD
= jjnr
[jidx
+3];
2045 /* Sign of each element will be negative for non-real atoms.
2046 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2047 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2049 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
2050 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
2051 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
2052 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2053 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2054 j_coord_offsetA
= DIM
*jnrA
;
2055 j_coord_offsetB
= DIM
*jnrB
;
2056 j_coord_offsetC
= DIM
*jnrC
;
2057 j_coord_offsetD
= DIM
*jnrD
;
2059 /* load j atom coordinates */
2060 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2061 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2062 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2063 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2065 /* Calculate displacement vector */
2066 dx00
= _mm_sub_ps(ix0
,jx0
);
2067 dy00
= _mm_sub_ps(iy0
,jy0
);
2068 dz00
= _mm_sub_ps(iz0
,jz0
);
2069 dx11
= _mm_sub_ps(ix1
,jx1
);
2070 dy11
= _mm_sub_ps(iy1
,jy1
);
2071 dz11
= _mm_sub_ps(iz1
,jz1
);
2072 dx12
= _mm_sub_ps(ix1
,jx2
);
2073 dy12
= _mm_sub_ps(iy1
,jy2
);
2074 dz12
= _mm_sub_ps(iz1
,jz2
);
2075 dx13
= _mm_sub_ps(ix1
,jx3
);
2076 dy13
= _mm_sub_ps(iy1
,jy3
);
2077 dz13
= _mm_sub_ps(iz1
,jz3
);
2078 dx21
= _mm_sub_ps(ix2
,jx1
);
2079 dy21
= _mm_sub_ps(iy2
,jy1
);
2080 dz21
= _mm_sub_ps(iz2
,jz1
);
2081 dx22
= _mm_sub_ps(ix2
,jx2
);
2082 dy22
= _mm_sub_ps(iy2
,jy2
);
2083 dz22
= _mm_sub_ps(iz2
,jz2
);
2084 dx23
= _mm_sub_ps(ix2
,jx3
);
2085 dy23
= _mm_sub_ps(iy2
,jy3
);
2086 dz23
= _mm_sub_ps(iz2
,jz3
);
2087 dx31
= _mm_sub_ps(ix3
,jx1
);
2088 dy31
= _mm_sub_ps(iy3
,jy1
);
2089 dz31
= _mm_sub_ps(iz3
,jz1
);
2090 dx32
= _mm_sub_ps(ix3
,jx2
);
2091 dy32
= _mm_sub_ps(iy3
,jy2
);
2092 dz32
= _mm_sub_ps(iz3
,jz2
);
2093 dx33
= _mm_sub_ps(ix3
,jx3
);
2094 dy33
= _mm_sub_ps(iy3
,jy3
);
2095 dz33
= _mm_sub_ps(iz3
,jz3
);
2097 /* Calculate squared distance and things based on it */
2098 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
2099 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
2100 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
2101 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
2102 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
2103 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2104 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
2105 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
2106 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
2107 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
2109 rinv00
= avx128fma_invsqrt_f(rsq00
);
2110 rinv11
= avx128fma_invsqrt_f(rsq11
);
2111 rinv12
= avx128fma_invsqrt_f(rsq12
);
2112 rinv13
= avx128fma_invsqrt_f(rsq13
);
2113 rinv21
= avx128fma_invsqrt_f(rsq21
);
2114 rinv22
= avx128fma_invsqrt_f(rsq22
);
2115 rinv23
= avx128fma_invsqrt_f(rsq23
);
2116 rinv31
= avx128fma_invsqrt_f(rsq31
);
2117 rinv32
= avx128fma_invsqrt_f(rsq32
);
2118 rinv33
= avx128fma_invsqrt_f(rsq33
);
2120 fjx0
= _mm_setzero_ps();
2121 fjy0
= _mm_setzero_ps();
2122 fjz0
= _mm_setzero_ps();
2123 fjx1
= _mm_setzero_ps();
2124 fjy1
= _mm_setzero_ps();
2125 fjz1
= _mm_setzero_ps();
2126 fjx2
= _mm_setzero_ps();
2127 fjy2
= _mm_setzero_ps();
2128 fjz2
= _mm_setzero_ps();
2129 fjx3
= _mm_setzero_ps();
2130 fjy3
= _mm_setzero_ps();
2131 fjz3
= _mm_setzero_ps();
2133 /**************************
2134 * CALCULATE INTERACTIONS *
2135 **************************/
2137 r00
= _mm_mul_ps(rsq00
,rinv00
);
2138 r00
= _mm_andnot_ps(dummy_mask
,r00
);
2140 /* Calculate table index by multiplying r with table scale and truncate to integer */
2141 rt
= _mm_mul_ps(r00
,vftabscale
);
2142 vfitab
= _mm_cvttps_epi32(rt
);
2144 vfeps
= _mm_frcz_ps(rt
);
2146 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2148 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2149 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2151 /* CUBIC SPLINE TABLE DISPERSION */
2152 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2153 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2154 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2155 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2156 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2157 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2158 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2159 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2160 fvdw6
= _mm_mul_ps(c6_00
,FF
);
2162 /* CUBIC SPLINE TABLE REPULSION */
2163 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2164 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2165 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2166 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2167 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2168 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2169 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2170 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2171 fvdw12
= _mm_mul_ps(c12_00
,FF
);
2172 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
2176 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2178 /* Update vectorial force */
2179 fix0
= _mm_macc_ps(dx00
,fscal
,fix0
);
2180 fiy0
= _mm_macc_ps(dy00
,fscal
,fiy0
);
2181 fiz0
= _mm_macc_ps(dz00
,fscal
,fiz0
);
2183 fjx0
= _mm_macc_ps(dx00
,fscal
,fjx0
);
2184 fjy0
= _mm_macc_ps(dy00
,fscal
,fjy0
);
2185 fjz0
= _mm_macc_ps(dz00
,fscal
,fjz0
);
2187 /**************************
2188 * CALCULATE INTERACTIONS *
2189 **************************/
2191 r11
= _mm_mul_ps(rsq11
,rinv11
);
2192 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2194 /* Calculate table index by multiplying r with table scale and truncate to integer */
2195 rt
= _mm_mul_ps(r11
,vftabscale
);
2196 vfitab
= _mm_cvttps_epi32(rt
);
2198 vfeps
= _mm_frcz_ps(rt
);
2200 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2202 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2203 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2206 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2207 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2208 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2209 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2210 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2211 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2212 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2213 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
2217 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2219 /* Update vectorial force */
2220 fix1
= _mm_macc_ps(dx11
,fscal
,fix1
);
2221 fiy1
= _mm_macc_ps(dy11
,fscal
,fiy1
);
2222 fiz1
= _mm_macc_ps(dz11
,fscal
,fiz1
);
2224 fjx1
= _mm_macc_ps(dx11
,fscal
,fjx1
);
2225 fjy1
= _mm_macc_ps(dy11
,fscal
,fjy1
);
2226 fjz1
= _mm_macc_ps(dz11
,fscal
,fjz1
);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 r12
= _mm_mul_ps(rsq12
,rinv12
);
2233 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2235 /* Calculate table index by multiplying r with table scale and truncate to integer */
2236 rt
= _mm_mul_ps(r12
,vftabscale
);
2237 vfitab
= _mm_cvttps_epi32(rt
);
2239 vfeps
= _mm_frcz_ps(rt
);
2241 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2243 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2244 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2246 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2247 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2248 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2249 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2250 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2251 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2252 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2253 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2254 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
2258 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2260 /* Update vectorial force */
2261 fix1
= _mm_macc_ps(dx12
,fscal
,fix1
);
2262 fiy1
= _mm_macc_ps(dy12
,fscal
,fiy1
);
2263 fiz1
= _mm_macc_ps(dz12
,fscal
,fiz1
);
2265 fjx2
= _mm_macc_ps(dx12
,fscal
,fjx2
);
2266 fjy2
= _mm_macc_ps(dy12
,fscal
,fjy2
);
2267 fjz2
= _mm_macc_ps(dz12
,fscal
,fjz2
);
2269 /**************************
2270 * CALCULATE INTERACTIONS *
2271 **************************/
2273 r13
= _mm_mul_ps(rsq13
,rinv13
);
2274 r13
= _mm_andnot_ps(dummy_mask
,r13
);
2276 /* Calculate table index by multiplying r with table scale and truncate to integer */
2277 rt
= _mm_mul_ps(r13
,vftabscale
);
2278 vfitab
= _mm_cvttps_epi32(rt
);
2280 vfeps
= _mm_frcz_ps(rt
);
2282 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2284 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2285 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2287 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2288 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2289 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2290 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2291 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2292 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2293 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2294 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2295 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
2299 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2301 /* Update vectorial force */
2302 fix1
= _mm_macc_ps(dx13
,fscal
,fix1
);
2303 fiy1
= _mm_macc_ps(dy13
,fscal
,fiy1
);
2304 fiz1
= _mm_macc_ps(dz13
,fscal
,fiz1
);
2306 fjx3
= _mm_macc_ps(dx13
,fscal
,fjx3
);
2307 fjy3
= _mm_macc_ps(dy13
,fscal
,fjy3
);
2308 fjz3
= _mm_macc_ps(dz13
,fscal
,fjz3
);
2310 /**************************
2311 * CALCULATE INTERACTIONS *
2312 **************************/
2314 r21
= _mm_mul_ps(rsq21
,rinv21
);
2315 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2317 /* Calculate table index by multiplying r with table scale and truncate to integer */
2318 rt
= _mm_mul_ps(r21
,vftabscale
);
2319 vfitab
= _mm_cvttps_epi32(rt
);
2321 vfeps
= _mm_frcz_ps(rt
);
2323 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2325 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2326 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2328 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2329 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2330 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2331 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2332 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2333 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2334 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2335 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2336 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
2340 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2342 /* Update vectorial force */
2343 fix2
= _mm_macc_ps(dx21
,fscal
,fix2
);
2344 fiy2
= _mm_macc_ps(dy21
,fscal
,fiy2
);
2345 fiz2
= _mm_macc_ps(dz21
,fscal
,fiz2
);
2347 fjx1
= _mm_macc_ps(dx21
,fscal
,fjx1
);
2348 fjy1
= _mm_macc_ps(dy21
,fscal
,fjy1
);
2349 fjz1
= _mm_macc_ps(dz21
,fscal
,fjz1
);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 r22
= _mm_mul_ps(rsq22
,rinv22
);
2356 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2358 /* Calculate table index by multiplying r with table scale and truncate to integer */
2359 rt
= _mm_mul_ps(r22
,vftabscale
);
2360 vfitab
= _mm_cvttps_epi32(rt
);
2362 vfeps
= _mm_frcz_ps(rt
);
2364 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2366 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2367 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2369 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2370 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2371 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2372 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2373 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2374 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2375 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2376 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2377 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
2381 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2383 /* Update vectorial force */
2384 fix2
= _mm_macc_ps(dx22
,fscal
,fix2
);
2385 fiy2
= _mm_macc_ps(dy22
,fscal
,fiy2
);
2386 fiz2
= _mm_macc_ps(dz22
,fscal
,fiz2
);
2388 fjx2
= _mm_macc_ps(dx22
,fscal
,fjx2
);
2389 fjy2
= _mm_macc_ps(dy22
,fscal
,fjy2
);
2390 fjz2
= _mm_macc_ps(dz22
,fscal
,fjz2
);
2392 /**************************
2393 * CALCULATE INTERACTIONS *
2394 **************************/
2396 r23
= _mm_mul_ps(rsq23
,rinv23
);
2397 r23
= _mm_andnot_ps(dummy_mask
,r23
);
2399 /* Calculate table index by multiplying r with table scale and truncate to integer */
2400 rt
= _mm_mul_ps(r23
,vftabscale
);
2401 vfitab
= _mm_cvttps_epi32(rt
);
2403 vfeps
= _mm_frcz_ps(rt
);
2405 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2407 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2408 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2410 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2411 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2412 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2413 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2414 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2415 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2416 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2417 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2418 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
2422 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2424 /* Update vectorial force */
2425 fix2
= _mm_macc_ps(dx23
,fscal
,fix2
);
2426 fiy2
= _mm_macc_ps(dy23
,fscal
,fiy2
);
2427 fiz2
= _mm_macc_ps(dz23
,fscal
,fiz2
);
2429 fjx3
= _mm_macc_ps(dx23
,fscal
,fjx3
);
2430 fjy3
= _mm_macc_ps(dy23
,fscal
,fjy3
);
2431 fjz3
= _mm_macc_ps(dz23
,fscal
,fjz3
);
2433 /**************************
2434 * CALCULATE INTERACTIONS *
2435 **************************/
2437 r31
= _mm_mul_ps(rsq31
,rinv31
);
2438 r31
= _mm_andnot_ps(dummy_mask
,r31
);
2440 /* Calculate table index by multiplying r with table scale and truncate to integer */
2441 rt
= _mm_mul_ps(r31
,vftabscale
);
2442 vfitab
= _mm_cvttps_epi32(rt
);
2444 vfeps
= _mm_frcz_ps(rt
);
2446 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2448 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2449 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2451 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2452 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2453 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2454 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2455 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2456 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2457 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2458 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2459 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
2463 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2465 /* Update vectorial force */
2466 fix3
= _mm_macc_ps(dx31
,fscal
,fix3
);
2467 fiy3
= _mm_macc_ps(dy31
,fscal
,fiy3
);
2468 fiz3
= _mm_macc_ps(dz31
,fscal
,fiz3
);
2470 fjx1
= _mm_macc_ps(dx31
,fscal
,fjx1
);
2471 fjy1
= _mm_macc_ps(dy31
,fscal
,fjy1
);
2472 fjz1
= _mm_macc_ps(dz31
,fscal
,fjz1
);
2474 /**************************
2475 * CALCULATE INTERACTIONS *
2476 **************************/
2478 r32
= _mm_mul_ps(rsq32
,rinv32
);
2479 r32
= _mm_andnot_ps(dummy_mask
,r32
);
2481 /* Calculate table index by multiplying r with table scale and truncate to integer */
2482 rt
= _mm_mul_ps(r32
,vftabscale
);
2483 vfitab
= _mm_cvttps_epi32(rt
);
2485 vfeps
= _mm_frcz_ps(rt
);
2487 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2489 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2490 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2492 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2493 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2494 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2495 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2496 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2497 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2498 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2499 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2500 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
2504 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2506 /* Update vectorial force */
2507 fix3
= _mm_macc_ps(dx32
,fscal
,fix3
);
2508 fiy3
= _mm_macc_ps(dy32
,fscal
,fiy3
);
2509 fiz3
= _mm_macc_ps(dz32
,fscal
,fiz3
);
2511 fjx2
= _mm_macc_ps(dx32
,fscal
,fjx2
);
2512 fjy2
= _mm_macc_ps(dy32
,fscal
,fjy2
);
2513 fjz2
= _mm_macc_ps(dz32
,fscal
,fjz2
);
2515 /**************************
2516 * CALCULATE INTERACTIONS *
2517 **************************/
2519 r33
= _mm_mul_ps(rsq33
,rinv33
);
2520 r33
= _mm_andnot_ps(dummy_mask
,r33
);
2522 /* Calculate table index by multiplying r with table scale and truncate to integer */
2523 rt
= _mm_mul_ps(r33
,vftabscale
);
2524 vfitab
= _mm_cvttps_epi32(rt
);
2526 vfeps
= _mm_frcz_ps(rt
);
2528 vfeps
= _mm_sub_ps(rt
,_mm_round_ps(rt
, _MM_FROUND_FLOOR
));
2530 twovfeps
= _mm_add_ps(vfeps
,vfeps
);
2531 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2533 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2534 Y
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,0) );
2535 F
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,1) );
2536 G
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,2) );
2537 H
= _mm_load_ps( vftab
+ _mm_extract_epi32(vfitab
,3) );
2538 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2539 Fp
= _mm_macc_ps(vfeps
,_mm_macc_ps(H
,vfeps
,G
),F
);
2540 FF
= _mm_macc_ps(vfeps
,_mm_macc_ps(twovfeps
,H
,G
),Fp
);
2541 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
2545 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2547 /* Update vectorial force */
2548 fix3
= _mm_macc_ps(dx33
,fscal
,fix3
);
2549 fiy3
= _mm_macc_ps(dy33
,fscal
,fiy3
);
2550 fiz3
= _mm_macc_ps(dz33
,fscal
,fiz3
);
2552 fjx3
= _mm_macc_ps(dx33
,fscal
,fjx3
);
2553 fjy3
= _mm_macc_ps(dy33
,fscal
,fjy3
);
2554 fjz3
= _mm_macc_ps(dz33
,fscal
,fjz3
);
2556 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2557 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2558 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2559 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2561 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2562 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2563 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2565 /* Inner loop uses 442 flops */
2568 /* End of innermost loop */
2570 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2571 f
+i_coord_offset
,fshift
+i_shift_offset
);
2573 /* Increment number of inner iterations */
2574 inneriter
+= j_index_end
- j_index_start
;
2576 /* Outer loop uses 24 flops */
2579 /* Increment number of outer iterations */
2582 /* Update outer/inner flops */
2584 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*442);