2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
 * Electrostatics interaction: CubicSplineTable
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
59 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
97 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
98 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
99 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
100 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
103 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
108 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
111 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
112 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
114 __m128i ifour
= _mm_set1_epi32(4);
115 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
117 __m128 dummy_mask
,cutoff_mask
;
118 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one
= _mm_set1_ps(1.0);
120 __m128 two
= _mm_set1_ps(2.0);
126 jindex
= nlist
->jindex
;
128 shiftidx
= nlist
->shift
;
130 shiftvec
= fr
->shift_vec
[0];
131 fshift
= fr
->fshift
[0];
132 facel
= _mm_set1_ps(fr
->epsfac
);
133 charge
= mdatoms
->chargeA
;
134 nvdwtype
= fr
->ntype
;
136 vdwtype
= mdatoms
->typeA
;
138 vftab
= kernel_data
->table_elec_vdw
->data
;
139 vftabscale
= _mm_set1_ps(kernel_data
->table_elec_vdw
->scale
);
141 /* Setup water-specific parameters */
142 inr
= nlist
->iinr
[0];
143 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
144 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
145 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
146 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
148 jq0
= _mm_set1_ps(charge
[inr
+0]);
149 jq1
= _mm_set1_ps(charge
[inr
+1]);
150 jq2
= _mm_set1_ps(charge
[inr
+2]);
151 vdwjidx0A
= 2*vdwtype
[inr
+0];
152 qq00
= _mm_mul_ps(iq0
,jq0
);
153 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
154 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
155 qq01
= _mm_mul_ps(iq0
,jq1
);
156 qq02
= _mm_mul_ps(iq0
,jq2
);
157 qq10
= _mm_mul_ps(iq1
,jq0
);
158 qq11
= _mm_mul_ps(iq1
,jq1
);
159 qq12
= _mm_mul_ps(iq1
,jq2
);
160 qq20
= _mm_mul_ps(iq2
,jq0
);
161 qq21
= _mm_mul_ps(iq2
,jq1
);
162 qq22
= _mm_mul_ps(iq2
,jq2
);
164 /* Avoid stupid compiler warnings */
165 jnrA
= jnrB
= jnrC
= jnrD
= 0;
174 for(iidx
=0;iidx
<4*DIM
;iidx
++)
179 /* Start outer loop over neighborlists */
180 for(iidx
=0; iidx
<nri
; iidx
++)
182 /* Load shift vector for this list */
183 i_shift_offset
= DIM
*shiftidx
[iidx
];
185 /* Load limits for loop over neighbors */
186 j_index_start
= jindex
[iidx
];
187 j_index_end
= jindex
[iidx
+1];
189 /* Get outer coordinate index */
191 i_coord_offset
= DIM
*inr
;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
195 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
197 fix0
= _mm_setzero_ps();
198 fiy0
= _mm_setzero_ps();
199 fiz0
= _mm_setzero_ps();
200 fix1
= _mm_setzero_ps();
201 fiy1
= _mm_setzero_ps();
202 fiz1
= _mm_setzero_ps();
203 fix2
= _mm_setzero_ps();
204 fiy2
= _mm_setzero_ps();
205 fiz2
= _mm_setzero_ps();
207 /* Reset potential sums */
208 velecsum
= _mm_setzero_ps();
209 vvdwsum
= _mm_setzero_ps();
211 /* Start inner kernel loop */
212 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
215 /* Get j neighbor index, and coordinate index */
220 j_coord_offsetA
= DIM
*jnrA
;
221 j_coord_offsetB
= DIM
*jnrB
;
222 j_coord_offsetC
= DIM
*jnrC
;
223 j_coord_offsetD
= DIM
*jnrD
;
225 /* load j atom coordinates */
226 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
227 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
228 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
230 /* Calculate displacement vector */
231 dx00
= _mm_sub_ps(ix0
,jx0
);
232 dy00
= _mm_sub_ps(iy0
,jy0
);
233 dz00
= _mm_sub_ps(iz0
,jz0
);
234 dx01
= _mm_sub_ps(ix0
,jx1
);
235 dy01
= _mm_sub_ps(iy0
,jy1
);
236 dz01
= _mm_sub_ps(iz0
,jz1
);
237 dx02
= _mm_sub_ps(ix0
,jx2
);
238 dy02
= _mm_sub_ps(iy0
,jy2
);
239 dz02
= _mm_sub_ps(iz0
,jz2
);
240 dx10
= _mm_sub_ps(ix1
,jx0
);
241 dy10
= _mm_sub_ps(iy1
,jy0
);
242 dz10
= _mm_sub_ps(iz1
,jz0
);
243 dx11
= _mm_sub_ps(ix1
,jx1
);
244 dy11
= _mm_sub_ps(iy1
,jy1
);
245 dz11
= _mm_sub_ps(iz1
,jz1
);
246 dx12
= _mm_sub_ps(ix1
,jx2
);
247 dy12
= _mm_sub_ps(iy1
,jy2
);
248 dz12
= _mm_sub_ps(iz1
,jz2
);
249 dx20
= _mm_sub_ps(ix2
,jx0
);
250 dy20
= _mm_sub_ps(iy2
,jy0
);
251 dz20
= _mm_sub_ps(iz2
,jz0
);
252 dx21
= _mm_sub_ps(ix2
,jx1
);
253 dy21
= _mm_sub_ps(iy2
,jy1
);
254 dz21
= _mm_sub_ps(iz2
,jz1
);
255 dx22
= _mm_sub_ps(ix2
,jx2
);
256 dy22
= _mm_sub_ps(iy2
,jy2
);
257 dz22
= _mm_sub_ps(iz2
,jz2
);
259 /* Calculate squared distance and things based on it */
260 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
261 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
262 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
263 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
264 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
265 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
266 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
267 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
268 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
270 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
271 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
272 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
273 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
274 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
275 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
276 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
277 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
278 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
280 fjx0
= _mm_setzero_ps();
281 fjy0
= _mm_setzero_ps();
282 fjz0
= _mm_setzero_ps();
283 fjx1
= _mm_setzero_ps();
284 fjy1
= _mm_setzero_ps();
285 fjz1
= _mm_setzero_ps();
286 fjx2
= _mm_setzero_ps();
287 fjy2
= _mm_setzero_ps();
288 fjz2
= _mm_setzero_ps();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
294 r00
= _mm_mul_ps(rsq00
,rinv00
);
296 /* Calculate table index by multiplying r with table scale and truncate to integer */
297 rt
= _mm_mul_ps(r00
,vftabscale
);
298 vfitab
= _mm_cvttps_epi32(rt
);
299 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
300 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
302 /* CUBIC SPLINE TABLE ELECTROSTATICS */
303 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
304 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
305 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
306 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
307 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
308 Heps
= _mm_mul_ps(vfeps
,H
);
309 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
310 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
311 velec
= _mm_mul_ps(qq00
,VV
);
312 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
313 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
315 /* CUBIC SPLINE TABLE DISPERSION */
316 vfitab
= _mm_add_epi32(vfitab
,ifour
);
317 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
318 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
319 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
320 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
321 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
322 Heps
= _mm_mul_ps(vfeps
,H
);
323 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
324 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
325 vvdw6
= _mm_mul_ps(c6_00
,VV
);
326 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
327 fvdw6
= _mm_mul_ps(c6_00
,FF
);
329 /* CUBIC SPLINE TABLE REPULSION */
330 vfitab
= _mm_add_epi32(vfitab
,ifour
);
331 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
332 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
333 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
334 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
335 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
336 Heps
= _mm_mul_ps(vfeps
,H
);
337 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
338 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
339 vvdw12
= _mm_mul_ps(c12_00
,VV
);
340 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
341 fvdw12
= _mm_mul_ps(c12_00
,FF
);
342 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
343 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velecsum
= _mm_add_ps(velecsum
,velec
);
347 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
349 fscal
= _mm_add_ps(felec
,fvdw
);
351 /* Calculate temporary vectorial force */
352 tx
= _mm_mul_ps(fscal
,dx00
);
353 ty
= _mm_mul_ps(fscal
,dy00
);
354 tz
= _mm_mul_ps(fscal
,dz00
);
356 /* Update vectorial force */
357 fix0
= _mm_add_ps(fix0
,tx
);
358 fiy0
= _mm_add_ps(fiy0
,ty
);
359 fiz0
= _mm_add_ps(fiz0
,tz
);
361 fjx0
= _mm_add_ps(fjx0
,tx
);
362 fjy0
= _mm_add_ps(fjy0
,ty
);
363 fjz0
= _mm_add_ps(fjz0
,tz
);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 r01
= _mm_mul_ps(rsq01
,rinv01
);
371 /* Calculate table index by multiplying r with table scale and truncate to integer */
372 rt
= _mm_mul_ps(r01
,vftabscale
);
373 vfitab
= _mm_cvttps_epi32(rt
);
374 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
375 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
377 /* CUBIC SPLINE TABLE ELECTROSTATICS */
378 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
379 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
380 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
381 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
382 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
383 Heps
= _mm_mul_ps(vfeps
,H
);
384 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
385 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
386 velec
= _mm_mul_ps(qq01
,VV
);
387 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
388 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velecsum
= _mm_add_ps(velecsum
,velec
);
395 /* Calculate temporary vectorial force */
396 tx
= _mm_mul_ps(fscal
,dx01
);
397 ty
= _mm_mul_ps(fscal
,dy01
);
398 tz
= _mm_mul_ps(fscal
,dz01
);
400 /* Update vectorial force */
401 fix0
= _mm_add_ps(fix0
,tx
);
402 fiy0
= _mm_add_ps(fiy0
,ty
);
403 fiz0
= _mm_add_ps(fiz0
,tz
);
405 fjx1
= _mm_add_ps(fjx1
,tx
);
406 fjy1
= _mm_add_ps(fjy1
,ty
);
407 fjz1
= _mm_add_ps(fjz1
,tz
);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r02
= _mm_mul_ps(rsq02
,rinv02
);
415 /* Calculate table index by multiplying r with table scale and truncate to integer */
416 rt
= _mm_mul_ps(r02
,vftabscale
);
417 vfitab
= _mm_cvttps_epi32(rt
);
418 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
419 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
421 /* CUBIC SPLINE TABLE ELECTROSTATICS */
422 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
423 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
424 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
425 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
426 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
427 Heps
= _mm_mul_ps(vfeps
,H
);
428 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
429 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
430 velec
= _mm_mul_ps(qq02
,VV
);
431 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
432 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velecsum
= _mm_add_ps(velecsum
,velec
);
439 /* Calculate temporary vectorial force */
440 tx
= _mm_mul_ps(fscal
,dx02
);
441 ty
= _mm_mul_ps(fscal
,dy02
);
442 tz
= _mm_mul_ps(fscal
,dz02
);
444 /* Update vectorial force */
445 fix0
= _mm_add_ps(fix0
,tx
);
446 fiy0
= _mm_add_ps(fiy0
,ty
);
447 fiz0
= _mm_add_ps(fiz0
,tz
);
449 fjx2
= _mm_add_ps(fjx2
,tx
);
450 fjy2
= _mm_add_ps(fjy2
,ty
);
451 fjz2
= _mm_add_ps(fjz2
,tz
);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 r10
= _mm_mul_ps(rsq10
,rinv10
);
459 /* Calculate table index by multiplying r with table scale and truncate to integer */
460 rt
= _mm_mul_ps(r10
,vftabscale
);
461 vfitab
= _mm_cvttps_epi32(rt
);
462 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
463 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
465 /* CUBIC SPLINE TABLE ELECTROSTATICS */
466 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
467 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
468 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
469 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
470 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
471 Heps
= _mm_mul_ps(vfeps
,H
);
472 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
473 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
474 velec
= _mm_mul_ps(qq10
,VV
);
475 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
476 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velecsum
= _mm_add_ps(velecsum
,velec
);
483 /* Calculate temporary vectorial force */
484 tx
= _mm_mul_ps(fscal
,dx10
);
485 ty
= _mm_mul_ps(fscal
,dy10
);
486 tz
= _mm_mul_ps(fscal
,dz10
);
488 /* Update vectorial force */
489 fix1
= _mm_add_ps(fix1
,tx
);
490 fiy1
= _mm_add_ps(fiy1
,ty
);
491 fiz1
= _mm_add_ps(fiz1
,tz
);
493 fjx0
= _mm_add_ps(fjx0
,tx
);
494 fjy0
= _mm_add_ps(fjy0
,ty
);
495 fjz0
= _mm_add_ps(fjz0
,tz
);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 r11
= _mm_mul_ps(rsq11
,rinv11
);
503 /* Calculate table index by multiplying r with table scale and truncate to integer */
504 rt
= _mm_mul_ps(r11
,vftabscale
);
505 vfitab
= _mm_cvttps_epi32(rt
);
506 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
507 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
509 /* CUBIC SPLINE TABLE ELECTROSTATICS */
510 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
511 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
512 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
513 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
514 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
515 Heps
= _mm_mul_ps(vfeps
,H
);
516 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
517 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
518 velec
= _mm_mul_ps(qq11
,VV
);
519 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
520 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum
= _mm_add_ps(velecsum
,velec
);
527 /* Calculate temporary vectorial force */
528 tx
= _mm_mul_ps(fscal
,dx11
);
529 ty
= _mm_mul_ps(fscal
,dy11
);
530 tz
= _mm_mul_ps(fscal
,dz11
);
532 /* Update vectorial force */
533 fix1
= _mm_add_ps(fix1
,tx
);
534 fiy1
= _mm_add_ps(fiy1
,ty
);
535 fiz1
= _mm_add_ps(fiz1
,tz
);
537 fjx1
= _mm_add_ps(fjx1
,tx
);
538 fjy1
= _mm_add_ps(fjy1
,ty
);
539 fjz1
= _mm_add_ps(fjz1
,tz
);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 r12
= _mm_mul_ps(rsq12
,rinv12
);
547 /* Calculate table index by multiplying r with table scale and truncate to integer */
548 rt
= _mm_mul_ps(r12
,vftabscale
);
549 vfitab
= _mm_cvttps_epi32(rt
);
550 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
551 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
553 /* CUBIC SPLINE TABLE ELECTROSTATICS */
554 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
555 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
556 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
557 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
558 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
559 Heps
= _mm_mul_ps(vfeps
,H
);
560 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
561 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
562 velec
= _mm_mul_ps(qq12
,VV
);
563 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
564 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
566 /* Update potential sum for this i atom from the interaction with this j atom. */
567 velecsum
= _mm_add_ps(velecsum
,velec
);
571 /* Calculate temporary vectorial force */
572 tx
= _mm_mul_ps(fscal
,dx12
);
573 ty
= _mm_mul_ps(fscal
,dy12
);
574 tz
= _mm_mul_ps(fscal
,dz12
);
576 /* Update vectorial force */
577 fix1
= _mm_add_ps(fix1
,tx
);
578 fiy1
= _mm_add_ps(fiy1
,ty
);
579 fiz1
= _mm_add_ps(fiz1
,tz
);
581 fjx2
= _mm_add_ps(fjx2
,tx
);
582 fjy2
= _mm_add_ps(fjy2
,ty
);
583 fjz2
= _mm_add_ps(fjz2
,tz
);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 r20
= _mm_mul_ps(rsq20
,rinv20
);
591 /* Calculate table index by multiplying r with table scale and truncate to integer */
592 rt
= _mm_mul_ps(r20
,vftabscale
);
593 vfitab
= _mm_cvttps_epi32(rt
);
594 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
595 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
597 /* CUBIC SPLINE TABLE ELECTROSTATICS */
598 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
599 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
600 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
601 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
602 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
603 Heps
= _mm_mul_ps(vfeps
,H
);
604 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
605 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
606 velec
= _mm_mul_ps(qq20
,VV
);
607 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
608 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velecsum
= _mm_add_ps(velecsum
,velec
);
615 /* Calculate temporary vectorial force */
616 tx
= _mm_mul_ps(fscal
,dx20
);
617 ty
= _mm_mul_ps(fscal
,dy20
);
618 tz
= _mm_mul_ps(fscal
,dz20
);
620 /* Update vectorial force */
621 fix2
= _mm_add_ps(fix2
,tx
);
622 fiy2
= _mm_add_ps(fiy2
,ty
);
623 fiz2
= _mm_add_ps(fiz2
,tz
);
625 fjx0
= _mm_add_ps(fjx0
,tx
);
626 fjy0
= _mm_add_ps(fjy0
,ty
);
627 fjz0
= _mm_add_ps(fjz0
,tz
);
629 /**************************
630 * CALCULATE INTERACTIONS *
631 **************************/
633 r21
= _mm_mul_ps(rsq21
,rinv21
);
635 /* Calculate table index by multiplying r with table scale and truncate to integer */
636 rt
= _mm_mul_ps(r21
,vftabscale
);
637 vfitab
= _mm_cvttps_epi32(rt
);
638 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
639 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
641 /* CUBIC SPLINE TABLE ELECTROSTATICS */
642 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
643 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
644 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
645 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
646 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
647 Heps
= _mm_mul_ps(vfeps
,H
);
648 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
649 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
650 velec
= _mm_mul_ps(qq21
,VV
);
651 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
652 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velecsum
= _mm_add_ps(velecsum
,velec
);
659 /* Calculate temporary vectorial force */
660 tx
= _mm_mul_ps(fscal
,dx21
);
661 ty
= _mm_mul_ps(fscal
,dy21
);
662 tz
= _mm_mul_ps(fscal
,dz21
);
664 /* Update vectorial force */
665 fix2
= _mm_add_ps(fix2
,tx
);
666 fiy2
= _mm_add_ps(fiy2
,ty
);
667 fiz2
= _mm_add_ps(fiz2
,tz
);
669 fjx1
= _mm_add_ps(fjx1
,tx
);
670 fjy1
= _mm_add_ps(fjy1
,ty
);
671 fjz1
= _mm_add_ps(fjz1
,tz
);
673 /**************************
674 * CALCULATE INTERACTIONS *
675 **************************/
677 r22
= _mm_mul_ps(rsq22
,rinv22
);
679 /* Calculate table index by multiplying r with table scale and truncate to integer */
680 rt
= _mm_mul_ps(r22
,vftabscale
);
681 vfitab
= _mm_cvttps_epi32(rt
);
682 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
683 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
685 /* CUBIC SPLINE TABLE ELECTROSTATICS */
686 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
687 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
688 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
689 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
690 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
691 Heps
= _mm_mul_ps(vfeps
,H
);
692 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
693 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
694 velec
= _mm_mul_ps(qq22
,VV
);
695 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
696 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
698 /* Update potential sum for this i atom from the interaction with this j atom. */
699 velecsum
= _mm_add_ps(velecsum
,velec
);
703 /* Calculate temporary vectorial force */
704 tx
= _mm_mul_ps(fscal
,dx22
);
705 ty
= _mm_mul_ps(fscal
,dy22
);
706 tz
= _mm_mul_ps(fscal
,dz22
);
708 /* Update vectorial force */
709 fix2
= _mm_add_ps(fix2
,tx
);
710 fiy2
= _mm_add_ps(fiy2
,ty
);
711 fiz2
= _mm_add_ps(fiz2
,tz
);
713 fjx2
= _mm_add_ps(fjx2
,tx
);
714 fjy2
= _mm_add_ps(fjy2
,ty
);
715 fjz2
= _mm_add_ps(fjz2
,tz
);
717 fjptrA
= f
+j_coord_offsetA
;
718 fjptrB
= f
+j_coord_offsetB
;
719 fjptrC
= f
+j_coord_offsetC
;
720 fjptrD
= f
+j_coord_offsetD
;
722 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
723 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
725 /* Inner loop uses 417 flops */
731 /* Get j neighbor index, and coordinate index */
732 jnrlistA
= jjnr
[jidx
];
733 jnrlistB
= jjnr
[jidx
+1];
734 jnrlistC
= jjnr
[jidx
+2];
735 jnrlistD
= jjnr
[jidx
+3];
736 /* Sign of each element will be negative for non-real atoms.
737 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
738 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
740 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
741 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
742 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
743 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
744 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
745 j_coord_offsetA
= DIM
*jnrA
;
746 j_coord_offsetB
= DIM
*jnrB
;
747 j_coord_offsetC
= DIM
*jnrC
;
748 j_coord_offsetD
= DIM
*jnrD
;
750 /* load j atom coordinates */
751 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
752 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
753 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
755 /* Calculate displacement vector */
756 dx00
= _mm_sub_ps(ix0
,jx0
);
757 dy00
= _mm_sub_ps(iy0
,jy0
);
758 dz00
= _mm_sub_ps(iz0
,jz0
);
759 dx01
= _mm_sub_ps(ix0
,jx1
);
760 dy01
= _mm_sub_ps(iy0
,jy1
);
761 dz01
= _mm_sub_ps(iz0
,jz1
);
762 dx02
= _mm_sub_ps(ix0
,jx2
);
763 dy02
= _mm_sub_ps(iy0
,jy2
);
764 dz02
= _mm_sub_ps(iz0
,jz2
);
765 dx10
= _mm_sub_ps(ix1
,jx0
);
766 dy10
= _mm_sub_ps(iy1
,jy0
);
767 dz10
= _mm_sub_ps(iz1
,jz0
);
768 dx11
= _mm_sub_ps(ix1
,jx1
);
769 dy11
= _mm_sub_ps(iy1
,jy1
);
770 dz11
= _mm_sub_ps(iz1
,jz1
);
771 dx12
= _mm_sub_ps(ix1
,jx2
);
772 dy12
= _mm_sub_ps(iy1
,jy2
);
773 dz12
= _mm_sub_ps(iz1
,jz2
);
774 dx20
= _mm_sub_ps(ix2
,jx0
);
775 dy20
= _mm_sub_ps(iy2
,jy0
);
776 dz20
= _mm_sub_ps(iz2
,jz0
);
777 dx21
= _mm_sub_ps(ix2
,jx1
);
778 dy21
= _mm_sub_ps(iy2
,jy1
);
779 dz21
= _mm_sub_ps(iz2
,jz1
);
780 dx22
= _mm_sub_ps(ix2
,jx2
);
781 dy22
= _mm_sub_ps(iy2
,jy2
);
782 dz22
= _mm_sub_ps(iz2
,jz2
);
784 /* Calculate squared distance and things based on it */
785 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
786 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
787 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
788 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
789 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
790 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
791 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
792 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
793 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
795 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
796 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
797 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
798 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
799 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
800 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
801 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
802 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
803 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
805 fjx0
= _mm_setzero_ps();
806 fjy0
= _mm_setzero_ps();
807 fjz0
= _mm_setzero_ps();
808 fjx1
= _mm_setzero_ps();
809 fjy1
= _mm_setzero_ps();
810 fjz1
= _mm_setzero_ps();
811 fjx2
= _mm_setzero_ps();
812 fjy2
= _mm_setzero_ps();
813 fjz2
= _mm_setzero_ps();
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 r00
= _mm_mul_ps(rsq00
,rinv00
);
820 r00
= _mm_andnot_ps(dummy_mask
,r00
);
822 /* Calculate table index by multiplying r with table scale and truncate to integer */
823 rt
= _mm_mul_ps(r00
,vftabscale
);
824 vfitab
= _mm_cvttps_epi32(rt
);
825 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
826 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
828 /* CUBIC SPLINE TABLE ELECTROSTATICS */
829 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
830 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
831 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
832 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
833 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
834 Heps
= _mm_mul_ps(vfeps
,H
);
835 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
836 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
837 velec
= _mm_mul_ps(qq00
,VV
);
838 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
839 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
841 /* CUBIC SPLINE TABLE DISPERSION */
842 vfitab
= _mm_add_epi32(vfitab
,ifour
);
843 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
844 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
845 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
846 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
847 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
848 Heps
= _mm_mul_ps(vfeps
,H
);
849 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
850 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
851 vvdw6
= _mm_mul_ps(c6_00
,VV
);
852 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
853 fvdw6
= _mm_mul_ps(c6_00
,FF
);
855 /* CUBIC SPLINE TABLE REPULSION */
856 vfitab
= _mm_add_epi32(vfitab
,ifour
);
857 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
858 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
859 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
860 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
861 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
862 Heps
= _mm_mul_ps(vfeps
,H
);
863 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
864 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
865 vvdw12
= _mm_mul_ps(c12_00
,VV
);
866 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
867 fvdw12
= _mm_mul_ps(c12_00
,FF
);
868 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
869 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
871 /* Update potential sum for this i atom from the interaction with this j atom. */
872 velec
= _mm_andnot_ps(dummy_mask
,velec
);
873 velecsum
= _mm_add_ps(velecsum
,velec
);
874 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
875 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
877 fscal
= _mm_add_ps(felec
,fvdw
);
879 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
881 /* Calculate temporary vectorial force */
882 tx
= _mm_mul_ps(fscal
,dx00
);
883 ty
= _mm_mul_ps(fscal
,dy00
);
884 tz
= _mm_mul_ps(fscal
,dz00
);
886 /* Update vectorial force */
887 fix0
= _mm_add_ps(fix0
,tx
);
888 fiy0
= _mm_add_ps(fiy0
,ty
);
889 fiz0
= _mm_add_ps(fiz0
,tz
);
891 fjx0
= _mm_add_ps(fjx0
,tx
);
892 fjy0
= _mm_add_ps(fjy0
,ty
);
893 fjz0
= _mm_add_ps(fjz0
,tz
);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r01
= _mm_mul_ps(rsq01
,rinv01
);
900 r01
= _mm_andnot_ps(dummy_mask
,r01
);
902 /* Calculate table index by multiplying r with table scale and truncate to integer */
903 rt
= _mm_mul_ps(r01
,vftabscale
);
904 vfitab
= _mm_cvttps_epi32(rt
);
905 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
906 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
908 /* CUBIC SPLINE TABLE ELECTROSTATICS */
909 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
910 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
911 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
912 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
913 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
914 Heps
= _mm_mul_ps(vfeps
,H
);
915 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
916 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
917 velec
= _mm_mul_ps(qq01
,VV
);
918 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
919 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec
= _mm_andnot_ps(dummy_mask
,velec
);
923 velecsum
= _mm_add_ps(velecsum
,velec
);
927 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
929 /* Calculate temporary vectorial force */
930 tx
= _mm_mul_ps(fscal
,dx01
);
931 ty
= _mm_mul_ps(fscal
,dy01
);
932 tz
= _mm_mul_ps(fscal
,dz01
);
934 /* Update vectorial force */
935 fix0
= _mm_add_ps(fix0
,tx
);
936 fiy0
= _mm_add_ps(fiy0
,ty
);
937 fiz0
= _mm_add_ps(fiz0
,tz
);
939 fjx1
= _mm_add_ps(fjx1
,tx
);
940 fjy1
= _mm_add_ps(fjy1
,ty
);
941 fjz1
= _mm_add_ps(fjz1
,tz
);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r02
= _mm_mul_ps(rsq02
,rinv02
);
948 r02
= _mm_andnot_ps(dummy_mask
,r02
);
950 /* Calculate table index by multiplying r with table scale and truncate to integer */
951 rt
= _mm_mul_ps(r02
,vftabscale
);
952 vfitab
= _mm_cvttps_epi32(rt
);
953 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
954 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
956 /* CUBIC SPLINE TABLE ELECTROSTATICS */
957 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
958 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
959 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
960 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
961 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
962 Heps
= _mm_mul_ps(vfeps
,H
);
963 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
964 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
965 velec
= _mm_mul_ps(qq02
,VV
);
966 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
967 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec
= _mm_andnot_ps(dummy_mask
,velec
);
971 velecsum
= _mm_add_ps(velecsum
,velec
);
975 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
977 /* Calculate temporary vectorial force */
978 tx
= _mm_mul_ps(fscal
,dx02
);
979 ty
= _mm_mul_ps(fscal
,dy02
);
980 tz
= _mm_mul_ps(fscal
,dz02
);
982 /* Update vectorial force */
983 fix0
= _mm_add_ps(fix0
,tx
);
984 fiy0
= _mm_add_ps(fiy0
,ty
);
985 fiz0
= _mm_add_ps(fiz0
,tz
);
987 fjx2
= _mm_add_ps(fjx2
,tx
);
988 fjy2
= _mm_add_ps(fjy2
,ty
);
989 fjz2
= _mm_add_ps(fjz2
,tz
);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 r10
= _mm_mul_ps(rsq10
,rinv10
);
996 r10
= _mm_andnot_ps(dummy_mask
,r10
);
998 /* Calculate table index by multiplying r with table scale and truncate to integer */
999 rt
= _mm_mul_ps(r10
,vftabscale
);
1000 vfitab
= _mm_cvttps_epi32(rt
);
1001 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1002 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1006 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1007 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1008 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1009 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1010 Heps
= _mm_mul_ps(vfeps
,H
);
1011 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1012 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1013 velec
= _mm_mul_ps(qq10
,VV
);
1014 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1015 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1019 velecsum
= _mm_add_ps(velecsum
,velec
);
1023 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1025 /* Calculate temporary vectorial force */
1026 tx
= _mm_mul_ps(fscal
,dx10
);
1027 ty
= _mm_mul_ps(fscal
,dy10
);
1028 tz
= _mm_mul_ps(fscal
,dz10
);
1030 /* Update vectorial force */
1031 fix1
= _mm_add_ps(fix1
,tx
);
1032 fiy1
= _mm_add_ps(fiy1
,ty
);
1033 fiz1
= _mm_add_ps(fiz1
,tz
);
1035 fjx0
= _mm_add_ps(fjx0
,tx
);
1036 fjy0
= _mm_add_ps(fjy0
,ty
);
1037 fjz0
= _mm_add_ps(fjz0
,tz
);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r11
= _mm_mul_ps(rsq11
,rinv11
);
1044 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1046 /* Calculate table index by multiplying r with table scale and truncate to integer */
1047 rt
= _mm_mul_ps(r11
,vftabscale
);
1048 vfitab
= _mm_cvttps_epi32(rt
);
1049 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1050 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1052 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1054 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1055 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1056 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1057 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1058 Heps
= _mm_mul_ps(vfeps
,H
);
1059 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1060 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1061 velec
= _mm_mul_ps(qq11
,VV
);
1062 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1063 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1067 velecsum
= _mm_add_ps(velecsum
,velec
);
1071 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1073 /* Calculate temporary vectorial force */
1074 tx
= _mm_mul_ps(fscal
,dx11
);
1075 ty
= _mm_mul_ps(fscal
,dy11
);
1076 tz
= _mm_mul_ps(fscal
,dz11
);
1078 /* Update vectorial force */
1079 fix1
= _mm_add_ps(fix1
,tx
);
1080 fiy1
= _mm_add_ps(fiy1
,ty
);
1081 fiz1
= _mm_add_ps(fiz1
,tz
);
1083 fjx1
= _mm_add_ps(fjx1
,tx
);
1084 fjy1
= _mm_add_ps(fjy1
,ty
);
1085 fjz1
= _mm_add_ps(fjz1
,tz
);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 r12
= _mm_mul_ps(rsq12
,rinv12
);
1092 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1094 /* Calculate table index by multiplying r with table scale and truncate to integer */
1095 rt
= _mm_mul_ps(r12
,vftabscale
);
1096 vfitab
= _mm_cvttps_epi32(rt
);
1097 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1098 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1102 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1103 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1104 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1105 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1106 Heps
= _mm_mul_ps(vfeps
,H
);
1107 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1108 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1109 velec
= _mm_mul_ps(qq12
,VV
);
1110 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1111 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1115 velecsum
= _mm_add_ps(velecsum
,velec
);
1119 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1121 /* Calculate temporary vectorial force */
1122 tx
= _mm_mul_ps(fscal
,dx12
);
1123 ty
= _mm_mul_ps(fscal
,dy12
);
1124 tz
= _mm_mul_ps(fscal
,dz12
);
1126 /* Update vectorial force */
1127 fix1
= _mm_add_ps(fix1
,tx
);
1128 fiy1
= _mm_add_ps(fiy1
,ty
);
1129 fiz1
= _mm_add_ps(fiz1
,tz
);
1131 fjx2
= _mm_add_ps(fjx2
,tx
);
1132 fjy2
= _mm_add_ps(fjy2
,ty
);
1133 fjz2
= _mm_add_ps(fjz2
,tz
);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 r20
= _mm_mul_ps(rsq20
,rinv20
);
1140 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1142 /* Calculate table index by multiplying r with table scale and truncate to integer */
1143 rt
= _mm_mul_ps(r20
,vftabscale
);
1144 vfitab
= _mm_cvttps_epi32(rt
);
1145 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1146 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1148 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1150 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1151 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1152 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1153 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1154 Heps
= _mm_mul_ps(vfeps
,H
);
1155 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1156 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1157 velec
= _mm_mul_ps(qq20
,VV
);
1158 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1159 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
1161 /* Update potential sum for this i atom from the interaction with this j atom. */
1162 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1163 velecsum
= _mm_add_ps(velecsum
,velec
);
1167 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1169 /* Calculate temporary vectorial force */
1170 tx
= _mm_mul_ps(fscal
,dx20
);
1171 ty
= _mm_mul_ps(fscal
,dy20
);
1172 tz
= _mm_mul_ps(fscal
,dz20
);
1174 /* Update vectorial force */
1175 fix2
= _mm_add_ps(fix2
,tx
);
1176 fiy2
= _mm_add_ps(fiy2
,ty
);
1177 fiz2
= _mm_add_ps(fiz2
,tz
);
1179 fjx0
= _mm_add_ps(fjx0
,tx
);
1180 fjy0
= _mm_add_ps(fjy0
,ty
);
1181 fjz0
= _mm_add_ps(fjz0
,tz
);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 r21
= _mm_mul_ps(rsq21
,rinv21
);
1188 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1190 /* Calculate table index by multiplying r with table scale and truncate to integer */
1191 rt
= _mm_mul_ps(r21
,vftabscale
);
1192 vfitab
= _mm_cvttps_epi32(rt
);
1193 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1194 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1198 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1199 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1200 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1201 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1202 Heps
= _mm_mul_ps(vfeps
,H
);
1203 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1204 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1205 velec
= _mm_mul_ps(qq21
,VV
);
1206 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1207 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1209 /* Update potential sum for this i atom from the interaction with this j atom. */
1210 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1211 velecsum
= _mm_add_ps(velecsum
,velec
);
1215 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1217 /* Calculate temporary vectorial force */
1218 tx
= _mm_mul_ps(fscal
,dx21
);
1219 ty
= _mm_mul_ps(fscal
,dy21
);
1220 tz
= _mm_mul_ps(fscal
,dz21
);
1222 /* Update vectorial force */
1223 fix2
= _mm_add_ps(fix2
,tx
);
1224 fiy2
= _mm_add_ps(fiy2
,ty
);
1225 fiz2
= _mm_add_ps(fiz2
,tz
);
1227 fjx1
= _mm_add_ps(fjx1
,tx
);
1228 fjy1
= _mm_add_ps(fjy1
,ty
);
1229 fjz1
= _mm_add_ps(fjz1
,tz
);
1231 /**************************
1232 * CALCULATE INTERACTIONS *
1233 **************************/
1235 r22
= _mm_mul_ps(rsq22
,rinv22
);
1236 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1238 /* Calculate table index by multiplying r with table scale and truncate to integer */
1239 rt
= _mm_mul_ps(r22
,vftabscale
);
1240 vfitab
= _mm_cvttps_epi32(rt
);
1241 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1242 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1244 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1245 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1246 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1247 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1248 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1249 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1250 Heps
= _mm_mul_ps(vfeps
,H
);
1251 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1252 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1253 velec
= _mm_mul_ps(qq22
,VV
);
1254 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1255 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1257 /* Update potential sum for this i atom from the interaction with this j atom. */
1258 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1259 velecsum
= _mm_add_ps(velecsum
,velec
);
1263 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1265 /* Calculate temporary vectorial force */
1266 tx
= _mm_mul_ps(fscal
,dx22
);
1267 ty
= _mm_mul_ps(fscal
,dy22
);
1268 tz
= _mm_mul_ps(fscal
,dz22
);
1270 /* Update vectorial force */
1271 fix2
= _mm_add_ps(fix2
,tx
);
1272 fiy2
= _mm_add_ps(fiy2
,ty
);
1273 fiz2
= _mm_add_ps(fiz2
,tz
);
1275 fjx2
= _mm_add_ps(fjx2
,tx
);
1276 fjy2
= _mm_add_ps(fjy2
,ty
);
1277 fjz2
= _mm_add_ps(fjz2
,tz
);
1279 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1280 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1281 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1282 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1284 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1285 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1287 /* Inner loop uses 426 flops */
1290 /* End of innermost loop */
1292 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1293 f
+i_coord_offset
,fshift
+i_shift_offset
);
1296 /* Update potential energies */
1297 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1298 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1300 /* Increment number of inner iterations */
1301 inneriter
+= j_index_end
- j_index_start
;
1303 /* Outer loop uses 20 flops */
1306 /* Increment number of outer iterations */
1309 /* Update outer/inner flops */
1311 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*426);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
 * Electrostatics interaction: CubicSplineTable
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        Force
 */
1321 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1322 (t_nblist
* gmx_restrict nlist
,
1323 rvec
* gmx_restrict xx
,
1324 rvec
* gmx_restrict ff
,
1325 t_forcerec
* gmx_restrict fr
,
1326 t_mdatoms
* gmx_restrict mdatoms
,
1327 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1328 t_nrnb
* gmx_restrict nrnb
)
1330 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1331 * just 0 for non-waters.
1332 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1333 * jnr indices corresponding to data put in the four positions in the SIMD register.
1335 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1336 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1337 int jnrA
,jnrB
,jnrC
,jnrD
;
1338 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1339 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1340 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1341 real rcutoff_scalar
;
1342 real
*shiftvec
,*fshift
,*x
,*f
;
1343 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1344 real scratch
[4*DIM
];
1345 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1347 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1349 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1351 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1352 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1353 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1354 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1355 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1356 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1357 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1358 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1359 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1360 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1361 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1362 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1363 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1364 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1365 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1366 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1367 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1370 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1373 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1374 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1376 __m128i ifour
= _mm_set1_epi32(4);
1377 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1379 __m128 dummy_mask
,cutoff_mask
;
1380 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1381 __m128 one
= _mm_set1_ps(1.0);
1382 __m128 two
= _mm_set1_ps(2.0);
1388 jindex
= nlist
->jindex
;
1390 shiftidx
= nlist
->shift
;
1392 shiftvec
= fr
->shift_vec
[0];
1393 fshift
= fr
->fshift
[0];
1394 facel
= _mm_set1_ps(fr
->epsfac
);
1395 charge
= mdatoms
->chargeA
;
1396 nvdwtype
= fr
->ntype
;
1397 vdwparam
= fr
->nbfp
;
1398 vdwtype
= mdatoms
->typeA
;
1400 vftab
= kernel_data
->table_elec_vdw
->data
;
1401 vftabscale
= _mm_set1_ps(kernel_data
->table_elec_vdw
->scale
);
1403 /* Setup water-specific parameters */
1404 inr
= nlist
->iinr
[0];
1405 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1406 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1407 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1408 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1410 jq0
= _mm_set1_ps(charge
[inr
+0]);
1411 jq1
= _mm_set1_ps(charge
[inr
+1]);
1412 jq2
= _mm_set1_ps(charge
[inr
+2]);
1413 vdwjidx0A
= 2*vdwtype
[inr
+0];
1414 qq00
= _mm_mul_ps(iq0
,jq0
);
1415 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1416 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1417 qq01
= _mm_mul_ps(iq0
,jq1
);
1418 qq02
= _mm_mul_ps(iq0
,jq2
);
1419 qq10
= _mm_mul_ps(iq1
,jq0
);
1420 qq11
= _mm_mul_ps(iq1
,jq1
);
1421 qq12
= _mm_mul_ps(iq1
,jq2
);
1422 qq20
= _mm_mul_ps(iq2
,jq0
);
1423 qq21
= _mm_mul_ps(iq2
,jq1
);
1424 qq22
= _mm_mul_ps(iq2
,jq2
);
1426 /* Avoid stupid compiler warnings */
1427 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1428 j_coord_offsetA
= 0;
1429 j_coord_offsetB
= 0;
1430 j_coord_offsetC
= 0;
1431 j_coord_offsetD
= 0;
1436 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1438 scratch
[iidx
] = 0.0;
1441 /* Start outer loop over neighborlists */
1442 for(iidx
=0; iidx
<nri
; iidx
++)
1444 /* Load shift vector for this list */
1445 i_shift_offset
= DIM
*shiftidx
[iidx
];
1447 /* Load limits for loop over neighbors */
1448 j_index_start
= jindex
[iidx
];
1449 j_index_end
= jindex
[iidx
+1];
1451 /* Get outer coordinate index */
1453 i_coord_offset
= DIM
*inr
;
1455 /* Load i particle coords and add shift vector */
1456 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1457 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1459 fix0
= _mm_setzero_ps();
1460 fiy0
= _mm_setzero_ps();
1461 fiz0
= _mm_setzero_ps();
1462 fix1
= _mm_setzero_ps();
1463 fiy1
= _mm_setzero_ps();
1464 fiz1
= _mm_setzero_ps();
1465 fix2
= _mm_setzero_ps();
1466 fiy2
= _mm_setzero_ps();
1467 fiz2
= _mm_setzero_ps();
1469 /* Start inner kernel loop */
1470 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1473 /* Get j neighbor index, and coordinate index */
1475 jnrB
= jjnr
[jidx
+1];
1476 jnrC
= jjnr
[jidx
+2];
1477 jnrD
= jjnr
[jidx
+3];
1478 j_coord_offsetA
= DIM
*jnrA
;
1479 j_coord_offsetB
= DIM
*jnrB
;
1480 j_coord_offsetC
= DIM
*jnrC
;
1481 j_coord_offsetD
= DIM
*jnrD
;
1483 /* load j atom coordinates */
1484 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1485 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1486 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1488 /* Calculate displacement vector */
1489 dx00
= _mm_sub_ps(ix0
,jx0
);
1490 dy00
= _mm_sub_ps(iy0
,jy0
);
1491 dz00
= _mm_sub_ps(iz0
,jz0
);
1492 dx01
= _mm_sub_ps(ix0
,jx1
);
1493 dy01
= _mm_sub_ps(iy0
,jy1
);
1494 dz01
= _mm_sub_ps(iz0
,jz1
);
1495 dx02
= _mm_sub_ps(ix0
,jx2
);
1496 dy02
= _mm_sub_ps(iy0
,jy2
);
1497 dz02
= _mm_sub_ps(iz0
,jz2
);
1498 dx10
= _mm_sub_ps(ix1
,jx0
);
1499 dy10
= _mm_sub_ps(iy1
,jy0
);
1500 dz10
= _mm_sub_ps(iz1
,jz0
);
1501 dx11
= _mm_sub_ps(ix1
,jx1
);
1502 dy11
= _mm_sub_ps(iy1
,jy1
);
1503 dz11
= _mm_sub_ps(iz1
,jz1
);
1504 dx12
= _mm_sub_ps(ix1
,jx2
);
1505 dy12
= _mm_sub_ps(iy1
,jy2
);
1506 dz12
= _mm_sub_ps(iz1
,jz2
);
1507 dx20
= _mm_sub_ps(ix2
,jx0
);
1508 dy20
= _mm_sub_ps(iy2
,jy0
);
1509 dz20
= _mm_sub_ps(iz2
,jz0
);
1510 dx21
= _mm_sub_ps(ix2
,jx1
);
1511 dy21
= _mm_sub_ps(iy2
,jy1
);
1512 dz21
= _mm_sub_ps(iz2
,jz1
);
1513 dx22
= _mm_sub_ps(ix2
,jx2
);
1514 dy22
= _mm_sub_ps(iy2
,jy2
);
1515 dz22
= _mm_sub_ps(iz2
,jz2
);
1517 /* Calculate squared distance and things based on it */
1518 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1519 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1520 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1521 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1522 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1523 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1524 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1525 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1526 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1528 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1529 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1530 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1531 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1532 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1533 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1534 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1535 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1536 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1538 fjx0
= _mm_setzero_ps();
1539 fjy0
= _mm_setzero_ps();
1540 fjz0
= _mm_setzero_ps();
1541 fjx1
= _mm_setzero_ps();
1542 fjy1
= _mm_setzero_ps();
1543 fjz1
= _mm_setzero_ps();
1544 fjx2
= _mm_setzero_ps();
1545 fjy2
= _mm_setzero_ps();
1546 fjz2
= _mm_setzero_ps();
1548 /**************************
1549 * CALCULATE INTERACTIONS *
1550 **************************/
1552 r00
= _mm_mul_ps(rsq00
,rinv00
);
1554 /* Calculate table index by multiplying r with table scale and truncate to integer */
1555 rt
= _mm_mul_ps(r00
,vftabscale
);
1556 vfitab
= _mm_cvttps_epi32(rt
);
1557 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1558 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1560 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1562 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1563 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1564 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1565 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1566 Heps
= _mm_mul_ps(vfeps
,H
);
1567 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1568 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1569 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
1571 /* CUBIC SPLINE TABLE DISPERSION */
1572 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1573 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1574 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1575 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1576 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1577 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1578 Heps
= _mm_mul_ps(vfeps
,H
);
1579 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1580 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1581 fvdw6
= _mm_mul_ps(c6_00
,FF
);
1583 /* CUBIC SPLINE TABLE REPULSION */
1584 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1585 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1586 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1587 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1588 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1589 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1590 Heps
= _mm_mul_ps(vfeps
,H
);
1591 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1592 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1593 fvdw12
= _mm_mul_ps(c12_00
,FF
);
1594 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
1596 fscal
= _mm_add_ps(felec
,fvdw
);
1598 /* Calculate temporary vectorial force */
1599 tx
= _mm_mul_ps(fscal
,dx00
);
1600 ty
= _mm_mul_ps(fscal
,dy00
);
1601 tz
= _mm_mul_ps(fscal
,dz00
);
1603 /* Update vectorial force */
1604 fix0
= _mm_add_ps(fix0
,tx
);
1605 fiy0
= _mm_add_ps(fiy0
,ty
);
1606 fiz0
= _mm_add_ps(fiz0
,tz
);
1608 fjx0
= _mm_add_ps(fjx0
,tx
);
1609 fjy0
= _mm_add_ps(fjy0
,ty
);
1610 fjz0
= _mm_add_ps(fjz0
,tz
);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 r01
= _mm_mul_ps(rsq01
,rinv01
);
1618 /* Calculate table index by multiplying r with table scale and truncate to integer */
1619 rt
= _mm_mul_ps(r01
,vftabscale
);
1620 vfitab
= _mm_cvttps_epi32(rt
);
1621 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1622 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1624 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1625 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1626 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1627 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1628 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1629 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1630 Heps
= _mm_mul_ps(vfeps
,H
);
1631 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1632 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1633 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
1637 /* Calculate temporary vectorial force */
1638 tx
= _mm_mul_ps(fscal
,dx01
);
1639 ty
= _mm_mul_ps(fscal
,dy01
);
1640 tz
= _mm_mul_ps(fscal
,dz01
);
1642 /* Update vectorial force */
1643 fix0
= _mm_add_ps(fix0
,tx
);
1644 fiy0
= _mm_add_ps(fiy0
,ty
);
1645 fiz0
= _mm_add_ps(fiz0
,tz
);
1647 fjx1
= _mm_add_ps(fjx1
,tx
);
1648 fjy1
= _mm_add_ps(fjy1
,ty
);
1649 fjz1
= _mm_add_ps(fjz1
,tz
);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 r02
= _mm_mul_ps(rsq02
,rinv02
);
1657 /* Calculate table index by multiplying r with table scale and truncate to integer */
1658 rt
= _mm_mul_ps(r02
,vftabscale
);
1659 vfitab
= _mm_cvttps_epi32(rt
);
1660 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1661 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1663 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1664 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1665 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1666 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1667 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1668 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1669 Heps
= _mm_mul_ps(vfeps
,H
);
1670 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1671 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1672 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
1676 /* Calculate temporary vectorial force */
1677 tx
= _mm_mul_ps(fscal
,dx02
);
1678 ty
= _mm_mul_ps(fscal
,dy02
);
1679 tz
= _mm_mul_ps(fscal
,dz02
);
1681 /* Update vectorial force */
1682 fix0
= _mm_add_ps(fix0
,tx
);
1683 fiy0
= _mm_add_ps(fiy0
,ty
);
1684 fiz0
= _mm_add_ps(fiz0
,tz
);
1686 fjx2
= _mm_add_ps(fjx2
,tx
);
1687 fjy2
= _mm_add_ps(fjy2
,ty
);
1688 fjz2
= _mm_add_ps(fjz2
,tz
);
1690 /**************************
1691 * CALCULATE INTERACTIONS *
1692 **************************/
1694 r10
= _mm_mul_ps(rsq10
,rinv10
);
1696 /* Calculate table index by multiplying r with table scale and truncate to integer */
1697 rt
= _mm_mul_ps(r10
,vftabscale
);
1698 vfitab
= _mm_cvttps_epi32(rt
);
1699 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1700 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1702 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1703 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1704 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1705 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1706 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1707 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1708 Heps
= _mm_mul_ps(vfeps
,H
);
1709 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1710 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1711 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
1715 /* Calculate temporary vectorial force */
1716 tx
= _mm_mul_ps(fscal
,dx10
);
1717 ty
= _mm_mul_ps(fscal
,dy10
);
1718 tz
= _mm_mul_ps(fscal
,dz10
);
1720 /* Update vectorial force */
1721 fix1
= _mm_add_ps(fix1
,tx
);
1722 fiy1
= _mm_add_ps(fiy1
,ty
);
1723 fiz1
= _mm_add_ps(fiz1
,tz
);
1725 fjx0
= _mm_add_ps(fjx0
,tx
);
1726 fjy0
= _mm_add_ps(fjy0
,ty
);
1727 fjz0
= _mm_add_ps(fjz0
,tz
);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r11
= _mm_mul_ps(rsq11
,rinv11
);
1735 /* Calculate table index by multiplying r with table scale and truncate to integer */
1736 rt
= _mm_mul_ps(r11
,vftabscale
);
1737 vfitab
= _mm_cvttps_epi32(rt
);
1738 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1739 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1743 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1744 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1745 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1746 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1747 Heps
= _mm_mul_ps(vfeps
,H
);
1748 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1749 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1750 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
1754 /* Calculate temporary vectorial force */
1755 tx
= _mm_mul_ps(fscal
,dx11
);
1756 ty
= _mm_mul_ps(fscal
,dy11
);
1757 tz
= _mm_mul_ps(fscal
,dz11
);
1759 /* Update vectorial force */
1760 fix1
= _mm_add_ps(fix1
,tx
);
1761 fiy1
= _mm_add_ps(fiy1
,ty
);
1762 fiz1
= _mm_add_ps(fiz1
,tz
);
1764 fjx1
= _mm_add_ps(fjx1
,tx
);
1765 fjy1
= _mm_add_ps(fjy1
,ty
);
1766 fjz1
= _mm_add_ps(fjz1
,tz
);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 r12
= _mm_mul_ps(rsq12
,rinv12
);
1774 /* Calculate table index by multiplying r with table scale and truncate to integer */
1775 rt
= _mm_mul_ps(r12
,vftabscale
);
1776 vfitab
= _mm_cvttps_epi32(rt
);
1777 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1778 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1781 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1782 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1783 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1784 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1785 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1786 Heps
= _mm_mul_ps(vfeps
,H
);
1787 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1788 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1789 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
1793 /* Calculate temporary vectorial force */
1794 tx
= _mm_mul_ps(fscal
,dx12
);
1795 ty
= _mm_mul_ps(fscal
,dy12
);
1796 tz
= _mm_mul_ps(fscal
,dz12
);
1798 /* Update vectorial force */
1799 fix1
= _mm_add_ps(fix1
,tx
);
1800 fiy1
= _mm_add_ps(fiy1
,ty
);
1801 fiz1
= _mm_add_ps(fiz1
,tz
);
1803 fjx2
= _mm_add_ps(fjx2
,tx
);
1804 fjy2
= _mm_add_ps(fjy2
,ty
);
1805 fjz2
= _mm_add_ps(fjz2
,tz
);
1807 /**************************
1808 * CALCULATE INTERACTIONS *
1809 **************************/
1811 r20
= _mm_mul_ps(rsq20
,rinv20
);
1813 /* Calculate table index by multiplying r with table scale and truncate to integer */
1814 rt
= _mm_mul_ps(r20
,vftabscale
);
1815 vfitab
= _mm_cvttps_epi32(rt
);
1816 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1817 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1821 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1822 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1823 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1824 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1825 Heps
= _mm_mul_ps(vfeps
,H
);
1826 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1827 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1828 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
1832 /* Calculate temporary vectorial force */
1833 tx
= _mm_mul_ps(fscal
,dx20
);
1834 ty
= _mm_mul_ps(fscal
,dy20
);
1835 tz
= _mm_mul_ps(fscal
,dz20
);
1837 /* Update vectorial force */
1838 fix2
= _mm_add_ps(fix2
,tx
);
1839 fiy2
= _mm_add_ps(fiy2
,ty
);
1840 fiz2
= _mm_add_ps(fiz2
,tz
);
1842 fjx0
= _mm_add_ps(fjx0
,tx
);
1843 fjy0
= _mm_add_ps(fjy0
,ty
);
1844 fjz0
= _mm_add_ps(fjz0
,tz
);
1846 /**************************
1847 * CALCULATE INTERACTIONS *
1848 **************************/
1850 r21
= _mm_mul_ps(rsq21
,rinv21
);
1852 /* Calculate table index by multiplying r with table scale and truncate to integer */
1853 rt
= _mm_mul_ps(r21
,vftabscale
);
1854 vfitab
= _mm_cvttps_epi32(rt
);
1855 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1856 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1858 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1859 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1860 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1861 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1862 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1863 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1864 Heps
= _mm_mul_ps(vfeps
,H
);
1865 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1866 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1867 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1871 /* Calculate temporary vectorial force */
1872 tx
= _mm_mul_ps(fscal
,dx21
);
1873 ty
= _mm_mul_ps(fscal
,dy21
);
1874 tz
= _mm_mul_ps(fscal
,dz21
);
1876 /* Update vectorial force */
1877 fix2
= _mm_add_ps(fix2
,tx
);
1878 fiy2
= _mm_add_ps(fiy2
,ty
);
1879 fiz2
= _mm_add_ps(fiz2
,tz
);
1881 fjx1
= _mm_add_ps(fjx1
,tx
);
1882 fjy1
= _mm_add_ps(fjy1
,ty
);
1883 fjz1
= _mm_add_ps(fjz1
,tz
);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 r22
= _mm_mul_ps(rsq22
,rinv22
);
1891 /* Calculate table index by multiplying r with table scale and truncate to integer */
1892 rt
= _mm_mul_ps(r22
,vftabscale
);
1893 vfitab
= _mm_cvttps_epi32(rt
);
1894 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1895 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1897 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1898 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1899 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1900 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1901 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1902 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1903 Heps
= _mm_mul_ps(vfeps
,H
);
1904 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1905 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1906 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1910 /* Calculate temporary vectorial force */
1911 tx
= _mm_mul_ps(fscal
,dx22
);
1912 ty
= _mm_mul_ps(fscal
,dy22
);
1913 tz
= _mm_mul_ps(fscal
,dz22
);
1915 /* Update vectorial force */
1916 fix2
= _mm_add_ps(fix2
,tx
);
1917 fiy2
= _mm_add_ps(fiy2
,ty
);
1918 fiz2
= _mm_add_ps(fiz2
,tz
);
1920 fjx2
= _mm_add_ps(fjx2
,tx
);
1921 fjy2
= _mm_add_ps(fjy2
,ty
);
1922 fjz2
= _mm_add_ps(fjz2
,tz
);
1924 fjptrA
= f
+j_coord_offsetA
;
1925 fjptrB
= f
+j_coord_offsetB
;
1926 fjptrC
= f
+j_coord_offsetC
;
1927 fjptrD
= f
+j_coord_offsetD
;
1929 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1930 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1932 /* Inner loop uses 373 flops */
1935 if(jidx
<j_index_end
)
1938 /* Get j neighbor index, and coordinate index */
1939 jnrlistA
= jjnr
[jidx
];
1940 jnrlistB
= jjnr
[jidx
+1];
1941 jnrlistC
= jjnr
[jidx
+2];
1942 jnrlistD
= jjnr
[jidx
+3];
1943 /* Sign of each element will be negative for non-real atoms.
1944 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1945 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1947 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1948 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1949 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1950 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1951 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1952 j_coord_offsetA
= DIM
*jnrA
;
1953 j_coord_offsetB
= DIM
*jnrB
;
1954 j_coord_offsetC
= DIM
*jnrC
;
1955 j_coord_offsetD
= DIM
*jnrD
;
1957 /* load j atom coordinates */
1958 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1959 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1960 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1962 /* Calculate displacement vector */
1963 dx00
= _mm_sub_ps(ix0
,jx0
);
1964 dy00
= _mm_sub_ps(iy0
,jy0
);
1965 dz00
= _mm_sub_ps(iz0
,jz0
);
1966 dx01
= _mm_sub_ps(ix0
,jx1
);
1967 dy01
= _mm_sub_ps(iy0
,jy1
);
1968 dz01
= _mm_sub_ps(iz0
,jz1
);
1969 dx02
= _mm_sub_ps(ix0
,jx2
);
1970 dy02
= _mm_sub_ps(iy0
,jy2
);
1971 dz02
= _mm_sub_ps(iz0
,jz2
);
1972 dx10
= _mm_sub_ps(ix1
,jx0
);
1973 dy10
= _mm_sub_ps(iy1
,jy0
);
1974 dz10
= _mm_sub_ps(iz1
,jz0
);
1975 dx11
= _mm_sub_ps(ix1
,jx1
);
1976 dy11
= _mm_sub_ps(iy1
,jy1
);
1977 dz11
= _mm_sub_ps(iz1
,jz1
);
1978 dx12
= _mm_sub_ps(ix1
,jx2
);
1979 dy12
= _mm_sub_ps(iy1
,jy2
);
1980 dz12
= _mm_sub_ps(iz1
,jz2
);
1981 dx20
= _mm_sub_ps(ix2
,jx0
);
1982 dy20
= _mm_sub_ps(iy2
,jy0
);
1983 dz20
= _mm_sub_ps(iz2
,jz0
);
1984 dx21
= _mm_sub_ps(ix2
,jx1
);
1985 dy21
= _mm_sub_ps(iy2
,jy1
);
1986 dz21
= _mm_sub_ps(iz2
,jz1
);
1987 dx22
= _mm_sub_ps(ix2
,jx2
);
1988 dy22
= _mm_sub_ps(iy2
,jy2
);
1989 dz22
= _mm_sub_ps(iz2
,jz2
);
1991 /* Calculate squared distance and things based on it */
1992 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1993 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1994 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1995 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1996 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1997 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1998 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1999 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
2000 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2002 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
2003 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
2004 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
2005 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
2006 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
2007 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
2008 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
2009 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
2010 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
2012 fjx0
= _mm_setzero_ps();
2013 fjy0
= _mm_setzero_ps();
2014 fjz0
= _mm_setzero_ps();
2015 fjx1
= _mm_setzero_ps();
2016 fjy1
= _mm_setzero_ps();
2017 fjz1
= _mm_setzero_ps();
2018 fjx2
= _mm_setzero_ps();
2019 fjy2
= _mm_setzero_ps();
2020 fjz2
= _mm_setzero_ps();
2022 /**************************
2023 * CALCULATE INTERACTIONS *
2024 **************************/
2026 r00
= _mm_mul_ps(rsq00
,rinv00
);
2027 r00
= _mm_andnot_ps(dummy_mask
,r00
);
2029 /* Calculate table index by multiplying r with table scale and truncate to integer */
2030 rt
= _mm_mul_ps(r00
,vftabscale
);
2031 vfitab
= _mm_cvttps_epi32(rt
);
2032 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2033 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2035 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2036 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2037 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2038 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2039 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2040 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2041 Heps
= _mm_mul_ps(vfeps
,H
);
2042 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2043 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2044 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
2046 /* CUBIC SPLINE TABLE DISPERSION */
2047 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2048 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2049 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2050 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2051 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2052 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2053 Heps
= _mm_mul_ps(vfeps
,H
);
2054 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2055 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2056 fvdw6
= _mm_mul_ps(c6_00
,FF
);
2058 /* CUBIC SPLINE TABLE REPULSION */
2059 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2060 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2061 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2062 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2063 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2064 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2065 Heps
= _mm_mul_ps(vfeps
,H
);
2066 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2067 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2068 fvdw12
= _mm_mul_ps(c12_00
,FF
);
2069 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
2071 fscal
= _mm_add_ps(felec
,fvdw
);
2073 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2075 /* Calculate temporary vectorial force */
2076 tx
= _mm_mul_ps(fscal
,dx00
);
2077 ty
= _mm_mul_ps(fscal
,dy00
);
2078 tz
= _mm_mul_ps(fscal
,dz00
);
2080 /* Update vectorial force */
2081 fix0
= _mm_add_ps(fix0
,tx
);
2082 fiy0
= _mm_add_ps(fiy0
,ty
);
2083 fiz0
= _mm_add_ps(fiz0
,tz
);
2085 fjx0
= _mm_add_ps(fjx0
,tx
);
2086 fjy0
= _mm_add_ps(fjy0
,ty
);
2087 fjz0
= _mm_add_ps(fjz0
,tz
);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 r01
= _mm_mul_ps(rsq01
,rinv01
);
2094 r01
= _mm_andnot_ps(dummy_mask
,r01
);
2096 /* Calculate table index by multiplying r with table scale and truncate to integer */
2097 rt
= _mm_mul_ps(r01
,vftabscale
);
2098 vfitab
= _mm_cvttps_epi32(rt
);
2099 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2100 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2102 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2103 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2104 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2105 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2106 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2107 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2108 Heps
= _mm_mul_ps(vfeps
,H
);
2109 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2110 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2111 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
2115 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2117 /* Calculate temporary vectorial force */
2118 tx
= _mm_mul_ps(fscal
,dx01
);
2119 ty
= _mm_mul_ps(fscal
,dy01
);
2120 tz
= _mm_mul_ps(fscal
,dz01
);
2122 /* Update vectorial force */
2123 fix0
= _mm_add_ps(fix0
,tx
);
2124 fiy0
= _mm_add_ps(fiy0
,ty
);
2125 fiz0
= _mm_add_ps(fiz0
,tz
);
2127 fjx1
= _mm_add_ps(fjx1
,tx
);
2128 fjy1
= _mm_add_ps(fjy1
,ty
);
2129 fjz1
= _mm_add_ps(fjz1
,tz
);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 r02
= _mm_mul_ps(rsq02
,rinv02
);
2136 r02
= _mm_andnot_ps(dummy_mask
,r02
);
2138 /* Calculate table index by multiplying r with table scale and truncate to integer */
2139 rt
= _mm_mul_ps(r02
,vftabscale
);
2140 vfitab
= _mm_cvttps_epi32(rt
);
2141 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2142 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2144 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2145 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2146 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2147 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2148 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2149 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2150 Heps
= _mm_mul_ps(vfeps
,H
);
2151 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2152 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2153 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
2157 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2159 /* Calculate temporary vectorial force */
2160 tx
= _mm_mul_ps(fscal
,dx02
);
2161 ty
= _mm_mul_ps(fscal
,dy02
);
2162 tz
= _mm_mul_ps(fscal
,dz02
);
2164 /* Update vectorial force */
2165 fix0
= _mm_add_ps(fix0
,tx
);
2166 fiy0
= _mm_add_ps(fiy0
,ty
);
2167 fiz0
= _mm_add_ps(fiz0
,tz
);
2169 fjx2
= _mm_add_ps(fjx2
,tx
);
2170 fjy2
= _mm_add_ps(fjy2
,ty
);
2171 fjz2
= _mm_add_ps(fjz2
,tz
);
2173 /**************************
2174 * CALCULATE INTERACTIONS *
2175 **************************/
2177 r10
= _mm_mul_ps(rsq10
,rinv10
);
2178 r10
= _mm_andnot_ps(dummy_mask
,r10
);
2180 /* Calculate table index by multiplying r with table scale and truncate to integer */
2181 rt
= _mm_mul_ps(r10
,vftabscale
);
2182 vfitab
= _mm_cvttps_epi32(rt
);
2183 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2184 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2186 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2187 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2188 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2189 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2190 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2191 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2192 Heps
= _mm_mul_ps(vfeps
,H
);
2193 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2194 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2195 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
2199 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2201 /* Calculate temporary vectorial force */
2202 tx
= _mm_mul_ps(fscal
,dx10
);
2203 ty
= _mm_mul_ps(fscal
,dy10
);
2204 tz
= _mm_mul_ps(fscal
,dz10
);
2206 /* Update vectorial force */
2207 fix1
= _mm_add_ps(fix1
,tx
);
2208 fiy1
= _mm_add_ps(fiy1
,ty
);
2209 fiz1
= _mm_add_ps(fiz1
,tz
);
2211 fjx0
= _mm_add_ps(fjx0
,tx
);
2212 fjy0
= _mm_add_ps(fjy0
,ty
);
2213 fjz0
= _mm_add_ps(fjz0
,tz
);
2215 /**************************
2216 * CALCULATE INTERACTIONS *
2217 **************************/
2219 r11
= _mm_mul_ps(rsq11
,rinv11
);
2220 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2222 /* Calculate table index by multiplying r with table scale and truncate to integer */
2223 rt
= _mm_mul_ps(r11
,vftabscale
);
2224 vfitab
= _mm_cvttps_epi32(rt
);
2225 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2226 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2228 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2229 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2230 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2231 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2232 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2233 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2234 Heps
= _mm_mul_ps(vfeps
,H
);
2235 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2236 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2237 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
2241 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2243 /* Calculate temporary vectorial force */
2244 tx
= _mm_mul_ps(fscal
,dx11
);
2245 ty
= _mm_mul_ps(fscal
,dy11
);
2246 tz
= _mm_mul_ps(fscal
,dz11
);
2248 /* Update vectorial force */
2249 fix1
= _mm_add_ps(fix1
,tx
);
2250 fiy1
= _mm_add_ps(fiy1
,ty
);
2251 fiz1
= _mm_add_ps(fiz1
,tz
);
2253 fjx1
= _mm_add_ps(fjx1
,tx
);
2254 fjy1
= _mm_add_ps(fjy1
,ty
);
2255 fjz1
= _mm_add_ps(fjz1
,tz
);
2257 /**************************
2258 * CALCULATE INTERACTIONS *
2259 **************************/
2261 r12
= _mm_mul_ps(rsq12
,rinv12
);
2262 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2264 /* Calculate table index by multiplying r with table scale and truncate to integer */
2265 rt
= _mm_mul_ps(r12
,vftabscale
);
2266 vfitab
= _mm_cvttps_epi32(rt
);
2267 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2268 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2270 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2271 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2272 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2273 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2274 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2275 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2276 Heps
= _mm_mul_ps(vfeps
,H
);
2277 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2278 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2279 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
2283 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2285 /* Calculate temporary vectorial force */
2286 tx
= _mm_mul_ps(fscal
,dx12
);
2287 ty
= _mm_mul_ps(fscal
,dy12
);
2288 tz
= _mm_mul_ps(fscal
,dz12
);
2290 /* Update vectorial force */
2291 fix1
= _mm_add_ps(fix1
,tx
);
2292 fiy1
= _mm_add_ps(fiy1
,ty
);
2293 fiz1
= _mm_add_ps(fiz1
,tz
);
2295 fjx2
= _mm_add_ps(fjx2
,tx
);
2296 fjy2
= _mm_add_ps(fjy2
,ty
);
2297 fjz2
= _mm_add_ps(fjz2
,tz
);
2299 /**************************
2300 * CALCULATE INTERACTIONS *
2301 **************************/
2303 r20
= _mm_mul_ps(rsq20
,rinv20
);
2304 r20
= _mm_andnot_ps(dummy_mask
,r20
);
2306 /* Calculate table index by multiplying r with table scale and truncate to integer */
2307 rt
= _mm_mul_ps(r20
,vftabscale
);
2308 vfitab
= _mm_cvttps_epi32(rt
);
2309 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2310 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2312 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2313 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2314 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2315 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2316 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2317 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2318 Heps
= _mm_mul_ps(vfeps
,H
);
2319 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2320 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2321 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
2325 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2327 /* Calculate temporary vectorial force */
2328 tx
= _mm_mul_ps(fscal
,dx20
);
2329 ty
= _mm_mul_ps(fscal
,dy20
);
2330 tz
= _mm_mul_ps(fscal
,dz20
);
2332 /* Update vectorial force */
2333 fix2
= _mm_add_ps(fix2
,tx
);
2334 fiy2
= _mm_add_ps(fiy2
,ty
);
2335 fiz2
= _mm_add_ps(fiz2
,tz
);
2337 fjx0
= _mm_add_ps(fjx0
,tx
);
2338 fjy0
= _mm_add_ps(fjy0
,ty
);
2339 fjz0
= _mm_add_ps(fjz0
,tz
);
2341 /**************************
2342 * CALCULATE INTERACTIONS *
2343 **************************/
2345 r21
= _mm_mul_ps(rsq21
,rinv21
);
2346 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2348 /* Calculate table index by multiplying r with table scale and truncate to integer */
2349 rt
= _mm_mul_ps(r21
,vftabscale
);
2350 vfitab
= _mm_cvttps_epi32(rt
);
2351 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2352 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2354 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2355 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2356 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2357 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2358 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2359 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2360 Heps
= _mm_mul_ps(vfeps
,H
);
2361 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2362 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2363 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
2367 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2369 /* Calculate temporary vectorial force */
2370 tx
= _mm_mul_ps(fscal
,dx21
);
2371 ty
= _mm_mul_ps(fscal
,dy21
);
2372 tz
= _mm_mul_ps(fscal
,dz21
);
2374 /* Update vectorial force */
2375 fix2
= _mm_add_ps(fix2
,tx
);
2376 fiy2
= _mm_add_ps(fiy2
,ty
);
2377 fiz2
= _mm_add_ps(fiz2
,tz
);
2379 fjx1
= _mm_add_ps(fjx1
,tx
);
2380 fjy1
= _mm_add_ps(fjy1
,ty
);
2381 fjz1
= _mm_add_ps(fjz1
,tz
);
2383 /**************************
2384 * CALCULATE INTERACTIONS *
2385 **************************/
2387 r22
= _mm_mul_ps(rsq22
,rinv22
);
2388 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2390 /* Calculate table index by multiplying r with table scale and truncate to integer */
2391 rt
= _mm_mul_ps(r22
,vftabscale
);
2392 vfitab
= _mm_cvttps_epi32(rt
);
2393 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2394 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2396 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2397 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2398 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2399 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2400 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2401 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2402 Heps
= _mm_mul_ps(vfeps
,H
);
2403 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2404 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2405 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
2409 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2411 /* Calculate temporary vectorial force */
2412 tx
= _mm_mul_ps(fscal
,dx22
);
2413 ty
= _mm_mul_ps(fscal
,dy22
);
2414 tz
= _mm_mul_ps(fscal
,dz22
);
2416 /* Update vectorial force */
2417 fix2
= _mm_add_ps(fix2
,tx
);
2418 fiy2
= _mm_add_ps(fiy2
,ty
);
2419 fiz2
= _mm_add_ps(fiz2
,tz
);
2421 fjx2
= _mm_add_ps(fjx2
,tx
);
2422 fjy2
= _mm_add_ps(fjy2
,ty
);
2423 fjz2
= _mm_add_ps(fjz2
,tz
);
2425 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2426 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2427 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2428 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2430 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2431 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2433 /* Inner loop uses 382 flops */
2436 /* End of innermost loop */
2438 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2439 f
+i_coord_offset
,fshift
+i_shift_offset
);
2441 /* Increment number of inner iterations */
2442 inneriter
+= j_index_end
- j_index_start
;
2444 /* Outer loop uses 18 flops */
2447 /* Increment number of outer iterations */
2450 /* Update outer/inner flops */
2452 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*382);