2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
89 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
91 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
92 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
93 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
94 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
95 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
96 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
97 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
98 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
101 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
106 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
109 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
112 __m128i ifour
= _mm_set1_epi32(4);
113 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
115 __m128 dummy_mask
,cutoff_mask
;
116 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one
= _mm_set1_ps(1.0);
118 __m128 two
= _mm_set1_ps(2.0);
124 jindex
= nlist
->jindex
;
126 shiftidx
= nlist
->shift
;
128 shiftvec
= fr
->shift_vec
[0];
129 fshift
= fr
->fshift
[0];
130 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
131 charge
= mdatoms
->chargeA
;
132 nvdwtype
= fr
->ntype
;
134 vdwtype
= mdatoms
->typeA
;
136 vftab
= kernel_data
->table_elec_vdw
->data
;
137 vftabscale
= _mm_set1_ps(kernel_data
->table_elec_vdw
->scale
);
139 /* Setup water-specific parameters */
140 inr
= nlist
->iinr
[0];
141 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
142 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
143 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
144 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
146 jq0
= _mm_set1_ps(charge
[inr
+0]);
147 jq1
= _mm_set1_ps(charge
[inr
+1]);
148 jq2
= _mm_set1_ps(charge
[inr
+2]);
149 vdwjidx0A
= 2*vdwtype
[inr
+0];
150 qq00
= _mm_mul_ps(iq0
,jq0
);
151 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
152 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
153 qq01
= _mm_mul_ps(iq0
,jq1
);
154 qq02
= _mm_mul_ps(iq0
,jq2
);
155 qq10
= _mm_mul_ps(iq1
,jq0
);
156 qq11
= _mm_mul_ps(iq1
,jq1
);
157 qq12
= _mm_mul_ps(iq1
,jq2
);
158 qq20
= _mm_mul_ps(iq2
,jq0
);
159 qq21
= _mm_mul_ps(iq2
,jq1
);
160 qq22
= _mm_mul_ps(iq2
,jq2
);
162 /* Avoid stupid compiler warnings */
163 jnrA
= jnrB
= jnrC
= jnrD
= 0;
172 for(iidx
=0;iidx
<4*DIM
;iidx
++)
177 /* Start outer loop over neighborlists */
178 for(iidx
=0; iidx
<nri
; iidx
++)
180 /* Load shift vector for this list */
181 i_shift_offset
= DIM
*shiftidx
[iidx
];
183 /* Load limits for loop over neighbors */
184 j_index_start
= jindex
[iidx
];
185 j_index_end
= jindex
[iidx
+1];
187 /* Get outer coordinate index */
189 i_coord_offset
= DIM
*inr
;
191 /* Load i particle coords and add shift vector */
192 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
193 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
195 fix0
= _mm_setzero_ps();
196 fiy0
= _mm_setzero_ps();
197 fiz0
= _mm_setzero_ps();
198 fix1
= _mm_setzero_ps();
199 fiy1
= _mm_setzero_ps();
200 fiz1
= _mm_setzero_ps();
201 fix2
= _mm_setzero_ps();
202 fiy2
= _mm_setzero_ps();
203 fiz2
= _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum
= _mm_setzero_ps();
207 vvdwsum
= _mm_setzero_ps();
209 /* Start inner kernel loop */
210 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
213 /* Get j neighbor index, and coordinate index */
218 j_coord_offsetA
= DIM
*jnrA
;
219 j_coord_offsetB
= DIM
*jnrB
;
220 j_coord_offsetC
= DIM
*jnrC
;
221 j_coord_offsetD
= DIM
*jnrD
;
223 /* load j atom coordinates */
224 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
225 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
226 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
228 /* Calculate displacement vector */
229 dx00
= _mm_sub_ps(ix0
,jx0
);
230 dy00
= _mm_sub_ps(iy0
,jy0
);
231 dz00
= _mm_sub_ps(iz0
,jz0
);
232 dx01
= _mm_sub_ps(ix0
,jx1
);
233 dy01
= _mm_sub_ps(iy0
,jy1
);
234 dz01
= _mm_sub_ps(iz0
,jz1
);
235 dx02
= _mm_sub_ps(ix0
,jx2
);
236 dy02
= _mm_sub_ps(iy0
,jy2
);
237 dz02
= _mm_sub_ps(iz0
,jz2
);
238 dx10
= _mm_sub_ps(ix1
,jx0
);
239 dy10
= _mm_sub_ps(iy1
,jy0
);
240 dz10
= _mm_sub_ps(iz1
,jz0
);
241 dx11
= _mm_sub_ps(ix1
,jx1
);
242 dy11
= _mm_sub_ps(iy1
,jy1
);
243 dz11
= _mm_sub_ps(iz1
,jz1
);
244 dx12
= _mm_sub_ps(ix1
,jx2
);
245 dy12
= _mm_sub_ps(iy1
,jy2
);
246 dz12
= _mm_sub_ps(iz1
,jz2
);
247 dx20
= _mm_sub_ps(ix2
,jx0
);
248 dy20
= _mm_sub_ps(iy2
,jy0
);
249 dz20
= _mm_sub_ps(iz2
,jz0
);
250 dx21
= _mm_sub_ps(ix2
,jx1
);
251 dy21
= _mm_sub_ps(iy2
,jy1
);
252 dz21
= _mm_sub_ps(iz2
,jz1
);
253 dx22
= _mm_sub_ps(ix2
,jx2
);
254 dy22
= _mm_sub_ps(iy2
,jy2
);
255 dz22
= _mm_sub_ps(iz2
,jz2
);
257 /* Calculate squared distance and things based on it */
258 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
259 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
260 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
261 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
262 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
263 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
264 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
265 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
266 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
268 rinv00
= sse2_invsqrt_f(rsq00
);
269 rinv01
= sse2_invsqrt_f(rsq01
);
270 rinv02
= sse2_invsqrt_f(rsq02
);
271 rinv10
= sse2_invsqrt_f(rsq10
);
272 rinv11
= sse2_invsqrt_f(rsq11
);
273 rinv12
= sse2_invsqrt_f(rsq12
);
274 rinv20
= sse2_invsqrt_f(rsq20
);
275 rinv21
= sse2_invsqrt_f(rsq21
);
276 rinv22
= sse2_invsqrt_f(rsq22
);
278 fjx0
= _mm_setzero_ps();
279 fjy0
= _mm_setzero_ps();
280 fjz0
= _mm_setzero_ps();
281 fjx1
= _mm_setzero_ps();
282 fjy1
= _mm_setzero_ps();
283 fjz1
= _mm_setzero_ps();
284 fjx2
= _mm_setzero_ps();
285 fjy2
= _mm_setzero_ps();
286 fjz2
= _mm_setzero_ps();
288 /**************************
289 * CALCULATE INTERACTIONS *
290 **************************/
292 r00
= _mm_mul_ps(rsq00
,rinv00
);
294 /* Calculate table index by multiplying r with table scale and truncate to integer */
295 rt
= _mm_mul_ps(r00
,vftabscale
);
296 vfitab
= _mm_cvttps_epi32(rt
);
297 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
298 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
300 /* CUBIC SPLINE TABLE ELECTROSTATICS */
301 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
302 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
303 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
304 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
305 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
306 Heps
= _mm_mul_ps(vfeps
,H
);
307 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
308 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
309 velec
= _mm_mul_ps(qq00
,VV
);
310 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
311 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
313 /* CUBIC SPLINE TABLE DISPERSION */
314 vfitab
= _mm_add_epi32(vfitab
,ifour
);
315 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
316 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
317 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
318 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
319 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
320 Heps
= _mm_mul_ps(vfeps
,H
);
321 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
322 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
323 vvdw6
= _mm_mul_ps(c6_00
,VV
);
324 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
325 fvdw6
= _mm_mul_ps(c6_00
,FF
);
327 /* CUBIC SPLINE TABLE REPULSION */
328 vfitab
= _mm_add_epi32(vfitab
,ifour
);
329 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
330 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
331 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
332 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
333 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
334 Heps
= _mm_mul_ps(vfeps
,H
);
335 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
336 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
337 vvdw12
= _mm_mul_ps(c12_00
,VV
);
338 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
339 fvdw12
= _mm_mul_ps(c12_00
,FF
);
340 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
341 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum
= _mm_add_ps(velecsum
,velec
);
345 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
347 fscal
= _mm_add_ps(felec
,fvdw
);
349 /* Calculate temporary vectorial force */
350 tx
= _mm_mul_ps(fscal
,dx00
);
351 ty
= _mm_mul_ps(fscal
,dy00
);
352 tz
= _mm_mul_ps(fscal
,dz00
);
354 /* Update vectorial force */
355 fix0
= _mm_add_ps(fix0
,tx
);
356 fiy0
= _mm_add_ps(fiy0
,ty
);
357 fiz0
= _mm_add_ps(fiz0
,tz
);
359 fjx0
= _mm_add_ps(fjx0
,tx
);
360 fjy0
= _mm_add_ps(fjy0
,ty
);
361 fjz0
= _mm_add_ps(fjz0
,tz
);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 r01
= _mm_mul_ps(rsq01
,rinv01
);
369 /* Calculate table index by multiplying r with table scale and truncate to integer */
370 rt
= _mm_mul_ps(r01
,vftabscale
);
371 vfitab
= _mm_cvttps_epi32(rt
);
372 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
373 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
375 /* CUBIC SPLINE TABLE ELECTROSTATICS */
376 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
377 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
378 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
379 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
380 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
381 Heps
= _mm_mul_ps(vfeps
,H
);
382 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
383 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
384 velec
= _mm_mul_ps(qq01
,VV
);
385 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
386 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velecsum
= _mm_add_ps(velecsum
,velec
);
393 /* Calculate temporary vectorial force */
394 tx
= _mm_mul_ps(fscal
,dx01
);
395 ty
= _mm_mul_ps(fscal
,dy01
);
396 tz
= _mm_mul_ps(fscal
,dz01
);
398 /* Update vectorial force */
399 fix0
= _mm_add_ps(fix0
,tx
);
400 fiy0
= _mm_add_ps(fiy0
,ty
);
401 fiz0
= _mm_add_ps(fiz0
,tz
);
403 fjx1
= _mm_add_ps(fjx1
,tx
);
404 fjy1
= _mm_add_ps(fjy1
,ty
);
405 fjz1
= _mm_add_ps(fjz1
,tz
);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 r02
= _mm_mul_ps(rsq02
,rinv02
);
413 /* Calculate table index by multiplying r with table scale and truncate to integer */
414 rt
= _mm_mul_ps(r02
,vftabscale
);
415 vfitab
= _mm_cvttps_epi32(rt
);
416 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
417 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
419 /* CUBIC SPLINE TABLE ELECTROSTATICS */
420 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
421 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
422 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
423 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
424 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
425 Heps
= _mm_mul_ps(vfeps
,H
);
426 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
427 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
428 velec
= _mm_mul_ps(qq02
,VV
);
429 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
430 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum
= _mm_add_ps(velecsum
,velec
);
437 /* Calculate temporary vectorial force */
438 tx
= _mm_mul_ps(fscal
,dx02
);
439 ty
= _mm_mul_ps(fscal
,dy02
);
440 tz
= _mm_mul_ps(fscal
,dz02
);
442 /* Update vectorial force */
443 fix0
= _mm_add_ps(fix0
,tx
);
444 fiy0
= _mm_add_ps(fiy0
,ty
);
445 fiz0
= _mm_add_ps(fiz0
,tz
);
447 fjx2
= _mm_add_ps(fjx2
,tx
);
448 fjy2
= _mm_add_ps(fjy2
,ty
);
449 fjz2
= _mm_add_ps(fjz2
,tz
);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 r10
= _mm_mul_ps(rsq10
,rinv10
);
457 /* Calculate table index by multiplying r with table scale and truncate to integer */
458 rt
= _mm_mul_ps(r10
,vftabscale
);
459 vfitab
= _mm_cvttps_epi32(rt
);
460 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
461 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
463 /* CUBIC SPLINE TABLE ELECTROSTATICS */
464 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
465 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
466 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
467 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
468 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
469 Heps
= _mm_mul_ps(vfeps
,H
);
470 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
471 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
472 velec
= _mm_mul_ps(qq10
,VV
);
473 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
474 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
476 /* Update potential sum for this i atom from the interaction with this j atom. */
477 velecsum
= _mm_add_ps(velecsum
,velec
);
481 /* Calculate temporary vectorial force */
482 tx
= _mm_mul_ps(fscal
,dx10
);
483 ty
= _mm_mul_ps(fscal
,dy10
);
484 tz
= _mm_mul_ps(fscal
,dz10
);
486 /* Update vectorial force */
487 fix1
= _mm_add_ps(fix1
,tx
);
488 fiy1
= _mm_add_ps(fiy1
,ty
);
489 fiz1
= _mm_add_ps(fiz1
,tz
);
491 fjx0
= _mm_add_ps(fjx0
,tx
);
492 fjy0
= _mm_add_ps(fjy0
,ty
);
493 fjz0
= _mm_add_ps(fjz0
,tz
);
495 /**************************
496 * CALCULATE INTERACTIONS *
497 **************************/
499 r11
= _mm_mul_ps(rsq11
,rinv11
);
501 /* Calculate table index by multiplying r with table scale and truncate to integer */
502 rt
= _mm_mul_ps(r11
,vftabscale
);
503 vfitab
= _mm_cvttps_epi32(rt
);
504 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
505 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
507 /* CUBIC SPLINE TABLE ELECTROSTATICS */
508 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
509 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
510 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
511 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
512 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
513 Heps
= _mm_mul_ps(vfeps
,H
);
514 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
515 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
516 velec
= _mm_mul_ps(qq11
,VV
);
517 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
518 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velecsum
= _mm_add_ps(velecsum
,velec
);
525 /* Calculate temporary vectorial force */
526 tx
= _mm_mul_ps(fscal
,dx11
);
527 ty
= _mm_mul_ps(fscal
,dy11
);
528 tz
= _mm_mul_ps(fscal
,dz11
);
530 /* Update vectorial force */
531 fix1
= _mm_add_ps(fix1
,tx
);
532 fiy1
= _mm_add_ps(fiy1
,ty
);
533 fiz1
= _mm_add_ps(fiz1
,tz
);
535 fjx1
= _mm_add_ps(fjx1
,tx
);
536 fjy1
= _mm_add_ps(fjy1
,ty
);
537 fjz1
= _mm_add_ps(fjz1
,tz
);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 r12
= _mm_mul_ps(rsq12
,rinv12
);
545 /* Calculate table index by multiplying r with table scale and truncate to integer */
546 rt
= _mm_mul_ps(r12
,vftabscale
);
547 vfitab
= _mm_cvttps_epi32(rt
);
548 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
549 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
551 /* CUBIC SPLINE TABLE ELECTROSTATICS */
552 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
553 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
554 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
555 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
556 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
557 Heps
= _mm_mul_ps(vfeps
,H
);
558 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
559 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
560 velec
= _mm_mul_ps(qq12
,VV
);
561 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
562 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
564 /* Update potential sum for this i atom from the interaction with this j atom. */
565 velecsum
= _mm_add_ps(velecsum
,velec
);
569 /* Calculate temporary vectorial force */
570 tx
= _mm_mul_ps(fscal
,dx12
);
571 ty
= _mm_mul_ps(fscal
,dy12
);
572 tz
= _mm_mul_ps(fscal
,dz12
);
574 /* Update vectorial force */
575 fix1
= _mm_add_ps(fix1
,tx
);
576 fiy1
= _mm_add_ps(fiy1
,ty
);
577 fiz1
= _mm_add_ps(fiz1
,tz
);
579 fjx2
= _mm_add_ps(fjx2
,tx
);
580 fjy2
= _mm_add_ps(fjy2
,ty
);
581 fjz2
= _mm_add_ps(fjz2
,tz
);
583 /**************************
584 * CALCULATE INTERACTIONS *
585 **************************/
587 r20
= _mm_mul_ps(rsq20
,rinv20
);
589 /* Calculate table index by multiplying r with table scale and truncate to integer */
590 rt
= _mm_mul_ps(r20
,vftabscale
);
591 vfitab
= _mm_cvttps_epi32(rt
);
592 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
593 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
595 /* CUBIC SPLINE TABLE ELECTROSTATICS */
596 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
597 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
598 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
599 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
600 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
601 Heps
= _mm_mul_ps(vfeps
,H
);
602 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
603 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
604 velec
= _mm_mul_ps(qq20
,VV
);
605 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
606 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velecsum
= _mm_add_ps(velecsum
,velec
);
613 /* Calculate temporary vectorial force */
614 tx
= _mm_mul_ps(fscal
,dx20
);
615 ty
= _mm_mul_ps(fscal
,dy20
);
616 tz
= _mm_mul_ps(fscal
,dz20
);
618 /* Update vectorial force */
619 fix2
= _mm_add_ps(fix2
,tx
);
620 fiy2
= _mm_add_ps(fiy2
,ty
);
621 fiz2
= _mm_add_ps(fiz2
,tz
);
623 fjx0
= _mm_add_ps(fjx0
,tx
);
624 fjy0
= _mm_add_ps(fjy0
,ty
);
625 fjz0
= _mm_add_ps(fjz0
,tz
);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 r21
= _mm_mul_ps(rsq21
,rinv21
);
633 /* Calculate table index by multiplying r with table scale and truncate to integer */
634 rt
= _mm_mul_ps(r21
,vftabscale
);
635 vfitab
= _mm_cvttps_epi32(rt
);
636 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
637 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
639 /* CUBIC SPLINE TABLE ELECTROSTATICS */
640 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
641 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
642 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
643 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
644 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
645 Heps
= _mm_mul_ps(vfeps
,H
);
646 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
647 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
648 velec
= _mm_mul_ps(qq21
,VV
);
649 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
650 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velecsum
= _mm_add_ps(velecsum
,velec
);
657 /* Calculate temporary vectorial force */
658 tx
= _mm_mul_ps(fscal
,dx21
);
659 ty
= _mm_mul_ps(fscal
,dy21
);
660 tz
= _mm_mul_ps(fscal
,dz21
);
662 /* Update vectorial force */
663 fix2
= _mm_add_ps(fix2
,tx
);
664 fiy2
= _mm_add_ps(fiy2
,ty
);
665 fiz2
= _mm_add_ps(fiz2
,tz
);
667 fjx1
= _mm_add_ps(fjx1
,tx
);
668 fjy1
= _mm_add_ps(fjy1
,ty
);
669 fjz1
= _mm_add_ps(fjz1
,tz
);
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
675 r22
= _mm_mul_ps(rsq22
,rinv22
);
677 /* Calculate table index by multiplying r with table scale and truncate to integer */
678 rt
= _mm_mul_ps(r22
,vftabscale
);
679 vfitab
= _mm_cvttps_epi32(rt
);
680 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
681 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
683 /* CUBIC SPLINE TABLE ELECTROSTATICS */
684 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
685 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
686 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
687 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
688 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
689 Heps
= _mm_mul_ps(vfeps
,H
);
690 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
691 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
692 velec
= _mm_mul_ps(qq22
,VV
);
693 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
694 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
696 /* Update potential sum for this i atom from the interaction with this j atom. */
697 velecsum
= _mm_add_ps(velecsum
,velec
);
701 /* Calculate temporary vectorial force */
702 tx
= _mm_mul_ps(fscal
,dx22
);
703 ty
= _mm_mul_ps(fscal
,dy22
);
704 tz
= _mm_mul_ps(fscal
,dz22
);
706 /* Update vectorial force */
707 fix2
= _mm_add_ps(fix2
,tx
);
708 fiy2
= _mm_add_ps(fiy2
,ty
);
709 fiz2
= _mm_add_ps(fiz2
,tz
);
711 fjx2
= _mm_add_ps(fjx2
,tx
);
712 fjy2
= _mm_add_ps(fjy2
,ty
);
713 fjz2
= _mm_add_ps(fjz2
,tz
);
715 fjptrA
= f
+j_coord_offsetA
;
716 fjptrB
= f
+j_coord_offsetB
;
717 fjptrC
= f
+j_coord_offsetC
;
718 fjptrD
= f
+j_coord_offsetD
;
720 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
721 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
723 /* Inner loop uses 417 flops */
729 /* Get j neighbor index, and coordinate index */
730 jnrlistA
= jjnr
[jidx
];
731 jnrlistB
= jjnr
[jidx
+1];
732 jnrlistC
= jjnr
[jidx
+2];
733 jnrlistD
= jjnr
[jidx
+3];
734 /* Sign of each element will be negative for non-real atoms.
735 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
736 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
738 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
739 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
740 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
741 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
742 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
743 j_coord_offsetA
= DIM
*jnrA
;
744 j_coord_offsetB
= DIM
*jnrB
;
745 j_coord_offsetC
= DIM
*jnrC
;
746 j_coord_offsetD
= DIM
*jnrD
;
748 /* load j atom coordinates */
749 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
750 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
751 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
753 /* Calculate displacement vector */
754 dx00
= _mm_sub_ps(ix0
,jx0
);
755 dy00
= _mm_sub_ps(iy0
,jy0
);
756 dz00
= _mm_sub_ps(iz0
,jz0
);
757 dx01
= _mm_sub_ps(ix0
,jx1
);
758 dy01
= _mm_sub_ps(iy0
,jy1
);
759 dz01
= _mm_sub_ps(iz0
,jz1
);
760 dx02
= _mm_sub_ps(ix0
,jx2
);
761 dy02
= _mm_sub_ps(iy0
,jy2
);
762 dz02
= _mm_sub_ps(iz0
,jz2
);
763 dx10
= _mm_sub_ps(ix1
,jx0
);
764 dy10
= _mm_sub_ps(iy1
,jy0
);
765 dz10
= _mm_sub_ps(iz1
,jz0
);
766 dx11
= _mm_sub_ps(ix1
,jx1
);
767 dy11
= _mm_sub_ps(iy1
,jy1
);
768 dz11
= _mm_sub_ps(iz1
,jz1
);
769 dx12
= _mm_sub_ps(ix1
,jx2
);
770 dy12
= _mm_sub_ps(iy1
,jy2
);
771 dz12
= _mm_sub_ps(iz1
,jz2
);
772 dx20
= _mm_sub_ps(ix2
,jx0
);
773 dy20
= _mm_sub_ps(iy2
,jy0
);
774 dz20
= _mm_sub_ps(iz2
,jz0
);
775 dx21
= _mm_sub_ps(ix2
,jx1
);
776 dy21
= _mm_sub_ps(iy2
,jy1
);
777 dz21
= _mm_sub_ps(iz2
,jz1
);
778 dx22
= _mm_sub_ps(ix2
,jx2
);
779 dy22
= _mm_sub_ps(iy2
,jy2
);
780 dz22
= _mm_sub_ps(iz2
,jz2
);
782 /* Calculate squared distance and things based on it */
783 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
784 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
785 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
786 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
787 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
788 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
789 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
790 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
791 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
793 rinv00
= sse2_invsqrt_f(rsq00
);
794 rinv01
= sse2_invsqrt_f(rsq01
);
795 rinv02
= sse2_invsqrt_f(rsq02
);
796 rinv10
= sse2_invsqrt_f(rsq10
);
797 rinv11
= sse2_invsqrt_f(rsq11
);
798 rinv12
= sse2_invsqrt_f(rsq12
);
799 rinv20
= sse2_invsqrt_f(rsq20
);
800 rinv21
= sse2_invsqrt_f(rsq21
);
801 rinv22
= sse2_invsqrt_f(rsq22
);
803 fjx0
= _mm_setzero_ps();
804 fjy0
= _mm_setzero_ps();
805 fjz0
= _mm_setzero_ps();
806 fjx1
= _mm_setzero_ps();
807 fjy1
= _mm_setzero_ps();
808 fjz1
= _mm_setzero_ps();
809 fjx2
= _mm_setzero_ps();
810 fjy2
= _mm_setzero_ps();
811 fjz2
= _mm_setzero_ps();
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
817 r00
= _mm_mul_ps(rsq00
,rinv00
);
818 r00
= _mm_andnot_ps(dummy_mask
,r00
);
820 /* Calculate table index by multiplying r with table scale and truncate to integer */
821 rt
= _mm_mul_ps(r00
,vftabscale
);
822 vfitab
= _mm_cvttps_epi32(rt
);
823 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
824 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
826 /* CUBIC SPLINE TABLE ELECTROSTATICS */
827 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
828 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
829 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
830 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
831 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
832 Heps
= _mm_mul_ps(vfeps
,H
);
833 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
834 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
835 velec
= _mm_mul_ps(qq00
,VV
);
836 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
837 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
839 /* CUBIC SPLINE TABLE DISPERSION */
840 vfitab
= _mm_add_epi32(vfitab
,ifour
);
841 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
842 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
843 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
844 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
845 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
846 Heps
= _mm_mul_ps(vfeps
,H
);
847 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
848 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
849 vvdw6
= _mm_mul_ps(c6_00
,VV
);
850 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
851 fvdw6
= _mm_mul_ps(c6_00
,FF
);
853 /* CUBIC SPLINE TABLE REPULSION */
854 vfitab
= _mm_add_epi32(vfitab
,ifour
);
855 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
856 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
857 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
858 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
859 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
860 Heps
= _mm_mul_ps(vfeps
,H
);
861 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
862 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
863 vvdw12
= _mm_mul_ps(c12_00
,VV
);
864 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
865 fvdw12
= _mm_mul_ps(c12_00
,FF
);
866 vvdw
= _mm_add_ps(vvdw12
,vvdw6
);
867 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
869 /* Update potential sum for this i atom from the interaction with this j atom. */
870 velec
= _mm_andnot_ps(dummy_mask
,velec
);
871 velecsum
= _mm_add_ps(velecsum
,velec
);
872 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
873 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
875 fscal
= _mm_add_ps(felec
,fvdw
);
877 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
879 /* Calculate temporary vectorial force */
880 tx
= _mm_mul_ps(fscal
,dx00
);
881 ty
= _mm_mul_ps(fscal
,dy00
);
882 tz
= _mm_mul_ps(fscal
,dz00
);
884 /* Update vectorial force */
885 fix0
= _mm_add_ps(fix0
,tx
);
886 fiy0
= _mm_add_ps(fiy0
,ty
);
887 fiz0
= _mm_add_ps(fiz0
,tz
);
889 fjx0
= _mm_add_ps(fjx0
,tx
);
890 fjy0
= _mm_add_ps(fjy0
,ty
);
891 fjz0
= _mm_add_ps(fjz0
,tz
);
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 r01
= _mm_mul_ps(rsq01
,rinv01
);
898 r01
= _mm_andnot_ps(dummy_mask
,r01
);
900 /* Calculate table index by multiplying r with table scale and truncate to integer */
901 rt
= _mm_mul_ps(r01
,vftabscale
);
902 vfitab
= _mm_cvttps_epi32(rt
);
903 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
904 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
906 /* CUBIC SPLINE TABLE ELECTROSTATICS */
907 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
908 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
909 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
910 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
911 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
912 Heps
= _mm_mul_ps(vfeps
,H
);
913 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
914 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
915 velec
= _mm_mul_ps(qq01
,VV
);
916 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
917 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec
= _mm_andnot_ps(dummy_mask
,velec
);
921 velecsum
= _mm_add_ps(velecsum
,velec
);
925 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
927 /* Calculate temporary vectorial force */
928 tx
= _mm_mul_ps(fscal
,dx01
);
929 ty
= _mm_mul_ps(fscal
,dy01
);
930 tz
= _mm_mul_ps(fscal
,dz01
);
932 /* Update vectorial force */
933 fix0
= _mm_add_ps(fix0
,tx
);
934 fiy0
= _mm_add_ps(fiy0
,ty
);
935 fiz0
= _mm_add_ps(fiz0
,tz
);
937 fjx1
= _mm_add_ps(fjx1
,tx
);
938 fjy1
= _mm_add_ps(fjy1
,ty
);
939 fjz1
= _mm_add_ps(fjz1
,tz
);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 r02
= _mm_mul_ps(rsq02
,rinv02
);
946 r02
= _mm_andnot_ps(dummy_mask
,r02
);
948 /* Calculate table index by multiplying r with table scale and truncate to integer */
949 rt
= _mm_mul_ps(r02
,vftabscale
);
950 vfitab
= _mm_cvttps_epi32(rt
);
951 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
952 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
954 /* CUBIC SPLINE TABLE ELECTROSTATICS */
955 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
956 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
957 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
958 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
959 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
960 Heps
= _mm_mul_ps(vfeps
,H
);
961 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
962 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
963 velec
= _mm_mul_ps(qq02
,VV
);
964 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
965 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec
= _mm_andnot_ps(dummy_mask
,velec
);
969 velecsum
= _mm_add_ps(velecsum
,velec
);
973 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
975 /* Calculate temporary vectorial force */
976 tx
= _mm_mul_ps(fscal
,dx02
);
977 ty
= _mm_mul_ps(fscal
,dy02
);
978 tz
= _mm_mul_ps(fscal
,dz02
);
980 /* Update vectorial force */
981 fix0
= _mm_add_ps(fix0
,tx
);
982 fiy0
= _mm_add_ps(fiy0
,ty
);
983 fiz0
= _mm_add_ps(fiz0
,tz
);
985 fjx2
= _mm_add_ps(fjx2
,tx
);
986 fjy2
= _mm_add_ps(fjy2
,ty
);
987 fjz2
= _mm_add_ps(fjz2
,tz
);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 r10
= _mm_mul_ps(rsq10
,rinv10
);
994 r10
= _mm_andnot_ps(dummy_mask
,r10
);
996 /* Calculate table index by multiplying r with table scale and truncate to integer */
997 rt
= _mm_mul_ps(r10
,vftabscale
);
998 vfitab
= _mm_cvttps_epi32(rt
);
999 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1000 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1002 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1003 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1004 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1005 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1006 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1007 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1008 Heps
= _mm_mul_ps(vfeps
,H
);
1009 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1010 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1011 velec
= _mm_mul_ps(qq10
,VV
);
1012 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1013 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
1015 /* Update potential sum for this i atom from the interaction with this j atom. */
1016 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1017 velecsum
= _mm_add_ps(velecsum
,velec
);
1021 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1023 /* Calculate temporary vectorial force */
1024 tx
= _mm_mul_ps(fscal
,dx10
);
1025 ty
= _mm_mul_ps(fscal
,dy10
);
1026 tz
= _mm_mul_ps(fscal
,dz10
);
1028 /* Update vectorial force */
1029 fix1
= _mm_add_ps(fix1
,tx
);
1030 fiy1
= _mm_add_ps(fiy1
,ty
);
1031 fiz1
= _mm_add_ps(fiz1
,tz
);
1033 fjx0
= _mm_add_ps(fjx0
,tx
);
1034 fjy0
= _mm_add_ps(fjy0
,ty
);
1035 fjz0
= _mm_add_ps(fjz0
,tz
);
1037 /**************************
1038 * CALCULATE INTERACTIONS *
1039 **************************/
1041 r11
= _mm_mul_ps(rsq11
,rinv11
);
1042 r11
= _mm_andnot_ps(dummy_mask
,r11
);
1044 /* Calculate table index by multiplying r with table scale and truncate to integer */
1045 rt
= _mm_mul_ps(r11
,vftabscale
);
1046 vfitab
= _mm_cvttps_epi32(rt
);
1047 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1048 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1050 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1051 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1052 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1053 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1054 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1055 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1056 Heps
= _mm_mul_ps(vfeps
,H
);
1057 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1058 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1059 velec
= _mm_mul_ps(qq11
,VV
);
1060 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1061 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
1063 /* Update potential sum for this i atom from the interaction with this j atom. */
1064 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1065 velecsum
= _mm_add_ps(velecsum
,velec
);
1069 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1071 /* Calculate temporary vectorial force */
1072 tx
= _mm_mul_ps(fscal
,dx11
);
1073 ty
= _mm_mul_ps(fscal
,dy11
);
1074 tz
= _mm_mul_ps(fscal
,dz11
);
1076 /* Update vectorial force */
1077 fix1
= _mm_add_ps(fix1
,tx
);
1078 fiy1
= _mm_add_ps(fiy1
,ty
);
1079 fiz1
= _mm_add_ps(fiz1
,tz
);
1081 fjx1
= _mm_add_ps(fjx1
,tx
);
1082 fjy1
= _mm_add_ps(fjy1
,ty
);
1083 fjz1
= _mm_add_ps(fjz1
,tz
);
1085 /**************************
1086 * CALCULATE INTERACTIONS *
1087 **************************/
1089 r12
= _mm_mul_ps(rsq12
,rinv12
);
1090 r12
= _mm_andnot_ps(dummy_mask
,r12
);
1092 /* Calculate table index by multiplying r with table scale and truncate to integer */
1093 rt
= _mm_mul_ps(r12
,vftabscale
);
1094 vfitab
= _mm_cvttps_epi32(rt
);
1095 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1096 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1098 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1099 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1100 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1101 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1102 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1103 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1104 Heps
= _mm_mul_ps(vfeps
,H
);
1105 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1106 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1107 velec
= _mm_mul_ps(qq12
,VV
);
1108 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1109 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
1111 /* Update potential sum for this i atom from the interaction with this j atom. */
1112 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1113 velecsum
= _mm_add_ps(velecsum
,velec
);
1117 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1119 /* Calculate temporary vectorial force */
1120 tx
= _mm_mul_ps(fscal
,dx12
);
1121 ty
= _mm_mul_ps(fscal
,dy12
);
1122 tz
= _mm_mul_ps(fscal
,dz12
);
1124 /* Update vectorial force */
1125 fix1
= _mm_add_ps(fix1
,tx
);
1126 fiy1
= _mm_add_ps(fiy1
,ty
);
1127 fiz1
= _mm_add_ps(fiz1
,tz
);
1129 fjx2
= _mm_add_ps(fjx2
,tx
);
1130 fjy2
= _mm_add_ps(fjy2
,ty
);
1131 fjz2
= _mm_add_ps(fjz2
,tz
);
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1137 r20
= _mm_mul_ps(rsq20
,rinv20
);
1138 r20
= _mm_andnot_ps(dummy_mask
,r20
);
1140 /* Calculate table index by multiplying r with table scale and truncate to integer */
1141 rt
= _mm_mul_ps(r20
,vftabscale
);
1142 vfitab
= _mm_cvttps_epi32(rt
);
1143 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1144 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1146 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1147 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1148 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1149 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1150 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1151 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1152 Heps
= _mm_mul_ps(vfeps
,H
);
1153 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1154 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1155 velec
= _mm_mul_ps(qq20
,VV
);
1156 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1157 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
1159 /* Update potential sum for this i atom from the interaction with this j atom. */
1160 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1161 velecsum
= _mm_add_ps(velecsum
,velec
);
1165 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1167 /* Calculate temporary vectorial force */
1168 tx
= _mm_mul_ps(fscal
,dx20
);
1169 ty
= _mm_mul_ps(fscal
,dy20
);
1170 tz
= _mm_mul_ps(fscal
,dz20
);
1172 /* Update vectorial force */
1173 fix2
= _mm_add_ps(fix2
,tx
);
1174 fiy2
= _mm_add_ps(fiy2
,ty
);
1175 fiz2
= _mm_add_ps(fiz2
,tz
);
1177 fjx0
= _mm_add_ps(fjx0
,tx
);
1178 fjy0
= _mm_add_ps(fjy0
,ty
);
1179 fjz0
= _mm_add_ps(fjz0
,tz
);
1181 /**************************
1182 * CALCULATE INTERACTIONS *
1183 **************************/
1185 r21
= _mm_mul_ps(rsq21
,rinv21
);
1186 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1188 /* Calculate table index by multiplying r with table scale and truncate to integer */
1189 rt
= _mm_mul_ps(r21
,vftabscale
);
1190 vfitab
= _mm_cvttps_epi32(rt
);
1191 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1192 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1194 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1195 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1196 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1197 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1198 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1199 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1200 Heps
= _mm_mul_ps(vfeps
,H
);
1201 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1202 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1203 velec
= _mm_mul_ps(qq21
,VV
);
1204 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1205 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1207 /* Update potential sum for this i atom from the interaction with this j atom. */
1208 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1209 velecsum
= _mm_add_ps(velecsum
,velec
);
1213 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1215 /* Calculate temporary vectorial force */
1216 tx
= _mm_mul_ps(fscal
,dx21
);
1217 ty
= _mm_mul_ps(fscal
,dy21
);
1218 tz
= _mm_mul_ps(fscal
,dz21
);
1220 /* Update vectorial force */
1221 fix2
= _mm_add_ps(fix2
,tx
);
1222 fiy2
= _mm_add_ps(fiy2
,ty
);
1223 fiz2
= _mm_add_ps(fiz2
,tz
);
1225 fjx1
= _mm_add_ps(fjx1
,tx
);
1226 fjy1
= _mm_add_ps(fjy1
,ty
);
1227 fjz1
= _mm_add_ps(fjz1
,tz
);
1229 /**************************
1230 * CALCULATE INTERACTIONS *
1231 **************************/
1233 r22
= _mm_mul_ps(rsq22
,rinv22
);
1234 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1236 /* Calculate table index by multiplying r with table scale and truncate to integer */
1237 rt
= _mm_mul_ps(r22
,vftabscale
);
1238 vfitab
= _mm_cvttps_epi32(rt
);
1239 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1240 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1242 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1243 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1244 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1245 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1246 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1247 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1248 Heps
= _mm_mul_ps(vfeps
,H
);
1249 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1250 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1251 velec
= _mm_mul_ps(qq22
,VV
);
1252 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1253 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1255 /* Update potential sum for this i atom from the interaction with this j atom. */
1256 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1257 velecsum
= _mm_add_ps(velecsum
,velec
);
1261 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1263 /* Calculate temporary vectorial force */
1264 tx
= _mm_mul_ps(fscal
,dx22
);
1265 ty
= _mm_mul_ps(fscal
,dy22
);
1266 tz
= _mm_mul_ps(fscal
,dz22
);
1268 /* Update vectorial force */
1269 fix2
= _mm_add_ps(fix2
,tx
);
1270 fiy2
= _mm_add_ps(fiy2
,ty
);
1271 fiz2
= _mm_add_ps(fiz2
,tz
);
1273 fjx2
= _mm_add_ps(fjx2
,tx
);
1274 fjy2
= _mm_add_ps(fjy2
,ty
);
1275 fjz2
= _mm_add_ps(fjz2
,tz
);
1277 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1278 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1279 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1280 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1282 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1283 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1285 /* Inner loop uses 426 flops */
1288 /* End of innermost loop */
1290 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1291 f
+i_coord_offset
,fshift
+i_shift_offset
);
1294 /* Update potential energies */
1295 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1296 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1298 /* Increment number of inner iterations */
1299 inneriter
+= j_index_end
- j_index_start
;
1301 /* Outer loop uses 20 flops */
1304 /* Increment number of outer iterations */
1307 /* Update outer/inner flops */
1309 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*426);
1312 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1313 * Electrostatics interaction: CubicSplineTable
1314 * VdW interaction: CubicSplineTable
1315 * Geometry: Water3-Water3
1316 * Calculate force/pot: Force
1319 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1320 (t_nblist
* gmx_restrict nlist
,
1321 rvec
* gmx_restrict xx
,
1322 rvec
* gmx_restrict ff
,
1323 struct t_forcerec
* gmx_restrict fr
,
1324 t_mdatoms
* gmx_restrict mdatoms
,
1325 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1326 t_nrnb
* gmx_restrict nrnb
)
1328 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1329 * just 0 for non-waters.
1330 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1331 * jnr indices corresponding to data put in the four positions in the SIMD register.
1333 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1334 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1335 int jnrA
,jnrB
,jnrC
,jnrD
;
1336 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1337 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1338 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1339 real rcutoff_scalar
;
1340 real
*shiftvec
,*fshift
,*x
,*f
;
1341 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1342 real scratch
[4*DIM
];
1343 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1345 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1347 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1349 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1350 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1351 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1352 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1353 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1354 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1355 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1356 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1357 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1358 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1359 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1360 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1361 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1362 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1363 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1364 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1365 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1368 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1371 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1372 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1374 __m128i ifour
= _mm_set1_epi32(4);
1375 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1377 __m128 dummy_mask
,cutoff_mask
;
1378 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1379 __m128 one
= _mm_set1_ps(1.0);
1380 __m128 two
= _mm_set1_ps(2.0);
1386 jindex
= nlist
->jindex
;
1388 shiftidx
= nlist
->shift
;
1390 shiftvec
= fr
->shift_vec
[0];
1391 fshift
= fr
->fshift
[0];
1392 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1393 charge
= mdatoms
->chargeA
;
1394 nvdwtype
= fr
->ntype
;
1395 vdwparam
= fr
->nbfp
;
1396 vdwtype
= mdatoms
->typeA
;
1398 vftab
= kernel_data
->table_elec_vdw
->data
;
1399 vftabscale
= _mm_set1_ps(kernel_data
->table_elec_vdw
->scale
);
1401 /* Setup water-specific parameters */
1402 inr
= nlist
->iinr
[0];
1403 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1404 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1405 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1406 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1408 jq0
= _mm_set1_ps(charge
[inr
+0]);
1409 jq1
= _mm_set1_ps(charge
[inr
+1]);
1410 jq2
= _mm_set1_ps(charge
[inr
+2]);
1411 vdwjidx0A
= 2*vdwtype
[inr
+0];
1412 qq00
= _mm_mul_ps(iq0
,jq0
);
1413 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1414 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1415 qq01
= _mm_mul_ps(iq0
,jq1
);
1416 qq02
= _mm_mul_ps(iq0
,jq2
);
1417 qq10
= _mm_mul_ps(iq1
,jq0
);
1418 qq11
= _mm_mul_ps(iq1
,jq1
);
1419 qq12
= _mm_mul_ps(iq1
,jq2
);
1420 qq20
= _mm_mul_ps(iq2
,jq0
);
1421 qq21
= _mm_mul_ps(iq2
,jq1
);
1422 qq22
= _mm_mul_ps(iq2
,jq2
);
1424 /* Avoid stupid compiler warnings */
1425 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1426 j_coord_offsetA
= 0;
1427 j_coord_offsetB
= 0;
1428 j_coord_offsetC
= 0;
1429 j_coord_offsetD
= 0;
1434 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1436 scratch
[iidx
] = 0.0;
1439 /* Start outer loop over neighborlists */
1440 for(iidx
=0; iidx
<nri
; iidx
++)
1442 /* Load shift vector for this list */
1443 i_shift_offset
= DIM
*shiftidx
[iidx
];
1445 /* Load limits for loop over neighbors */
1446 j_index_start
= jindex
[iidx
];
1447 j_index_end
= jindex
[iidx
+1];
1449 /* Get outer coordinate index */
1451 i_coord_offset
= DIM
*inr
;
1453 /* Load i particle coords and add shift vector */
1454 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1455 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1457 fix0
= _mm_setzero_ps();
1458 fiy0
= _mm_setzero_ps();
1459 fiz0
= _mm_setzero_ps();
1460 fix1
= _mm_setzero_ps();
1461 fiy1
= _mm_setzero_ps();
1462 fiz1
= _mm_setzero_ps();
1463 fix2
= _mm_setzero_ps();
1464 fiy2
= _mm_setzero_ps();
1465 fiz2
= _mm_setzero_ps();
1467 /* Start inner kernel loop */
1468 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1471 /* Get j neighbor index, and coordinate index */
1473 jnrB
= jjnr
[jidx
+1];
1474 jnrC
= jjnr
[jidx
+2];
1475 jnrD
= jjnr
[jidx
+3];
1476 j_coord_offsetA
= DIM
*jnrA
;
1477 j_coord_offsetB
= DIM
*jnrB
;
1478 j_coord_offsetC
= DIM
*jnrC
;
1479 j_coord_offsetD
= DIM
*jnrD
;
1481 /* load j atom coordinates */
1482 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1483 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1484 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1486 /* Calculate displacement vector */
1487 dx00
= _mm_sub_ps(ix0
,jx0
);
1488 dy00
= _mm_sub_ps(iy0
,jy0
);
1489 dz00
= _mm_sub_ps(iz0
,jz0
);
1490 dx01
= _mm_sub_ps(ix0
,jx1
);
1491 dy01
= _mm_sub_ps(iy0
,jy1
);
1492 dz01
= _mm_sub_ps(iz0
,jz1
);
1493 dx02
= _mm_sub_ps(ix0
,jx2
);
1494 dy02
= _mm_sub_ps(iy0
,jy2
);
1495 dz02
= _mm_sub_ps(iz0
,jz2
);
1496 dx10
= _mm_sub_ps(ix1
,jx0
);
1497 dy10
= _mm_sub_ps(iy1
,jy0
);
1498 dz10
= _mm_sub_ps(iz1
,jz0
);
1499 dx11
= _mm_sub_ps(ix1
,jx1
);
1500 dy11
= _mm_sub_ps(iy1
,jy1
);
1501 dz11
= _mm_sub_ps(iz1
,jz1
);
1502 dx12
= _mm_sub_ps(ix1
,jx2
);
1503 dy12
= _mm_sub_ps(iy1
,jy2
);
1504 dz12
= _mm_sub_ps(iz1
,jz2
);
1505 dx20
= _mm_sub_ps(ix2
,jx0
);
1506 dy20
= _mm_sub_ps(iy2
,jy0
);
1507 dz20
= _mm_sub_ps(iz2
,jz0
);
1508 dx21
= _mm_sub_ps(ix2
,jx1
);
1509 dy21
= _mm_sub_ps(iy2
,jy1
);
1510 dz21
= _mm_sub_ps(iz2
,jz1
);
1511 dx22
= _mm_sub_ps(ix2
,jx2
);
1512 dy22
= _mm_sub_ps(iy2
,jy2
);
1513 dz22
= _mm_sub_ps(iz2
,jz2
);
1515 /* Calculate squared distance and things based on it */
1516 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1517 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1518 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1519 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1520 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1521 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1522 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1523 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1524 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1526 rinv00
= sse2_invsqrt_f(rsq00
);
1527 rinv01
= sse2_invsqrt_f(rsq01
);
1528 rinv02
= sse2_invsqrt_f(rsq02
);
1529 rinv10
= sse2_invsqrt_f(rsq10
);
1530 rinv11
= sse2_invsqrt_f(rsq11
);
1531 rinv12
= sse2_invsqrt_f(rsq12
);
1532 rinv20
= sse2_invsqrt_f(rsq20
);
1533 rinv21
= sse2_invsqrt_f(rsq21
);
1534 rinv22
= sse2_invsqrt_f(rsq22
);
1536 fjx0
= _mm_setzero_ps();
1537 fjy0
= _mm_setzero_ps();
1538 fjz0
= _mm_setzero_ps();
1539 fjx1
= _mm_setzero_ps();
1540 fjy1
= _mm_setzero_ps();
1541 fjz1
= _mm_setzero_ps();
1542 fjx2
= _mm_setzero_ps();
1543 fjy2
= _mm_setzero_ps();
1544 fjz2
= _mm_setzero_ps();
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 r00
= _mm_mul_ps(rsq00
,rinv00
);
1552 /* Calculate table index by multiplying r with table scale and truncate to integer */
1553 rt
= _mm_mul_ps(r00
,vftabscale
);
1554 vfitab
= _mm_cvttps_epi32(rt
);
1555 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1556 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1558 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1559 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1560 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1561 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1562 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1563 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1564 Heps
= _mm_mul_ps(vfeps
,H
);
1565 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1566 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1567 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
1569 /* CUBIC SPLINE TABLE DISPERSION */
1570 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1571 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1572 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1573 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1574 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1575 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1576 Heps
= _mm_mul_ps(vfeps
,H
);
1577 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1578 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1579 fvdw6
= _mm_mul_ps(c6_00
,FF
);
1581 /* CUBIC SPLINE TABLE REPULSION */
1582 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1583 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1584 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1585 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1586 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1587 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1588 Heps
= _mm_mul_ps(vfeps
,H
);
1589 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1590 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1591 fvdw12
= _mm_mul_ps(c12_00
,FF
);
1592 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
1594 fscal
= _mm_add_ps(felec
,fvdw
);
1596 /* Calculate temporary vectorial force */
1597 tx
= _mm_mul_ps(fscal
,dx00
);
1598 ty
= _mm_mul_ps(fscal
,dy00
);
1599 tz
= _mm_mul_ps(fscal
,dz00
);
1601 /* Update vectorial force */
1602 fix0
= _mm_add_ps(fix0
,tx
);
1603 fiy0
= _mm_add_ps(fiy0
,ty
);
1604 fiz0
= _mm_add_ps(fiz0
,tz
);
1606 fjx0
= _mm_add_ps(fjx0
,tx
);
1607 fjy0
= _mm_add_ps(fjy0
,ty
);
1608 fjz0
= _mm_add_ps(fjz0
,tz
);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r01
= _mm_mul_ps(rsq01
,rinv01
);
1616 /* Calculate table index by multiplying r with table scale and truncate to integer */
1617 rt
= _mm_mul_ps(r01
,vftabscale
);
1618 vfitab
= _mm_cvttps_epi32(rt
);
1619 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1620 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1623 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1624 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1625 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1626 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1627 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1628 Heps
= _mm_mul_ps(vfeps
,H
);
1629 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1630 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1631 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
1635 /* Calculate temporary vectorial force */
1636 tx
= _mm_mul_ps(fscal
,dx01
);
1637 ty
= _mm_mul_ps(fscal
,dy01
);
1638 tz
= _mm_mul_ps(fscal
,dz01
);
1640 /* Update vectorial force */
1641 fix0
= _mm_add_ps(fix0
,tx
);
1642 fiy0
= _mm_add_ps(fiy0
,ty
);
1643 fiz0
= _mm_add_ps(fiz0
,tz
);
1645 fjx1
= _mm_add_ps(fjx1
,tx
);
1646 fjy1
= _mm_add_ps(fjy1
,ty
);
1647 fjz1
= _mm_add_ps(fjz1
,tz
);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r02
= _mm_mul_ps(rsq02
,rinv02
);
1655 /* Calculate table index by multiplying r with table scale and truncate to integer */
1656 rt
= _mm_mul_ps(r02
,vftabscale
);
1657 vfitab
= _mm_cvttps_epi32(rt
);
1658 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1659 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1661 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1662 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1663 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1664 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1665 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1666 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1667 Heps
= _mm_mul_ps(vfeps
,H
);
1668 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1669 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1670 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
1674 /* Calculate temporary vectorial force */
1675 tx
= _mm_mul_ps(fscal
,dx02
);
1676 ty
= _mm_mul_ps(fscal
,dy02
);
1677 tz
= _mm_mul_ps(fscal
,dz02
);
1679 /* Update vectorial force */
1680 fix0
= _mm_add_ps(fix0
,tx
);
1681 fiy0
= _mm_add_ps(fiy0
,ty
);
1682 fiz0
= _mm_add_ps(fiz0
,tz
);
1684 fjx2
= _mm_add_ps(fjx2
,tx
);
1685 fjy2
= _mm_add_ps(fjy2
,ty
);
1686 fjz2
= _mm_add_ps(fjz2
,tz
);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 r10
= _mm_mul_ps(rsq10
,rinv10
);
1694 /* Calculate table index by multiplying r with table scale and truncate to integer */
1695 rt
= _mm_mul_ps(r10
,vftabscale
);
1696 vfitab
= _mm_cvttps_epi32(rt
);
1697 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1698 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1702 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1703 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1704 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1705 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1706 Heps
= _mm_mul_ps(vfeps
,H
);
1707 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1708 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1709 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
1713 /* Calculate temporary vectorial force */
1714 tx
= _mm_mul_ps(fscal
,dx10
);
1715 ty
= _mm_mul_ps(fscal
,dy10
);
1716 tz
= _mm_mul_ps(fscal
,dz10
);
1718 /* Update vectorial force */
1719 fix1
= _mm_add_ps(fix1
,tx
);
1720 fiy1
= _mm_add_ps(fiy1
,ty
);
1721 fiz1
= _mm_add_ps(fiz1
,tz
);
1723 fjx0
= _mm_add_ps(fjx0
,tx
);
1724 fjy0
= _mm_add_ps(fjy0
,ty
);
1725 fjz0
= _mm_add_ps(fjz0
,tz
);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r11
= _mm_mul_ps(rsq11
,rinv11
);
1733 /* Calculate table index by multiplying r with table scale and truncate to integer */
1734 rt
= _mm_mul_ps(r11
,vftabscale
);
1735 vfitab
= _mm_cvttps_epi32(rt
);
1736 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1737 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1739 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1740 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1741 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1742 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1743 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1744 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1745 Heps
= _mm_mul_ps(vfeps
,H
);
1746 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1747 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1748 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
1752 /* Calculate temporary vectorial force */
1753 tx
= _mm_mul_ps(fscal
,dx11
);
1754 ty
= _mm_mul_ps(fscal
,dy11
);
1755 tz
= _mm_mul_ps(fscal
,dz11
);
1757 /* Update vectorial force */
1758 fix1
= _mm_add_ps(fix1
,tx
);
1759 fiy1
= _mm_add_ps(fiy1
,ty
);
1760 fiz1
= _mm_add_ps(fiz1
,tz
);
1762 fjx1
= _mm_add_ps(fjx1
,tx
);
1763 fjy1
= _mm_add_ps(fjy1
,ty
);
1764 fjz1
= _mm_add_ps(fjz1
,tz
);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 r12
= _mm_mul_ps(rsq12
,rinv12
);
1772 /* Calculate table index by multiplying r with table scale and truncate to integer */
1773 rt
= _mm_mul_ps(r12
,vftabscale
);
1774 vfitab
= _mm_cvttps_epi32(rt
);
1775 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1776 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1778 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1779 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1780 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1781 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1782 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1783 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1784 Heps
= _mm_mul_ps(vfeps
,H
);
1785 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1786 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1787 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
1791 /* Calculate temporary vectorial force */
1792 tx
= _mm_mul_ps(fscal
,dx12
);
1793 ty
= _mm_mul_ps(fscal
,dy12
);
1794 tz
= _mm_mul_ps(fscal
,dz12
);
1796 /* Update vectorial force */
1797 fix1
= _mm_add_ps(fix1
,tx
);
1798 fiy1
= _mm_add_ps(fiy1
,ty
);
1799 fiz1
= _mm_add_ps(fiz1
,tz
);
1801 fjx2
= _mm_add_ps(fjx2
,tx
);
1802 fjy2
= _mm_add_ps(fjy2
,ty
);
1803 fjz2
= _mm_add_ps(fjz2
,tz
);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 r20
= _mm_mul_ps(rsq20
,rinv20
);
1811 /* Calculate table index by multiplying r with table scale and truncate to integer */
1812 rt
= _mm_mul_ps(r20
,vftabscale
);
1813 vfitab
= _mm_cvttps_epi32(rt
);
1814 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1815 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1817 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1818 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1819 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1820 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1821 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1822 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1823 Heps
= _mm_mul_ps(vfeps
,H
);
1824 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1825 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1826 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
1830 /* Calculate temporary vectorial force */
1831 tx
= _mm_mul_ps(fscal
,dx20
);
1832 ty
= _mm_mul_ps(fscal
,dy20
);
1833 tz
= _mm_mul_ps(fscal
,dz20
);
1835 /* Update vectorial force */
1836 fix2
= _mm_add_ps(fix2
,tx
);
1837 fiy2
= _mm_add_ps(fiy2
,ty
);
1838 fiz2
= _mm_add_ps(fiz2
,tz
);
1840 fjx0
= _mm_add_ps(fjx0
,tx
);
1841 fjy0
= _mm_add_ps(fjy0
,ty
);
1842 fjz0
= _mm_add_ps(fjz0
,tz
);
1844 /**************************
1845 * CALCULATE INTERACTIONS *
1846 **************************/
1848 r21
= _mm_mul_ps(rsq21
,rinv21
);
1850 /* Calculate table index by multiplying r with table scale and truncate to integer */
1851 rt
= _mm_mul_ps(r21
,vftabscale
);
1852 vfitab
= _mm_cvttps_epi32(rt
);
1853 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1854 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1856 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1857 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1858 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1859 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1860 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1861 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1862 Heps
= _mm_mul_ps(vfeps
,H
);
1863 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1864 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1865 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1869 /* Calculate temporary vectorial force */
1870 tx
= _mm_mul_ps(fscal
,dx21
);
1871 ty
= _mm_mul_ps(fscal
,dy21
);
1872 tz
= _mm_mul_ps(fscal
,dz21
);
1874 /* Update vectorial force */
1875 fix2
= _mm_add_ps(fix2
,tx
);
1876 fiy2
= _mm_add_ps(fiy2
,ty
);
1877 fiz2
= _mm_add_ps(fiz2
,tz
);
1879 fjx1
= _mm_add_ps(fjx1
,tx
);
1880 fjy1
= _mm_add_ps(fjy1
,ty
);
1881 fjz1
= _mm_add_ps(fjz1
,tz
);
1883 /**************************
1884 * CALCULATE INTERACTIONS *
1885 **************************/
1887 r22
= _mm_mul_ps(rsq22
,rinv22
);
1889 /* Calculate table index by multiplying r with table scale and truncate to integer */
1890 rt
= _mm_mul_ps(r22
,vftabscale
);
1891 vfitab
= _mm_cvttps_epi32(rt
);
1892 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1893 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1895 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1896 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1897 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1898 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1899 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1900 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1901 Heps
= _mm_mul_ps(vfeps
,H
);
1902 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1903 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1904 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1908 /* Calculate temporary vectorial force */
1909 tx
= _mm_mul_ps(fscal
,dx22
);
1910 ty
= _mm_mul_ps(fscal
,dy22
);
1911 tz
= _mm_mul_ps(fscal
,dz22
);
1913 /* Update vectorial force */
1914 fix2
= _mm_add_ps(fix2
,tx
);
1915 fiy2
= _mm_add_ps(fiy2
,ty
);
1916 fiz2
= _mm_add_ps(fiz2
,tz
);
1918 fjx2
= _mm_add_ps(fjx2
,tx
);
1919 fjy2
= _mm_add_ps(fjy2
,ty
);
1920 fjz2
= _mm_add_ps(fjz2
,tz
);
1922 fjptrA
= f
+j_coord_offsetA
;
1923 fjptrB
= f
+j_coord_offsetB
;
1924 fjptrC
= f
+j_coord_offsetC
;
1925 fjptrD
= f
+j_coord_offsetD
;
1927 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1928 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1930 /* Inner loop uses 373 flops */
1933 if(jidx
<j_index_end
)
1936 /* Get j neighbor index, and coordinate index */
1937 jnrlistA
= jjnr
[jidx
];
1938 jnrlistB
= jjnr
[jidx
+1];
1939 jnrlistC
= jjnr
[jidx
+2];
1940 jnrlistD
= jjnr
[jidx
+3];
1941 /* Sign of each element will be negative for non-real atoms.
1942 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1943 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1945 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1946 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1947 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1948 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1949 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1950 j_coord_offsetA
= DIM
*jnrA
;
1951 j_coord_offsetB
= DIM
*jnrB
;
1952 j_coord_offsetC
= DIM
*jnrC
;
1953 j_coord_offsetD
= DIM
*jnrD
;
1955 /* load j atom coordinates */
1956 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1957 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1958 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1960 /* Calculate displacement vector */
1961 dx00
= _mm_sub_ps(ix0
,jx0
);
1962 dy00
= _mm_sub_ps(iy0
,jy0
);
1963 dz00
= _mm_sub_ps(iz0
,jz0
);
1964 dx01
= _mm_sub_ps(ix0
,jx1
);
1965 dy01
= _mm_sub_ps(iy0
,jy1
);
1966 dz01
= _mm_sub_ps(iz0
,jz1
);
1967 dx02
= _mm_sub_ps(ix0
,jx2
);
1968 dy02
= _mm_sub_ps(iy0
,jy2
);
1969 dz02
= _mm_sub_ps(iz0
,jz2
);
1970 dx10
= _mm_sub_ps(ix1
,jx0
);
1971 dy10
= _mm_sub_ps(iy1
,jy0
);
1972 dz10
= _mm_sub_ps(iz1
,jz0
);
1973 dx11
= _mm_sub_ps(ix1
,jx1
);
1974 dy11
= _mm_sub_ps(iy1
,jy1
);
1975 dz11
= _mm_sub_ps(iz1
,jz1
);
1976 dx12
= _mm_sub_ps(ix1
,jx2
);
1977 dy12
= _mm_sub_ps(iy1
,jy2
);
1978 dz12
= _mm_sub_ps(iz1
,jz2
);
1979 dx20
= _mm_sub_ps(ix2
,jx0
);
1980 dy20
= _mm_sub_ps(iy2
,jy0
);
1981 dz20
= _mm_sub_ps(iz2
,jz0
);
1982 dx21
= _mm_sub_ps(ix2
,jx1
);
1983 dy21
= _mm_sub_ps(iy2
,jy1
);
1984 dz21
= _mm_sub_ps(iz2
,jz1
);
1985 dx22
= _mm_sub_ps(ix2
,jx2
);
1986 dy22
= _mm_sub_ps(iy2
,jy2
);
1987 dz22
= _mm_sub_ps(iz2
,jz2
);
1989 /* Calculate squared distance and things based on it */
1990 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1991 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1992 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1993 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1994 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1995 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1996 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1997 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1998 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2000 rinv00
= sse2_invsqrt_f(rsq00
);
2001 rinv01
= sse2_invsqrt_f(rsq01
);
2002 rinv02
= sse2_invsqrt_f(rsq02
);
2003 rinv10
= sse2_invsqrt_f(rsq10
);
2004 rinv11
= sse2_invsqrt_f(rsq11
);
2005 rinv12
= sse2_invsqrt_f(rsq12
);
2006 rinv20
= sse2_invsqrt_f(rsq20
);
2007 rinv21
= sse2_invsqrt_f(rsq21
);
2008 rinv22
= sse2_invsqrt_f(rsq22
);
2010 fjx0
= _mm_setzero_ps();
2011 fjy0
= _mm_setzero_ps();
2012 fjz0
= _mm_setzero_ps();
2013 fjx1
= _mm_setzero_ps();
2014 fjy1
= _mm_setzero_ps();
2015 fjz1
= _mm_setzero_ps();
2016 fjx2
= _mm_setzero_ps();
2017 fjy2
= _mm_setzero_ps();
2018 fjz2
= _mm_setzero_ps();
2020 /**************************
2021 * CALCULATE INTERACTIONS *
2022 **************************/
2024 r00
= _mm_mul_ps(rsq00
,rinv00
);
2025 r00
= _mm_andnot_ps(dummy_mask
,r00
);
2027 /* Calculate table index by multiplying r with table scale and truncate to integer */
2028 rt
= _mm_mul_ps(r00
,vftabscale
);
2029 vfitab
= _mm_cvttps_epi32(rt
);
2030 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2031 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2033 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2034 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2035 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2036 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2037 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2038 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2039 Heps
= _mm_mul_ps(vfeps
,H
);
2040 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2041 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2042 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq00
,FF
),_mm_mul_ps(vftabscale
,rinv00
)));
2044 /* CUBIC SPLINE TABLE DISPERSION */
2045 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2046 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2047 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2048 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2049 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2050 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2051 Heps
= _mm_mul_ps(vfeps
,H
);
2052 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2053 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2054 fvdw6
= _mm_mul_ps(c6_00
,FF
);
2056 /* CUBIC SPLINE TABLE REPULSION */
2057 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2058 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2059 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2060 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2061 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2062 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2063 Heps
= _mm_mul_ps(vfeps
,H
);
2064 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2065 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2066 fvdw12
= _mm_mul_ps(c12_00
,FF
);
2067 fvdw
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_add_ps(fvdw6
,fvdw12
),_mm_mul_ps(vftabscale
,rinv00
)));
2069 fscal
= _mm_add_ps(felec
,fvdw
);
2071 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2073 /* Calculate temporary vectorial force */
2074 tx
= _mm_mul_ps(fscal
,dx00
);
2075 ty
= _mm_mul_ps(fscal
,dy00
);
2076 tz
= _mm_mul_ps(fscal
,dz00
);
2078 /* Update vectorial force */
2079 fix0
= _mm_add_ps(fix0
,tx
);
2080 fiy0
= _mm_add_ps(fiy0
,ty
);
2081 fiz0
= _mm_add_ps(fiz0
,tz
);
2083 fjx0
= _mm_add_ps(fjx0
,tx
);
2084 fjy0
= _mm_add_ps(fjy0
,ty
);
2085 fjz0
= _mm_add_ps(fjz0
,tz
);
2087 /**************************
2088 * CALCULATE INTERACTIONS *
2089 **************************/
2091 r01
= _mm_mul_ps(rsq01
,rinv01
);
2092 r01
= _mm_andnot_ps(dummy_mask
,r01
);
2094 /* Calculate table index by multiplying r with table scale and truncate to integer */
2095 rt
= _mm_mul_ps(r01
,vftabscale
);
2096 vfitab
= _mm_cvttps_epi32(rt
);
2097 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2098 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2101 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2102 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2103 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2104 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2105 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2106 Heps
= _mm_mul_ps(vfeps
,H
);
2107 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2108 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2109 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq01
,FF
),_mm_mul_ps(vftabscale
,rinv01
)));
2113 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2115 /* Calculate temporary vectorial force */
2116 tx
= _mm_mul_ps(fscal
,dx01
);
2117 ty
= _mm_mul_ps(fscal
,dy01
);
2118 tz
= _mm_mul_ps(fscal
,dz01
);
2120 /* Update vectorial force */
2121 fix0
= _mm_add_ps(fix0
,tx
);
2122 fiy0
= _mm_add_ps(fiy0
,ty
);
2123 fiz0
= _mm_add_ps(fiz0
,tz
);
2125 fjx1
= _mm_add_ps(fjx1
,tx
);
2126 fjy1
= _mm_add_ps(fjy1
,ty
);
2127 fjz1
= _mm_add_ps(fjz1
,tz
);
2129 /**************************
2130 * CALCULATE INTERACTIONS *
2131 **************************/
2133 r02
= _mm_mul_ps(rsq02
,rinv02
);
2134 r02
= _mm_andnot_ps(dummy_mask
,r02
);
2136 /* Calculate table index by multiplying r with table scale and truncate to integer */
2137 rt
= _mm_mul_ps(r02
,vftabscale
);
2138 vfitab
= _mm_cvttps_epi32(rt
);
2139 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2140 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2142 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2143 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2144 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2145 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2146 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2147 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2148 Heps
= _mm_mul_ps(vfeps
,H
);
2149 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2150 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2151 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq02
,FF
),_mm_mul_ps(vftabscale
,rinv02
)));
2155 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2157 /* Calculate temporary vectorial force */
2158 tx
= _mm_mul_ps(fscal
,dx02
);
2159 ty
= _mm_mul_ps(fscal
,dy02
);
2160 tz
= _mm_mul_ps(fscal
,dz02
);
2162 /* Update vectorial force */
2163 fix0
= _mm_add_ps(fix0
,tx
);
2164 fiy0
= _mm_add_ps(fiy0
,ty
);
2165 fiz0
= _mm_add_ps(fiz0
,tz
);
2167 fjx2
= _mm_add_ps(fjx2
,tx
);
2168 fjy2
= _mm_add_ps(fjy2
,ty
);
2169 fjz2
= _mm_add_ps(fjz2
,tz
);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 r10
= _mm_mul_ps(rsq10
,rinv10
);
2176 r10
= _mm_andnot_ps(dummy_mask
,r10
);
2178 /* Calculate table index by multiplying r with table scale and truncate to integer */
2179 rt
= _mm_mul_ps(r10
,vftabscale
);
2180 vfitab
= _mm_cvttps_epi32(rt
);
2181 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2182 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2184 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2185 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2186 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2187 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2188 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2189 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2190 Heps
= _mm_mul_ps(vfeps
,H
);
2191 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2192 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2193 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq10
,FF
),_mm_mul_ps(vftabscale
,rinv10
)));
2197 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2199 /* Calculate temporary vectorial force */
2200 tx
= _mm_mul_ps(fscal
,dx10
);
2201 ty
= _mm_mul_ps(fscal
,dy10
);
2202 tz
= _mm_mul_ps(fscal
,dz10
);
2204 /* Update vectorial force */
2205 fix1
= _mm_add_ps(fix1
,tx
);
2206 fiy1
= _mm_add_ps(fiy1
,ty
);
2207 fiz1
= _mm_add_ps(fiz1
,tz
);
2209 fjx0
= _mm_add_ps(fjx0
,tx
);
2210 fjy0
= _mm_add_ps(fjy0
,ty
);
2211 fjz0
= _mm_add_ps(fjz0
,tz
);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 r11
= _mm_mul_ps(rsq11
,rinv11
);
2218 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2220 /* Calculate table index by multiplying r with table scale and truncate to integer */
2221 rt
= _mm_mul_ps(r11
,vftabscale
);
2222 vfitab
= _mm_cvttps_epi32(rt
);
2223 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2224 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2226 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2227 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2228 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2229 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2230 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2231 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2232 Heps
= _mm_mul_ps(vfeps
,H
);
2233 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2234 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2235 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
2239 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2241 /* Calculate temporary vectorial force */
2242 tx
= _mm_mul_ps(fscal
,dx11
);
2243 ty
= _mm_mul_ps(fscal
,dy11
);
2244 tz
= _mm_mul_ps(fscal
,dz11
);
2246 /* Update vectorial force */
2247 fix1
= _mm_add_ps(fix1
,tx
);
2248 fiy1
= _mm_add_ps(fiy1
,ty
);
2249 fiz1
= _mm_add_ps(fiz1
,tz
);
2251 fjx1
= _mm_add_ps(fjx1
,tx
);
2252 fjy1
= _mm_add_ps(fjy1
,ty
);
2253 fjz1
= _mm_add_ps(fjz1
,tz
);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 r12
= _mm_mul_ps(rsq12
,rinv12
);
2260 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2262 /* Calculate table index by multiplying r with table scale and truncate to integer */
2263 rt
= _mm_mul_ps(r12
,vftabscale
);
2264 vfitab
= _mm_cvttps_epi32(rt
);
2265 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2266 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2268 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2269 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2270 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2271 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2272 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2273 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2274 Heps
= _mm_mul_ps(vfeps
,H
);
2275 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2276 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2277 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
2281 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2283 /* Calculate temporary vectorial force */
2284 tx
= _mm_mul_ps(fscal
,dx12
);
2285 ty
= _mm_mul_ps(fscal
,dy12
);
2286 tz
= _mm_mul_ps(fscal
,dz12
);
2288 /* Update vectorial force */
2289 fix1
= _mm_add_ps(fix1
,tx
);
2290 fiy1
= _mm_add_ps(fiy1
,ty
);
2291 fiz1
= _mm_add_ps(fiz1
,tz
);
2293 fjx2
= _mm_add_ps(fjx2
,tx
);
2294 fjy2
= _mm_add_ps(fjy2
,ty
);
2295 fjz2
= _mm_add_ps(fjz2
,tz
);
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 r20
= _mm_mul_ps(rsq20
,rinv20
);
2302 r20
= _mm_andnot_ps(dummy_mask
,r20
);
2304 /* Calculate table index by multiplying r with table scale and truncate to integer */
2305 rt
= _mm_mul_ps(r20
,vftabscale
);
2306 vfitab
= _mm_cvttps_epi32(rt
);
2307 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2308 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2310 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2311 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2312 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2313 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2314 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2315 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2316 Heps
= _mm_mul_ps(vfeps
,H
);
2317 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2318 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2319 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq20
,FF
),_mm_mul_ps(vftabscale
,rinv20
)));
2323 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2325 /* Calculate temporary vectorial force */
2326 tx
= _mm_mul_ps(fscal
,dx20
);
2327 ty
= _mm_mul_ps(fscal
,dy20
);
2328 tz
= _mm_mul_ps(fscal
,dz20
);
2330 /* Update vectorial force */
2331 fix2
= _mm_add_ps(fix2
,tx
);
2332 fiy2
= _mm_add_ps(fiy2
,ty
);
2333 fiz2
= _mm_add_ps(fiz2
,tz
);
2335 fjx0
= _mm_add_ps(fjx0
,tx
);
2336 fjy0
= _mm_add_ps(fjy0
,ty
);
2337 fjz0
= _mm_add_ps(fjz0
,tz
);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 r21
= _mm_mul_ps(rsq21
,rinv21
);
2344 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2346 /* Calculate table index by multiplying r with table scale and truncate to integer */
2347 rt
= _mm_mul_ps(r21
,vftabscale
);
2348 vfitab
= _mm_cvttps_epi32(rt
);
2349 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2350 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2352 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2353 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2354 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2355 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2356 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2357 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2358 Heps
= _mm_mul_ps(vfeps
,H
);
2359 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2360 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2361 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
2365 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2367 /* Calculate temporary vectorial force */
2368 tx
= _mm_mul_ps(fscal
,dx21
);
2369 ty
= _mm_mul_ps(fscal
,dy21
);
2370 tz
= _mm_mul_ps(fscal
,dz21
);
2372 /* Update vectorial force */
2373 fix2
= _mm_add_ps(fix2
,tx
);
2374 fiy2
= _mm_add_ps(fiy2
,ty
);
2375 fiz2
= _mm_add_ps(fiz2
,tz
);
2377 fjx1
= _mm_add_ps(fjx1
,tx
);
2378 fjy1
= _mm_add_ps(fjy1
,ty
);
2379 fjz1
= _mm_add_ps(fjz1
,tz
);
2381 /**************************
2382 * CALCULATE INTERACTIONS *
2383 **************************/
2385 r22
= _mm_mul_ps(rsq22
,rinv22
);
2386 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2388 /* Calculate table index by multiplying r with table scale and truncate to integer */
2389 rt
= _mm_mul_ps(r22
,vftabscale
);
2390 vfitab
= _mm_cvttps_epi32(rt
);
2391 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2392 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2394 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2395 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2396 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2397 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2398 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2399 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2400 Heps
= _mm_mul_ps(vfeps
,H
);
2401 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2402 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2403 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
2407 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2409 /* Calculate temporary vectorial force */
2410 tx
= _mm_mul_ps(fscal
,dx22
);
2411 ty
= _mm_mul_ps(fscal
,dy22
);
2412 tz
= _mm_mul_ps(fscal
,dz22
);
2414 /* Update vectorial force */
2415 fix2
= _mm_add_ps(fix2
,tx
);
2416 fiy2
= _mm_add_ps(fiy2
,ty
);
2417 fiz2
= _mm_add_ps(fiz2
,tz
);
2419 fjx2
= _mm_add_ps(fjx2
,tx
);
2420 fjy2
= _mm_add_ps(fjy2
,ty
);
2421 fjz2
= _mm_add_ps(fjz2
,tz
);
2423 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2424 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2425 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2426 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2428 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2429 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2431 /* Inner loop uses 382 flops */
2434 /* End of innermost loop */
2436 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2437 f
+i_coord_offset
,fshift
+i_shift_offset
);
2439 /* Increment number of inner iterations */
2440 inneriter
+= j_index_end
- j_index_start
;
2442 /* Outer loop uses 18 flops */
2445 /* Increment number of outer iterations */
2448 /* Update outer/inner flops */
2450 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*382);