/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS sse2_single kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
 * Electrostatics interaction: CubicSplineTable
 * VdW interaction:            LennardJones
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        PotentialAndForce
 */
57 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse2_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 * just 0 for non-waters.
 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 * jnr indices corresponding to data put in the four positions in the SIMD register.
 */
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
89 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
97 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
98 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
99 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
100 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
101 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
102 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
103 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
104 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
105 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
106 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
107 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
108 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
111 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
114 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
117 __m128i ifour
= _mm_set1_epi32(4);
118 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
120 __m128 dummy_mask
,cutoff_mask
;
121 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one
= _mm_set1_ps(1.0);
123 __m128 two
= _mm_set1_ps(2.0);
129 jindex
= nlist
->jindex
;
131 shiftidx
= nlist
->shift
;
133 shiftvec
= fr
->shift_vec
[0];
134 fshift
= fr
->fshift
[0];
135 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
136 charge
= mdatoms
->chargeA
;
137 nvdwtype
= fr
->ntype
;
139 vdwtype
= mdatoms
->typeA
;
141 vftab
= kernel_data
->table_elec
->data
;
142 vftabscale
= _mm_set1_ps(kernel_data
->table_elec
->scale
);
144 /* Setup water-specific parameters */
145 inr
= nlist
->iinr
[0];
146 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
147 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
148 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
149 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
151 jq1
= _mm_set1_ps(charge
[inr
+1]);
152 jq2
= _mm_set1_ps(charge
[inr
+2]);
153 jq3
= _mm_set1_ps(charge
[inr
+3]);
154 vdwjidx0A
= 2*vdwtype
[inr
+0];
155 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
156 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
157 qq11
= _mm_mul_ps(iq1
,jq1
);
158 qq12
= _mm_mul_ps(iq1
,jq2
);
159 qq13
= _mm_mul_ps(iq1
,jq3
);
160 qq21
= _mm_mul_ps(iq2
,jq1
);
161 qq22
= _mm_mul_ps(iq2
,jq2
);
162 qq23
= _mm_mul_ps(iq2
,jq3
);
163 qq31
= _mm_mul_ps(iq3
,jq1
);
164 qq32
= _mm_mul_ps(iq3
,jq2
);
165 qq33
= _mm_mul_ps(iq3
,jq3
);
167 /* Avoid stupid compiler warnings */
168 jnrA
= jnrB
= jnrC
= jnrD
= 0;
177 for(iidx
=0;iidx
<4*DIM
;iidx
++)
182 /* Start outer loop over neighborlists */
183 for(iidx
=0; iidx
<nri
; iidx
++)
185 /* Load shift vector for this list */
186 i_shift_offset
= DIM
*shiftidx
[iidx
];
188 /* Load limits for loop over neighbors */
189 j_index_start
= jindex
[iidx
];
190 j_index_end
= jindex
[iidx
+1];
192 /* Get outer coordinate index */
194 i_coord_offset
= DIM
*inr
;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
198 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
200 fix0
= _mm_setzero_ps();
201 fiy0
= _mm_setzero_ps();
202 fiz0
= _mm_setzero_ps();
203 fix1
= _mm_setzero_ps();
204 fiy1
= _mm_setzero_ps();
205 fiz1
= _mm_setzero_ps();
206 fix2
= _mm_setzero_ps();
207 fiy2
= _mm_setzero_ps();
208 fiz2
= _mm_setzero_ps();
209 fix3
= _mm_setzero_ps();
210 fiy3
= _mm_setzero_ps();
211 fiz3
= _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum
= _mm_setzero_ps();
215 vvdwsum
= _mm_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
221 /* Get j neighbor index, and coordinate index */
226 j_coord_offsetA
= DIM
*jnrA
;
227 j_coord_offsetB
= DIM
*jnrB
;
228 j_coord_offsetC
= DIM
*jnrC
;
229 j_coord_offsetD
= DIM
*jnrD
;
231 /* load j atom coordinates */
232 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
233 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
234 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
235 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
237 /* Calculate displacement vector */
238 dx00
= _mm_sub_ps(ix0
,jx0
);
239 dy00
= _mm_sub_ps(iy0
,jy0
);
240 dz00
= _mm_sub_ps(iz0
,jz0
);
241 dx11
= _mm_sub_ps(ix1
,jx1
);
242 dy11
= _mm_sub_ps(iy1
,jy1
);
243 dz11
= _mm_sub_ps(iz1
,jz1
);
244 dx12
= _mm_sub_ps(ix1
,jx2
);
245 dy12
= _mm_sub_ps(iy1
,jy2
);
246 dz12
= _mm_sub_ps(iz1
,jz2
);
247 dx13
= _mm_sub_ps(ix1
,jx3
);
248 dy13
= _mm_sub_ps(iy1
,jy3
);
249 dz13
= _mm_sub_ps(iz1
,jz3
);
250 dx21
= _mm_sub_ps(ix2
,jx1
);
251 dy21
= _mm_sub_ps(iy2
,jy1
);
252 dz21
= _mm_sub_ps(iz2
,jz1
);
253 dx22
= _mm_sub_ps(ix2
,jx2
);
254 dy22
= _mm_sub_ps(iy2
,jy2
);
255 dz22
= _mm_sub_ps(iz2
,jz2
);
256 dx23
= _mm_sub_ps(ix2
,jx3
);
257 dy23
= _mm_sub_ps(iy2
,jy3
);
258 dz23
= _mm_sub_ps(iz2
,jz3
);
259 dx31
= _mm_sub_ps(ix3
,jx1
);
260 dy31
= _mm_sub_ps(iy3
,jy1
);
261 dz31
= _mm_sub_ps(iz3
,jz1
);
262 dx32
= _mm_sub_ps(ix3
,jx2
);
263 dy32
= _mm_sub_ps(iy3
,jy2
);
264 dz32
= _mm_sub_ps(iz3
,jz2
);
265 dx33
= _mm_sub_ps(ix3
,jx3
);
266 dy33
= _mm_sub_ps(iy3
,jy3
);
267 dz33
= _mm_sub_ps(iz3
,jz3
);
269 /* Calculate squared distance and things based on it */
270 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
271 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
272 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
273 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
274 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
275 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
276 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
277 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
278 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
279 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
281 rinv11
= sse2_invsqrt_f(rsq11
);
282 rinv12
= sse2_invsqrt_f(rsq12
);
283 rinv13
= sse2_invsqrt_f(rsq13
);
284 rinv21
= sse2_invsqrt_f(rsq21
);
285 rinv22
= sse2_invsqrt_f(rsq22
);
286 rinv23
= sse2_invsqrt_f(rsq23
);
287 rinv31
= sse2_invsqrt_f(rsq31
);
288 rinv32
= sse2_invsqrt_f(rsq32
);
289 rinv33
= sse2_invsqrt_f(rsq33
);
291 rinvsq00
= sse2_inv_f(rsq00
);
293 fjx0
= _mm_setzero_ps();
294 fjy0
= _mm_setzero_ps();
295 fjz0
= _mm_setzero_ps();
296 fjx1
= _mm_setzero_ps();
297 fjy1
= _mm_setzero_ps();
298 fjz1
= _mm_setzero_ps();
299 fjx2
= _mm_setzero_ps();
300 fjy2
= _mm_setzero_ps();
301 fjz2
= _mm_setzero_ps();
302 fjx3
= _mm_setzero_ps();
303 fjy3
= _mm_setzero_ps();
304 fjz3
= _mm_setzero_ps();
306 /**************************
307 * CALCULATE INTERACTIONS *
308 **************************/
310 /* LENNARD-JONES DISPERSION/REPULSION */
312 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
313 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
314 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
315 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
316 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
323 /* Calculate temporary vectorial force */
324 tx
= _mm_mul_ps(fscal
,dx00
);
325 ty
= _mm_mul_ps(fscal
,dy00
);
326 tz
= _mm_mul_ps(fscal
,dz00
);
328 /* Update vectorial force */
329 fix0
= _mm_add_ps(fix0
,tx
);
330 fiy0
= _mm_add_ps(fiy0
,ty
);
331 fiz0
= _mm_add_ps(fiz0
,tz
);
333 fjx0
= _mm_add_ps(fjx0
,tx
);
334 fjy0
= _mm_add_ps(fjy0
,ty
);
335 fjz0
= _mm_add_ps(fjz0
,tz
);
337 /**************************
338 * CALCULATE INTERACTIONS *
339 **************************/
341 r11
= _mm_mul_ps(rsq11
,rinv11
);
343 /* Calculate table index by multiplying r with table scale and truncate to integer */
344 rt
= _mm_mul_ps(r11
,vftabscale
);
345 vfitab
= _mm_cvttps_epi32(rt
);
346 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
347 vfitab
= _mm_slli_epi32(vfitab
,2);
349 /* CUBIC SPLINE TABLE ELECTROSTATICS */
350 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
351 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
352 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
353 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
354 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
355 Heps
= _mm_mul_ps(vfeps
,H
);
356 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
357 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
358 velec
= _mm_mul_ps(qq11
,VV
);
359 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
360 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
362 /* Update potential sum for this i atom from the interaction with this j atom. */
363 velecsum
= _mm_add_ps(velecsum
,velec
);
367 /* Calculate temporary vectorial force */
368 tx
= _mm_mul_ps(fscal
,dx11
);
369 ty
= _mm_mul_ps(fscal
,dy11
);
370 tz
= _mm_mul_ps(fscal
,dz11
);
372 /* Update vectorial force */
373 fix1
= _mm_add_ps(fix1
,tx
);
374 fiy1
= _mm_add_ps(fiy1
,ty
);
375 fiz1
= _mm_add_ps(fiz1
,tz
);
377 fjx1
= _mm_add_ps(fjx1
,tx
);
378 fjy1
= _mm_add_ps(fjy1
,ty
);
379 fjz1
= _mm_add_ps(fjz1
,tz
);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 r12
= _mm_mul_ps(rsq12
,rinv12
);
387 /* Calculate table index by multiplying r with table scale and truncate to integer */
388 rt
= _mm_mul_ps(r12
,vftabscale
);
389 vfitab
= _mm_cvttps_epi32(rt
);
390 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
391 vfitab
= _mm_slli_epi32(vfitab
,2);
393 /* CUBIC SPLINE TABLE ELECTROSTATICS */
394 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
395 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
396 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
397 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
398 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
399 Heps
= _mm_mul_ps(vfeps
,H
);
400 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
401 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
402 velec
= _mm_mul_ps(qq12
,VV
);
403 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
404 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velecsum
= _mm_add_ps(velecsum
,velec
);
411 /* Calculate temporary vectorial force */
412 tx
= _mm_mul_ps(fscal
,dx12
);
413 ty
= _mm_mul_ps(fscal
,dy12
);
414 tz
= _mm_mul_ps(fscal
,dz12
);
416 /* Update vectorial force */
417 fix1
= _mm_add_ps(fix1
,tx
);
418 fiy1
= _mm_add_ps(fiy1
,ty
);
419 fiz1
= _mm_add_ps(fiz1
,tz
);
421 fjx2
= _mm_add_ps(fjx2
,tx
);
422 fjy2
= _mm_add_ps(fjy2
,ty
);
423 fjz2
= _mm_add_ps(fjz2
,tz
);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 r13
= _mm_mul_ps(rsq13
,rinv13
);
431 /* Calculate table index by multiplying r with table scale and truncate to integer */
432 rt
= _mm_mul_ps(r13
,vftabscale
);
433 vfitab
= _mm_cvttps_epi32(rt
);
434 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
435 vfitab
= _mm_slli_epi32(vfitab
,2);
437 /* CUBIC SPLINE TABLE ELECTROSTATICS */
438 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
439 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
440 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
441 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
442 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
443 Heps
= _mm_mul_ps(vfeps
,H
);
444 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
445 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
446 velec
= _mm_mul_ps(qq13
,VV
);
447 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
448 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velecsum
= _mm_add_ps(velecsum
,velec
);
455 /* Calculate temporary vectorial force */
456 tx
= _mm_mul_ps(fscal
,dx13
);
457 ty
= _mm_mul_ps(fscal
,dy13
);
458 tz
= _mm_mul_ps(fscal
,dz13
);
460 /* Update vectorial force */
461 fix1
= _mm_add_ps(fix1
,tx
);
462 fiy1
= _mm_add_ps(fiy1
,ty
);
463 fiz1
= _mm_add_ps(fiz1
,tz
);
465 fjx3
= _mm_add_ps(fjx3
,tx
);
466 fjy3
= _mm_add_ps(fjy3
,ty
);
467 fjz3
= _mm_add_ps(fjz3
,tz
);
469 /**************************
470 * CALCULATE INTERACTIONS *
471 **************************/
473 r21
= _mm_mul_ps(rsq21
,rinv21
);
475 /* Calculate table index by multiplying r with table scale and truncate to integer */
476 rt
= _mm_mul_ps(r21
,vftabscale
);
477 vfitab
= _mm_cvttps_epi32(rt
);
478 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
479 vfitab
= _mm_slli_epi32(vfitab
,2);
481 /* CUBIC SPLINE TABLE ELECTROSTATICS */
482 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
483 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
484 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
485 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
486 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
487 Heps
= _mm_mul_ps(vfeps
,H
);
488 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
489 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
490 velec
= _mm_mul_ps(qq21
,VV
);
491 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
492 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
494 /* Update potential sum for this i atom from the interaction with this j atom. */
495 velecsum
= _mm_add_ps(velecsum
,velec
);
499 /* Calculate temporary vectorial force */
500 tx
= _mm_mul_ps(fscal
,dx21
);
501 ty
= _mm_mul_ps(fscal
,dy21
);
502 tz
= _mm_mul_ps(fscal
,dz21
);
504 /* Update vectorial force */
505 fix2
= _mm_add_ps(fix2
,tx
);
506 fiy2
= _mm_add_ps(fiy2
,ty
);
507 fiz2
= _mm_add_ps(fiz2
,tz
);
509 fjx1
= _mm_add_ps(fjx1
,tx
);
510 fjy1
= _mm_add_ps(fjy1
,ty
);
511 fjz1
= _mm_add_ps(fjz1
,tz
);
513 /**************************
514 * CALCULATE INTERACTIONS *
515 **************************/
517 r22
= _mm_mul_ps(rsq22
,rinv22
);
519 /* Calculate table index by multiplying r with table scale and truncate to integer */
520 rt
= _mm_mul_ps(r22
,vftabscale
);
521 vfitab
= _mm_cvttps_epi32(rt
);
522 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
523 vfitab
= _mm_slli_epi32(vfitab
,2);
525 /* CUBIC SPLINE TABLE ELECTROSTATICS */
526 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
527 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
528 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
529 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
530 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
531 Heps
= _mm_mul_ps(vfeps
,H
);
532 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
533 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
534 velec
= _mm_mul_ps(qq22
,VV
);
535 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
536 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum
= _mm_add_ps(velecsum
,velec
);
543 /* Calculate temporary vectorial force */
544 tx
= _mm_mul_ps(fscal
,dx22
);
545 ty
= _mm_mul_ps(fscal
,dy22
);
546 tz
= _mm_mul_ps(fscal
,dz22
);
548 /* Update vectorial force */
549 fix2
= _mm_add_ps(fix2
,tx
);
550 fiy2
= _mm_add_ps(fiy2
,ty
);
551 fiz2
= _mm_add_ps(fiz2
,tz
);
553 fjx2
= _mm_add_ps(fjx2
,tx
);
554 fjy2
= _mm_add_ps(fjy2
,ty
);
555 fjz2
= _mm_add_ps(fjz2
,tz
);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 r23
= _mm_mul_ps(rsq23
,rinv23
);
563 /* Calculate table index by multiplying r with table scale and truncate to integer */
564 rt
= _mm_mul_ps(r23
,vftabscale
);
565 vfitab
= _mm_cvttps_epi32(rt
);
566 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
567 vfitab
= _mm_slli_epi32(vfitab
,2);
569 /* CUBIC SPLINE TABLE ELECTROSTATICS */
570 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
571 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
572 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
573 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
574 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
575 Heps
= _mm_mul_ps(vfeps
,H
);
576 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
577 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
578 velec
= _mm_mul_ps(qq23
,VV
);
579 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
580 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum
= _mm_add_ps(velecsum
,velec
);
587 /* Calculate temporary vectorial force */
588 tx
= _mm_mul_ps(fscal
,dx23
);
589 ty
= _mm_mul_ps(fscal
,dy23
);
590 tz
= _mm_mul_ps(fscal
,dz23
);
592 /* Update vectorial force */
593 fix2
= _mm_add_ps(fix2
,tx
);
594 fiy2
= _mm_add_ps(fiy2
,ty
);
595 fiz2
= _mm_add_ps(fiz2
,tz
);
597 fjx3
= _mm_add_ps(fjx3
,tx
);
598 fjy3
= _mm_add_ps(fjy3
,ty
);
599 fjz3
= _mm_add_ps(fjz3
,tz
);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r31
= _mm_mul_ps(rsq31
,rinv31
);
607 /* Calculate table index by multiplying r with table scale and truncate to integer */
608 rt
= _mm_mul_ps(r31
,vftabscale
);
609 vfitab
= _mm_cvttps_epi32(rt
);
610 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
611 vfitab
= _mm_slli_epi32(vfitab
,2);
613 /* CUBIC SPLINE TABLE ELECTROSTATICS */
614 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
615 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
616 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
617 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
618 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
619 Heps
= _mm_mul_ps(vfeps
,H
);
620 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
621 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
622 velec
= _mm_mul_ps(qq31
,VV
);
623 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
624 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velecsum
= _mm_add_ps(velecsum
,velec
);
631 /* Calculate temporary vectorial force */
632 tx
= _mm_mul_ps(fscal
,dx31
);
633 ty
= _mm_mul_ps(fscal
,dy31
);
634 tz
= _mm_mul_ps(fscal
,dz31
);
636 /* Update vectorial force */
637 fix3
= _mm_add_ps(fix3
,tx
);
638 fiy3
= _mm_add_ps(fiy3
,ty
);
639 fiz3
= _mm_add_ps(fiz3
,tz
);
641 fjx1
= _mm_add_ps(fjx1
,tx
);
642 fjy1
= _mm_add_ps(fjy1
,ty
);
643 fjz1
= _mm_add_ps(fjz1
,tz
);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 r32
= _mm_mul_ps(rsq32
,rinv32
);
651 /* Calculate table index by multiplying r with table scale and truncate to integer */
652 rt
= _mm_mul_ps(r32
,vftabscale
);
653 vfitab
= _mm_cvttps_epi32(rt
);
654 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
655 vfitab
= _mm_slli_epi32(vfitab
,2);
657 /* CUBIC SPLINE TABLE ELECTROSTATICS */
658 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
659 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
660 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
661 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
662 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
663 Heps
= _mm_mul_ps(vfeps
,H
);
664 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
665 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
666 velec
= _mm_mul_ps(qq32
,VV
);
667 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
668 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
670 /* Update potential sum for this i atom from the interaction with this j atom. */
671 velecsum
= _mm_add_ps(velecsum
,velec
);
675 /* Calculate temporary vectorial force */
676 tx
= _mm_mul_ps(fscal
,dx32
);
677 ty
= _mm_mul_ps(fscal
,dy32
);
678 tz
= _mm_mul_ps(fscal
,dz32
);
680 /* Update vectorial force */
681 fix3
= _mm_add_ps(fix3
,tx
);
682 fiy3
= _mm_add_ps(fiy3
,ty
);
683 fiz3
= _mm_add_ps(fiz3
,tz
);
685 fjx2
= _mm_add_ps(fjx2
,tx
);
686 fjy2
= _mm_add_ps(fjy2
,ty
);
687 fjz2
= _mm_add_ps(fjz2
,tz
);
689 /**************************
690 * CALCULATE INTERACTIONS *
691 **************************/
693 r33
= _mm_mul_ps(rsq33
,rinv33
);
695 /* Calculate table index by multiplying r with table scale and truncate to integer */
696 rt
= _mm_mul_ps(r33
,vftabscale
);
697 vfitab
= _mm_cvttps_epi32(rt
);
698 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
699 vfitab
= _mm_slli_epi32(vfitab
,2);
701 /* CUBIC SPLINE TABLE ELECTROSTATICS */
702 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
703 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
704 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
705 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
706 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
707 Heps
= _mm_mul_ps(vfeps
,H
);
708 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
709 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
710 velec
= _mm_mul_ps(qq33
,VV
);
711 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
712 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
714 /* Update potential sum for this i atom from the interaction with this j atom. */
715 velecsum
= _mm_add_ps(velecsum
,velec
);
719 /* Calculate temporary vectorial force */
720 tx
= _mm_mul_ps(fscal
,dx33
);
721 ty
= _mm_mul_ps(fscal
,dy33
);
722 tz
= _mm_mul_ps(fscal
,dz33
);
724 /* Update vectorial force */
725 fix3
= _mm_add_ps(fix3
,tx
);
726 fiy3
= _mm_add_ps(fiy3
,ty
);
727 fiz3
= _mm_add_ps(fiz3
,tz
);
729 fjx3
= _mm_add_ps(fjx3
,tx
);
730 fjy3
= _mm_add_ps(fjy3
,ty
);
731 fjz3
= _mm_add_ps(fjz3
,tz
);
733 fjptrA
= f
+j_coord_offsetA
;
734 fjptrB
= f
+j_coord_offsetB
;
735 fjptrC
= f
+j_coord_offsetC
;
736 fjptrD
= f
+j_coord_offsetD
;
738 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
739 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
740 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
742 /* Inner loop uses 422 flops */
748 /* Get j neighbor index, and coordinate index */
749 jnrlistA
= jjnr
[jidx
];
750 jnrlistB
= jjnr
[jidx
+1];
751 jnrlistC
= jjnr
[jidx
+2];
752 jnrlistD
= jjnr
[jidx
+3];
/* Sign of each element will be negative for non-real atoms.
 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 */
757 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
758 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
759 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
760 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
761 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
762 j_coord_offsetA
= DIM
*jnrA
;
763 j_coord_offsetB
= DIM
*jnrB
;
764 j_coord_offsetC
= DIM
*jnrC
;
765 j_coord_offsetD
= DIM
*jnrD
;
767 /* load j atom coordinates */
768 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
769 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
770 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
771 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
773 /* Calculate displacement vector */
774 dx00
= _mm_sub_ps(ix0
,jx0
);
775 dy00
= _mm_sub_ps(iy0
,jy0
);
776 dz00
= _mm_sub_ps(iz0
,jz0
);
777 dx11
= _mm_sub_ps(ix1
,jx1
);
778 dy11
= _mm_sub_ps(iy1
,jy1
);
779 dz11
= _mm_sub_ps(iz1
,jz1
);
780 dx12
= _mm_sub_ps(ix1
,jx2
);
781 dy12
= _mm_sub_ps(iy1
,jy2
);
782 dz12
= _mm_sub_ps(iz1
,jz2
);
783 dx13
= _mm_sub_ps(ix1
,jx3
);
784 dy13
= _mm_sub_ps(iy1
,jy3
);
785 dz13
= _mm_sub_ps(iz1
,jz3
);
786 dx21
= _mm_sub_ps(ix2
,jx1
);
787 dy21
= _mm_sub_ps(iy2
,jy1
);
788 dz21
= _mm_sub_ps(iz2
,jz1
);
789 dx22
= _mm_sub_ps(ix2
,jx2
);
790 dy22
= _mm_sub_ps(iy2
,jy2
);
791 dz22
= _mm_sub_ps(iz2
,jz2
);
792 dx23
= _mm_sub_ps(ix2
,jx3
);
793 dy23
= _mm_sub_ps(iy2
,jy3
);
794 dz23
= _mm_sub_ps(iz2
,jz3
);
795 dx31
= _mm_sub_ps(ix3
,jx1
);
796 dy31
= _mm_sub_ps(iy3
,jy1
);
797 dz31
= _mm_sub_ps(iz3
,jz1
);
798 dx32
= _mm_sub_ps(ix3
,jx2
);
799 dy32
= _mm_sub_ps(iy3
,jy2
);
800 dz32
= _mm_sub_ps(iz3
,jz2
);
801 dx33
= _mm_sub_ps(ix3
,jx3
);
802 dy33
= _mm_sub_ps(iy3
,jy3
);
803 dz33
= _mm_sub_ps(iz3
,jz3
);
805 /* Calculate squared distance and things based on it */
806 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
807 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
808 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
809 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
810 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
811 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
812 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
813 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
814 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
815 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
817 rinv11
= sse2_invsqrt_f(rsq11
);
818 rinv12
= sse2_invsqrt_f(rsq12
);
819 rinv13
= sse2_invsqrt_f(rsq13
);
820 rinv21
= sse2_invsqrt_f(rsq21
);
821 rinv22
= sse2_invsqrt_f(rsq22
);
822 rinv23
= sse2_invsqrt_f(rsq23
);
823 rinv31
= sse2_invsqrt_f(rsq31
);
824 rinv32
= sse2_invsqrt_f(rsq32
);
825 rinv33
= sse2_invsqrt_f(rsq33
);
827 rinvsq00
= sse2_inv_f(rsq00
);
829 fjx0
= _mm_setzero_ps();
830 fjy0
= _mm_setzero_ps();
831 fjz0
= _mm_setzero_ps();
832 fjx1
= _mm_setzero_ps();
833 fjy1
= _mm_setzero_ps();
834 fjz1
= _mm_setzero_ps();
835 fjx2
= _mm_setzero_ps();
836 fjy2
= _mm_setzero_ps();
837 fjz2
= _mm_setzero_ps();
838 fjx3
= _mm_setzero_ps();
839 fjy3
= _mm_setzero_ps();
840 fjz3
= _mm_setzero_ps();
842 /**************************
843 * CALCULATE INTERACTIONS *
844 **************************/
846 /* LENNARD-JONES DISPERSION/REPULSION */
848 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
849 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
850 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
851 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
852 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
854 /* Update potential sum for this i atom from the interaction with this j atom. */
855 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
856 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
860 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
862 /* Calculate temporary vectorial force */
863 tx
= _mm_mul_ps(fscal
,dx00
);
864 ty
= _mm_mul_ps(fscal
,dy00
);
865 tz
= _mm_mul_ps(fscal
,dz00
);
867 /* Update vectorial force */
868 fix0
= _mm_add_ps(fix0
,tx
);
869 fiy0
= _mm_add_ps(fiy0
,ty
);
870 fiz0
= _mm_add_ps(fiz0
,tz
);
872 fjx0
= _mm_add_ps(fjx0
,tx
);
873 fjy0
= _mm_add_ps(fjy0
,ty
);
874 fjz0
= _mm_add_ps(fjz0
,tz
);
876 /**************************
877 * CALCULATE INTERACTIONS *
878 **************************/
880 r11
= _mm_mul_ps(rsq11
,rinv11
);
881 r11
= _mm_andnot_ps(dummy_mask
,r11
);
883 /* Calculate table index by multiplying r with table scale and truncate to integer */
884 rt
= _mm_mul_ps(r11
,vftabscale
);
885 vfitab
= _mm_cvttps_epi32(rt
);
886 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
887 vfitab
= _mm_slli_epi32(vfitab
,2);
889 /* CUBIC SPLINE TABLE ELECTROSTATICS */
890 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
891 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
892 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
893 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
894 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
895 Heps
= _mm_mul_ps(vfeps
,H
);
896 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
897 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
898 velec
= _mm_mul_ps(qq11
,VV
);
899 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
900 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
902 /* Update potential sum for this i atom from the interaction with this j atom. */
903 velec
= _mm_andnot_ps(dummy_mask
,velec
);
904 velecsum
= _mm_add_ps(velecsum
,velec
);
908 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
910 /* Calculate temporary vectorial force */
911 tx
= _mm_mul_ps(fscal
,dx11
);
912 ty
= _mm_mul_ps(fscal
,dy11
);
913 tz
= _mm_mul_ps(fscal
,dz11
);
915 /* Update vectorial force */
916 fix1
= _mm_add_ps(fix1
,tx
);
917 fiy1
= _mm_add_ps(fiy1
,ty
);
918 fiz1
= _mm_add_ps(fiz1
,tz
);
920 fjx1
= _mm_add_ps(fjx1
,tx
);
921 fjy1
= _mm_add_ps(fjy1
,ty
);
922 fjz1
= _mm_add_ps(fjz1
,tz
);
924 /**************************
925 * CALCULATE INTERACTIONS *
926 **************************/
928 r12
= _mm_mul_ps(rsq12
,rinv12
);
929 r12
= _mm_andnot_ps(dummy_mask
,r12
);
931 /* Calculate table index by multiplying r with table scale and truncate to integer */
932 rt
= _mm_mul_ps(r12
,vftabscale
);
933 vfitab
= _mm_cvttps_epi32(rt
);
934 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
935 vfitab
= _mm_slli_epi32(vfitab
,2);
937 /* CUBIC SPLINE TABLE ELECTROSTATICS */
938 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
939 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
940 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
941 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
942 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
943 Heps
= _mm_mul_ps(vfeps
,H
);
944 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
945 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
946 velec
= _mm_mul_ps(qq12
,VV
);
947 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
948 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
950 /* Update potential sum for this i atom from the interaction with this j atom. */
951 velec
= _mm_andnot_ps(dummy_mask
,velec
);
952 velecsum
= _mm_add_ps(velecsum
,velec
);
956 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
958 /* Calculate temporary vectorial force */
959 tx
= _mm_mul_ps(fscal
,dx12
);
960 ty
= _mm_mul_ps(fscal
,dy12
);
961 tz
= _mm_mul_ps(fscal
,dz12
);
963 /* Update vectorial force */
964 fix1
= _mm_add_ps(fix1
,tx
);
965 fiy1
= _mm_add_ps(fiy1
,ty
);
966 fiz1
= _mm_add_ps(fiz1
,tz
);
968 fjx2
= _mm_add_ps(fjx2
,tx
);
969 fjy2
= _mm_add_ps(fjy2
,ty
);
970 fjz2
= _mm_add_ps(fjz2
,tz
);
972 /**************************
973 * CALCULATE INTERACTIONS *
974 **************************/
976 r13
= _mm_mul_ps(rsq13
,rinv13
);
977 r13
= _mm_andnot_ps(dummy_mask
,r13
);
979 /* Calculate table index by multiplying r with table scale and truncate to integer */
980 rt
= _mm_mul_ps(r13
,vftabscale
);
981 vfitab
= _mm_cvttps_epi32(rt
);
982 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
983 vfitab
= _mm_slli_epi32(vfitab
,2);
985 /* CUBIC SPLINE TABLE ELECTROSTATICS */
986 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
987 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
988 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
989 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
990 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
991 Heps
= _mm_mul_ps(vfeps
,H
);
992 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
993 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
994 velec
= _mm_mul_ps(qq13
,VV
);
995 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
996 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
998 /* Update potential sum for this i atom from the interaction with this j atom. */
999 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1000 velecsum
= _mm_add_ps(velecsum
,velec
);
1004 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1006 /* Calculate temporary vectorial force */
1007 tx
= _mm_mul_ps(fscal
,dx13
);
1008 ty
= _mm_mul_ps(fscal
,dy13
);
1009 tz
= _mm_mul_ps(fscal
,dz13
);
1011 /* Update vectorial force */
1012 fix1
= _mm_add_ps(fix1
,tx
);
1013 fiy1
= _mm_add_ps(fiy1
,ty
);
1014 fiz1
= _mm_add_ps(fiz1
,tz
);
1016 fjx3
= _mm_add_ps(fjx3
,tx
);
1017 fjy3
= _mm_add_ps(fjy3
,ty
);
1018 fjz3
= _mm_add_ps(fjz3
,tz
);
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 r21
= _mm_mul_ps(rsq21
,rinv21
);
1025 r21
= _mm_andnot_ps(dummy_mask
,r21
);
1027 /* Calculate table index by multiplying r with table scale and truncate to integer */
1028 rt
= _mm_mul_ps(r21
,vftabscale
);
1029 vfitab
= _mm_cvttps_epi32(rt
);
1030 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1031 vfitab
= _mm_slli_epi32(vfitab
,2);
1033 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1034 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1035 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1036 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1037 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1038 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1039 Heps
= _mm_mul_ps(vfeps
,H
);
1040 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1041 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1042 velec
= _mm_mul_ps(qq21
,VV
);
1043 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1044 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1046 /* Update potential sum for this i atom from the interaction with this j atom. */
1047 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1048 velecsum
= _mm_add_ps(velecsum
,velec
);
1052 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1054 /* Calculate temporary vectorial force */
1055 tx
= _mm_mul_ps(fscal
,dx21
);
1056 ty
= _mm_mul_ps(fscal
,dy21
);
1057 tz
= _mm_mul_ps(fscal
,dz21
);
1059 /* Update vectorial force */
1060 fix2
= _mm_add_ps(fix2
,tx
);
1061 fiy2
= _mm_add_ps(fiy2
,ty
);
1062 fiz2
= _mm_add_ps(fiz2
,tz
);
1064 fjx1
= _mm_add_ps(fjx1
,tx
);
1065 fjy1
= _mm_add_ps(fjy1
,ty
);
1066 fjz1
= _mm_add_ps(fjz1
,tz
);
1068 /**************************
1069 * CALCULATE INTERACTIONS *
1070 **************************/
1072 r22
= _mm_mul_ps(rsq22
,rinv22
);
1073 r22
= _mm_andnot_ps(dummy_mask
,r22
);
1075 /* Calculate table index by multiplying r with table scale and truncate to integer */
1076 rt
= _mm_mul_ps(r22
,vftabscale
);
1077 vfitab
= _mm_cvttps_epi32(rt
);
1078 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1079 vfitab
= _mm_slli_epi32(vfitab
,2);
1081 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1082 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1083 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1084 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1085 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1086 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1087 Heps
= _mm_mul_ps(vfeps
,H
);
1088 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1089 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1090 velec
= _mm_mul_ps(qq22
,VV
);
1091 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1092 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1094 /* Update potential sum for this i atom from the interaction with this j atom. */
1095 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1096 velecsum
= _mm_add_ps(velecsum
,velec
);
1100 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1102 /* Calculate temporary vectorial force */
1103 tx
= _mm_mul_ps(fscal
,dx22
);
1104 ty
= _mm_mul_ps(fscal
,dy22
);
1105 tz
= _mm_mul_ps(fscal
,dz22
);
1107 /* Update vectorial force */
1108 fix2
= _mm_add_ps(fix2
,tx
);
1109 fiy2
= _mm_add_ps(fiy2
,ty
);
1110 fiz2
= _mm_add_ps(fiz2
,tz
);
1112 fjx2
= _mm_add_ps(fjx2
,tx
);
1113 fjy2
= _mm_add_ps(fjy2
,ty
);
1114 fjz2
= _mm_add_ps(fjz2
,tz
);
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1120 r23
= _mm_mul_ps(rsq23
,rinv23
);
1121 r23
= _mm_andnot_ps(dummy_mask
,r23
);
1123 /* Calculate table index by multiplying r with table scale and truncate to integer */
1124 rt
= _mm_mul_ps(r23
,vftabscale
);
1125 vfitab
= _mm_cvttps_epi32(rt
);
1126 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1127 vfitab
= _mm_slli_epi32(vfitab
,2);
1129 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1130 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1131 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1132 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1133 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1134 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1135 Heps
= _mm_mul_ps(vfeps
,H
);
1136 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1137 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1138 velec
= _mm_mul_ps(qq23
,VV
);
1139 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1140 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
1142 /* Update potential sum for this i atom from the interaction with this j atom. */
1143 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1144 velecsum
= _mm_add_ps(velecsum
,velec
);
1148 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1150 /* Calculate temporary vectorial force */
1151 tx
= _mm_mul_ps(fscal
,dx23
);
1152 ty
= _mm_mul_ps(fscal
,dy23
);
1153 tz
= _mm_mul_ps(fscal
,dz23
);
1155 /* Update vectorial force */
1156 fix2
= _mm_add_ps(fix2
,tx
);
1157 fiy2
= _mm_add_ps(fiy2
,ty
);
1158 fiz2
= _mm_add_ps(fiz2
,tz
);
1160 fjx3
= _mm_add_ps(fjx3
,tx
);
1161 fjy3
= _mm_add_ps(fjy3
,ty
);
1162 fjz3
= _mm_add_ps(fjz3
,tz
);
1164 /**************************
1165 * CALCULATE INTERACTIONS *
1166 **************************/
1168 r31
= _mm_mul_ps(rsq31
,rinv31
);
1169 r31
= _mm_andnot_ps(dummy_mask
,r31
);
1171 /* Calculate table index by multiplying r with table scale and truncate to integer */
1172 rt
= _mm_mul_ps(r31
,vftabscale
);
1173 vfitab
= _mm_cvttps_epi32(rt
);
1174 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1175 vfitab
= _mm_slli_epi32(vfitab
,2);
1177 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1178 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1179 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1180 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1181 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1182 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1183 Heps
= _mm_mul_ps(vfeps
,H
);
1184 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1185 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1186 velec
= _mm_mul_ps(qq31
,VV
);
1187 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1188 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1192 velecsum
= _mm_add_ps(velecsum
,velec
);
1196 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1198 /* Calculate temporary vectorial force */
1199 tx
= _mm_mul_ps(fscal
,dx31
);
1200 ty
= _mm_mul_ps(fscal
,dy31
);
1201 tz
= _mm_mul_ps(fscal
,dz31
);
1203 /* Update vectorial force */
1204 fix3
= _mm_add_ps(fix3
,tx
);
1205 fiy3
= _mm_add_ps(fiy3
,ty
);
1206 fiz3
= _mm_add_ps(fiz3
,tz
);
1208 fjx1
= _mm_add_ps(fjx1
,tx
);
1209 fjy1
= _mm_add_ps(fjy1
,ty
);
1210 fjz1
= _mm_add_ps(fjz1
,tz
);
1212 /**************************
1213 * CALCULATE INTERACTIONS *
1214 **************************/
1216 r32
= _mm_mul_ps(rsq32
,rinv32
);
1217 r32
= _mm_andnot_ps(dummy_mask
,r32
);
1219 /* Calculate table index by multiplying r with table scale and truncate to integer */
1220 rt
= _mm_mul_ps(r32
,vftabscale
);
1221 vfitab
= _mm_cvttps_epi32(rt
);
1222 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1223 vfitab
= _mm_slli_epi32(vfitab
,2);
1225 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1226 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1227 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1228 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1229 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1230 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1231 Heps
= _mm_mul_ps(vfeps
,H
);
1232 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1233 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1234 velec
= _mm_mul_ps(qq32
,VV
);
1235 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1236 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
1238 /* Update potential sum for this i atom from the interaction with this j atom. */
1239 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1240 velecsum
= _mm_add_ps(velecsum
,velec
);
1244 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1246 /* Calculate temporary vectorial force */
1247 tx
= _mm_mul_ps(fscal
,dx32
);
1248 ty
= _mm_mul_ps(fscal
,dy32
);
1249 tz
= _mm_mul_ps(fscal
,dz32
);
1251 /* Update vectorial force */
1252 fix3
= _mm_add_ps(fix3
,tx
);
1253 fiy3
= _mm_add_ps(fiy3
,ty
);
1254 fiz3
= _mm_add_ps(fiz3
,tz
);
1256 fjx2
= _mm_add_ps(fjx2
,tx
);
1257 fjy2
= _mm_add_ps(fjy2
,ty
);
1258 fjz2
= _mm_add_ps(fjz2
,tz
);
1260 /**************************
1261 * CALCULATE INTERACTIONS *
1262 **************************/
1264 r33
= _mm_mul_ps(rsq33
,rinv33
);
1265 r33
= _mm_andnot_ps(dummy_mask
,r33
);
1267 /* Calculate table index by multiplying r with table scale and truncate to integer */
1268 rt
= _mm_mul_ps(r33
,vftabscale
);
1269 vfitab
= _mm_cvttps_epi32(rt
);
1270 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1271 vfitab
= _mm_slli_epi32(vfitab
,2);
1273 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1274 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1275 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1276 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1277 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1278 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1279 Heps
= _mm_mul_ps(vfeps
,H
);
1280 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1281 VV
= _mm_add_ps(Y
,_mm_mul_ps(vfeps
,Fp
));
1282 velec
= _mm_mul_ps(qq33
,VV
);
1283 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1284 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
1286 /* Update potential sum for this i atom from the interaction with this j atom. */
1287 velec
= _mm_andnot_ps(dummy_mask
,velec
);
1288 velecsum
= _mm_add_ps(velecsum
,velec
);
1292 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1294 /* Calculate temporary vectorial force */
1295 tx
= _mm_mul_ps(fscal
,dx33
);
1296 ty
= _mm_mul_ps(fscal
,dy33
);
1297 tz
= _mm_mul_ps(fscal
,dz33
);
1299 /* Update vectorial force */
1300 fix3
= _mm_add_ps(fix3
,tx
);
1301 fiy3
= _mm_add_ps(fiy3
,ty
);
1302 fiz3
= _mm_add_ps(fiz3
,tz
);
1304 fjx3
= _mm_add_ps(fjx3
,tx
);
1305 fjy3
= _mm_add_ps(fjy3
,ty
);
1306 fjz3
= _mm_add_ps(fjz3
,tz
);
1308 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1309 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1310 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1311 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1313 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1314 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1315 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1317 /* Inner loop uses 431 flops */
1320 /* End of innermost loop */
1322 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1323 f
+i_coord_offset
,fshift
+i_shift_offset
);
1326 /* Update potential energies */
1327 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1328 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1330 /* Increment number of inner iterations */
1331 inneriter
+= j_index_end
- j_index_start
;
1333 /* Outer loop uses 26 flops */
1336 /* Increment number of outer iterations */
1339 /* Update outer/inner flops */
1341 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*431);
/*
 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
 * Electrostatics interaction: CubicSplineTable
 * VdW interaction: LennardJones
 * Geometry: Water4-Water4
 * Calculate force/pot: Force
 */
1351 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse2_single
1352 (t_nblist
* gmx_restrict nlist
,
1353 rvec
* gmx_restrict xx
,
1354 rvec
* gmx_restrict ff
,
1355 struct t_forcerec
* gmx_restrict fr
,
1356 t_mdatoms
* gmx_restrict mdatoms
,
1357 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1358 t_nrnb
* gmx_restrict nrnb
)
1360 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1361 * just 0 for non-waters.
1362 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1363 * jnr indices corresponding to data put in the four positions in the SIMD register.
1365 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1366 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1367 int jnrA
,jnrB
,jnrC
,jnrD
;
1368 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1369 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1370 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1371 real rcutoff_scalar
;
1372 real
*shiftvec
,*fshift
,*x
,*f
;
1373 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1374 real scratch
[4*DIM
];
1375 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1377 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1379 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1381 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1383 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1384 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1385 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1386 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1387 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1388 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1389 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1390 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1391 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1392 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1393 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1394 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1395 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1396 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1397 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1398 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1399 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1400 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1401 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1402 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1405 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1408 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1409 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1411 __m128i ifour
= _mm_set1_epi32(4);
1412 __m128 rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1414 __m128 dummy_mask
,cutoff_mask
;
1415 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1416 __m128 one
= _mm_set1_ps(1.0);
1417 __m128 two
= _mm_set1_ps(2.0);
1423 jindex
= nlist
->jindex
;
1425 shiftidx
= nlist
->shift
;
1427 shiftvec
= fr
->shift_vec
[0];
1428 fshift
= fr
->fshift
[0];
1429 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1430 charge
= mdatoms
->chargeA
;
1431 nvdwtype
= fr
->ntype
;
1432 vdwparam
= fr
->nbfp
;
1433 vdwtype
= mdatoms
->typeA
;
1435 vftab
= kernel_data
->table_elec
->data
;
1436 vftabscale
= _mm_set1_ps(kernel_data
->table_elec
->scale
);
1438 /* Setup water-specific parameters */
1439 inr
= nlist
->iinr
[0];
1440 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1441 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1442 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1443 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1445 jq1
= _mm_set1_ps(charge
[inr
+1]);
1446 jq2
= _mm_set1_ps(charge
[inr
+2]);
1447 jq3
= _mm_set1_ps(charge
[inr
+3]);
1448 vdwjidx0A
= 2*vdwtype
[inr
+0];
1449 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1450 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1451 qq11
= _mm_mul_ps(iq1
,jq1
);
1452 qq12
= _mm_mul_ps(iq1
,jq2
);
1453 qq13
= _mm_mul_ps(iq1
,jq3
);
1454 qq21
= _mm_mul_ps(iq2
,jq1
);
1455 qq22
= _mm_mul_ps(iq2
,jq2
);
1456 qq23
= _mm_mul_ps(iq2
,jq3
);
1457 qq31
= _mm_mul_ps(iq3
,jq1
);
1458 qq32
= _mm_mul_ps(iq3
,jq2
);
1459 qq33
= _mm_mul_ps(iq3
,jq3
);
1461 /* Avoid stupid compiler warnings */
1462 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1463 j_coord_offsetA
= 0;
1464 j_coord_offsetB
= 0;
1465 j_coord_offsetC
= 0;
1466 j_coord_offsetD
= 0;
1471 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1473 scratch
[iidx
] = 0.0;
1476 /* Start outer loop over neighborlists */
1477 for(iidx
=0; iidx
<nri
; iidx
++)
1479 /* Load shift vector for this list */
1480 i_shift_offset
= DIM
*shiftidx
[iidx
];
1482 /* Load limits for loop over neighbors */
1483 j_index_start
= jindex
[iidx
];
1484 j_index_end
= jindex
[iidx
+1];
1486 /* Get outer coordinate index */
1488 i_coord_offset
= DIM
*inr
;
1490 /* Load i particle coords and add shift vector */
1491 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1492 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1494 fix0
= _mm_setzero_ps();
1495 fiy0
= _mm_setzero_ps();
1496 fiz0
= _mm_setzero_ps();
1497 fix1
= _mm_setzero_ps();
1498 fiy1
= _mm_setzero_ps();
1499 fiz1
= _mm_setzero_ps();
1500 fix2
= _mm_setzero_ps();
1501 fiy2
= _mm_setzero_ps();
1502 fiz2
= _mm_setzero_ps();
1503 fix3
= _mm_setzero_ps();
1504 fiy3
= _mm_setzero_ps();
1505 fiz3
= _mm_setzero_ps();
1507 /* Start inner kernel loop */
1508 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1511 /* Get j neighbor index, and coordinate index */
1513 jnrB
= jjnr
[jidx
+1];
1514 jnrC
= jjnr
[jidx
+2];
1515 jnrD
= jjnr
[jidx
+3];
1516 j_coord_offsetA
= DIM
*jnrA
;
1517 j_coord_offsetB
= DIM
*jnrB
;
1518 j_coord_offsetC
= DIM
*jnrC
;
1519 j_coord_offsetD
= DIM
*jnrD
;
1521 /* load j atom coordinates */
1522 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1523 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1524 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1525 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1527 /* Calculate displacement vector */
1528 dx00
= _mm_sub_ps(ix0
,jx0
);
1529 dy00
= _mm_sub_ps(iy0
,jy0
);
1530 dz00
= _mm_sub_ps(iz0
,jz0
);
1531 dx11
= _mm_sub_ps(ix1
,jx1
);
1532 dy11
= _mm_sub_ps(iy1
,jy1
);
1533 dz11
= _mm_sub_ps(iz1
,jz1
);
1534 dx12
= _mm_sub_ps(ix1
,jx2
);
1535 dy12
= _mm_sub_ps(iy1
,jy2
);
1536 dz12
= _mm_sub_ps(iz1
,jz2
);
1537 dx13
= _mm_sub_ps(ix1
,jx3
);
1538 dy13
= _mm_sub_ps(iy1
,jy3
);
1539 dz13
= _mm_sub_ps(iz1
,jz3
);
1540 dx21
= _mm_sub_ps(ix2
,jx1
);
1541 dy21
= _mm_sub_ps(iy2
,jy1
);
1542 dz21
= _mm_sub_ps(iz2
,jz1
);
1543 dx22
= _mm_sub_ps(ix2
,jx2
);
1544 dy22
= _mm_sub_ps(iy2
,jy2
);
1545 dz22
= _mm_sub_ps(iz2
,jz2
);
1546 dx23
= _mm_sub_ps(ix2
,jx3
);
1547 dy23
= _mm_sub_ps(iy2
,jy3
);
1548 dz23
= _mm_sub_ps(iz2
,jz3
);
1549 dx31
= _mm_sub_ps(ix3
,jx1
);
1550 dy31
= _mm_sub_ps(iy3
,jy1
);
1551 dz31
= _mm_sub_ps(iz3
,jz1
);
1552 dx32
= _mm_sub_ps(ix3
,jx2
);
1553 dy32
= _mm_sub_ps(iy3
,jy2
);
1554 dz32
= _mm_sub_ps(iz3
,jz2
);
1555 dx33
= _mm_sub_ps(ix3
,jx3
);
1556 dy33
= _mm_sub_ps(iy3
,jy3
);
1557 dz33
= _mm_sub_ps(iz3
,jz3
);
1559 /* Calculate squared distance and things based on it */
1560 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1561 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1562 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1563 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1564 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1565 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1566 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1567 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1568 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1569 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1571 rinv11
= sse2_invsqrt_f(rsq11
);
1572 rinv12
= sse2_invsqrt_f(rsq12
);
1573 rinv13
= sse2_invsqrt_f(rsq13
);
1574 rinv21
= sse2_invsqrt_f(rsq21
);
1575 rinv22
= sse2_invsqrt_f(rsq22
);
1576 rinv23
= sse2_invsqrt_f(rsq23
);
1577 rinv31
= sse2_invsqrt_f(rsq31
);
1578 rinv32
= sse2_invsqrt_f(rsq32
);
1579 rinv33
= sse2_invsqrt_f(rsq33
);
1581 rinvsq00
= sse2_inv_f(rsq00
);
1583 fjx0
= _mm_setzero_ps();
1584 fjy0
= _mm_setzero_ps();
1585 fjz0
= _mm_setzero_ps();
1586 fjx1
= _mm_setzero_ps();
1587 fjy1
= _mm_setzero_ps();
1588 fjz1
= _mm_setzero_ps();
1589 fjx2
= _mm_setzero_ps();
1590 fjy2
= _mm_setzero_ps();
1591 fjz2
= _mm_setzero_ps();
1592 fjx3
= _mm_setzero_ps();
1593 fjy3
= _mm_setzero_ps();
1594 fjz3
= _mm_setzero_ps();
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 /* LENNARD-JONES DISPERSION/REPULSION */
1602 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1603 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1607 /* Calculate temporary vectorial force */
1608 tx
= _mm_mul_ps(fscal
,dx00
);
1609 ty
= _mm_mul_ps(fscal
,dy00
);
1610 tz
= _mm_mul_ps(fscal
,dz00
);
1612 /* Update vectorial force */
1613 fix0
= _mm_add_ps(fix0
,tx
);
1614 fiy0
= _mm_add_ps(fiy0
,ty
);
1615 fiz0
= _mm_add_ps(fiz0
,tz
);
1617 fjx0
= _mm_add_ps(fjx0
,tx
);
1618 fjy0
= _mm_add_ps(fjy0
,ty
);
1619 fjz0
= _mm_add_ps(fjz0
,tz
);
1621 /**************************
1622 * CALCULATE INTERACTIONS *
1623 **************************/
1625 r11
= _mm_mul_ps(rsq11
,rinv11
);
1627 /* Calculate table index by multiplying r with table scale and truncate to integer */
1628 rt
= _mm_mul_ps(r11
,vftabscale
);
1629 vfitab
= _mm_cvttps_epi32(rt
);
1630 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1631 vfitab
= _mm_slli_epi32(vfitab
,2);
1633 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1634 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1635 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1636 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1637 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1638 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1639 Heps
= _mm_mul_ps(vfeps
,H
);
1640 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1641 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1642 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
1646 /* Calculate temporary vectorial force */
1647 tx
= _mm_mul_ps(fscal
,dx11
);
1648 ty
= _mm_mul_ps(fscal
,dy11
);
1649 tz
= _mm_mul_ps(fscal
,dz11
);
1651 /* Update vectorial force */
1652 fix1
= _mm_add_ps(fix1
,tx
);
1653 fiy1
= _mm_add_ps(fiy1
,ty
);
1654 fiz1
= _mm_add_ps(fiz1
,tz
);
1656 fjx1
= _mm_add_ps(fjx1
,tx
);
1657 fjy1
= _mm_add_ps(fjy1
,ty
);
1658 fjz1
= _mm_add_ps(fjz1
,tz
);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r12
= _mm_mul_ps(rsq12
,rinv12
);
1666 /* Calculate table index by multiplying r with table scale and truncate to integer */
1667 rt
= _mm_mul_ps(r12
,vftabscale
);
1668 vfitab
= _mm_cvttps_epi32(rt
);
1669 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1670 vfitab
= _mm_slli_epi32(vfitab
,2);
1672 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1673 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1674 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1675 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1676 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1677 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1678 Heps
= _mm_mul_ps(vfeps
,H
);
1679 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1680 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1681 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
1685 /* Calculate temporary vectorial force */
1686 tx
= _mm_mul_ps(fscal
,dx12
);
1687 ty
= _mm_mul_ps(fscal
,dy12
);
1688 tz
= _mm_mul_ps(fscal
,dz12
);
1690 /* Update vectorial force */
1691 fix1
= _mm_add_ps(fix1
,tx
);
1692 fiy1
= _mm_add_ps(fiy1
,ty
);
1693 fiz1
= _mm_add_ps(fiz1
,tz
);
1695 fjx2
= _mm_add_ps(fjx2
,tx
);
1696 fjy2
= _mm_add_ps(fjy2
,ty
);
1697 fjz2
= _mm_add_ps(fjz2
,tz
);
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 r13
= _mm_mul_ps(rsq13
,rinv13
);
1705 /* Calculate table index by multiplying r with table scale and truncate to integer */
1706 rt
= _mm_mul_ps(r13
,vftabscale
);
1707 vfitab
= _mm_cvttps_epi32(rt
);
1708 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1709 vfitab
= _mm_slli_epi32(vfitab
,2);
1711 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1712 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1713 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1714 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1715 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1716 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1717 Heps
= _mm_mul_ps(vfeps
,H
);
1718 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1719 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1720 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
1724 /* Calculate temporary vectorial force */
1725 tx
= _mm_mul_ps(fscal
,dx13
);
1726 ty
= _mm_mul_ps(fscal
,dy13
);
1727 tz
= _mm_mul_ps(fscal
,dz13
);
1729 /* Update vectorial force */
1730 fix1
= _mm_add_ps(fix1
,tx
);
1731 fiy1
= _mm_add_ps(fiy1
,ty
);
1732 fiz1
= _mm_add_ps(fiz1
,tz
);
1734 fjx3
= _mm_add_ps(fjx3
,tx
);
1735 fjy3
= _mm_add_ps(fjy3
,ty
);
1736 fjz3
= _mm_add_ps(fjz3
,tz
);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 r21
= _mm_mul_ps(rsq21
,rinv21
);
1744 /* Calculate table index by multiplying r with table scale and truncate to integer */
1745 rt
= _mm_mul_ps(r21
,vftabscale
);
1746 vfitab
= _mm_cvttps_epi32(rt
);
1747 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1748 vfitab
= _mm_slli_epi32(vfitab
,2);
1750 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1751 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1752 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1753 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1754 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1755 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1756 Heps
= _mm_mul_ps(vfeps
,H
);
1757 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1758 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1759 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
1763 /* Calculate temporary vectorial force */
1764 tx
= _mm_mul_ps(fscal
,dx21
);
1765 ty
= _mm_mul_ps(fscal
,dy21
);
1766 tz
= _mm_mul_ps(fscal
,dz21
);
1768 /* Update vectorial force */
1769 fix2
= _mm_add_ps(fix2
,tx
);
1770 fiy2
= _mm_add_ps(fiy2
,ty
);
1771 fiz2
= _mm_add_ps(fiz2
,tz
);
1773 fjx1
= _mm_add_ps(fjx1
,tx
);
1774 fjy1
= _mm_add_ps(fjy1
,ty
);
1775 fjz1
= _mm_add_ps(fjz1
,tz
);
1777 /**************************
1778 * CALCULATE INTERACTIONS *
1779 **************************/
1781 r22
= _mm_mul_ps(rsq22
,rinv22
);
1783 /* Calculate table index by multiplying r with table scale and truncate to integer */
1784 rt
= _mm_mul_ps(r22
,vftabscale
);
1785 vfitab
= _mm_cvttps_epi32(rt
);
1786 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1787 vfitab
= _mm_slli_epi32(vfitab
,2);
1789 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1790 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1791 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1792 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1793 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1794 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1795 Heps
= _mm_mul_ps(vfeps
,H
);
1796 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1797 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1798 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
1802 /* Calculate temporary vectorial force */
1803 tx
= _mm_mul_ps(fscal
,dx22
);
1804 ty
= _mm_mul_ps(fscal
,dy22
);
1805 tz
= _mm_mul_ps(fscal
,dz22
);
1807 /* Update vectorial force */
1808 fix2
= _mm_add_ps(fix2
,tx
);
1809 fiy2
= _mm_add_ps(fiy2
,ty
);
1810 fiz2
= _mm_add_ps(fiz2
,tz
);
1812 fjx2
= _mm_add_ps(fjx2
,tx
);
1813 fjy2
= _mm_add_ps(fjy2
,ty
);
1814 fjz2
= _mm_add_ps(fjz2
,tz
);
1816 /**************************
1817 * CALCULATE INTERACTIONS *
1818 **************************/
1820 r23
= _mm_mul_ps(rsq23
,rinv23
);
1822 /* Calculate table index by multiplying r with table scale and truncate to integer */
1823 rt
= _mm_mul_ps(r23
,vftabscale
);
1824 vfitab
= _mm_cvttps_epi32(rt
);
1825 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1826 vfitab
= _mm_slli_epi32(vfitab
,2);
1828 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1829 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1830 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1831 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1832 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1833 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1834 Heps
= _mm_mul_ps(vfeps
,H
);
1835 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1836 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1837 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
1841 /* Calculate temporary vectorial force */
1842 tx
= _mm_mul_ps(fscal
,dx23
);
1843 ty
= _mm_mul_ps(fscal
,dy23
);
1844 tz
= _mm_mul_ps(fscal
,dz23
);
1846 /* Update vectorial force */
1847 fix2
= _mm_add_ps(fix2
,tx
);
1848 fiy2
= _mm_add_ps(fiy2
,ty
);
1849 fiz2
= _mm_add_ps(fiz2
,tz
);
1851 fjx3
= _mm_add_ps(fjx3
,tx
);
1852 fjy3
= _mm_add_ps(fjy3
,ty
);
1853 fjz3
= _mm_add_ps(fjz3
,tz
);
1855 /**************************
1856 * CALCULATE INTERACTIONS *
1857 **************************/
1859 r31
= _mm_mul_ps(rsq31
,rinv31
);
1861 /* Calculate table index by multiplying r with table scale and truncate to integer */
1862 rt
= _mm_mul_ps(r31
,vftabscale
);
1863 vfitab
= _mm_cvttps_epi32(rt
);
1864 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1865 vfitab
= _mm_slli_epi32(vfitab
,2);
1867 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1868 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1869 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1870 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1871 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1872 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1873 Heps
= _mm_mul_ps(vfeps
,H
);
1874 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1875 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1876 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
1880 /* Calculate temporary vectorial force */
1881 tx
= _mm_mul_ps(fscal
,dx31
);
1882 ty
= _mm_mul_ps(fscal
,dy31
);
1883 tz
= _mm_mul_ps(fscal
,dz31
);
1885 /* Update vectorial force */
1886 fix3
= _mm_add_ps(fix3
,tx
);
1887 fiy3
= _mm_add_ps(fiy3
,ty
);
1888 fiz3
= _mm_add_ps(fiz3
,tz
);
1890 fjx1
= _mm_add_ps(fjx1
,tx
);
1891 fjy1
= _mm_add_ps(fjy1
,ty
);
1892 fjz1
= _mm_add_ps(fjz1
,tz
);
1894 /**************************
1895 * CALCULATE INTERACTIONS *
1896 **************************/
1898 r32
= _mm_mul_ps(rsq32
,rinv32
);
1900 /* Calculate table index by multiplying r with table scale and truncate to integer */
1901 rt
= _mm_mul_ps(r32
,vftabscale
);
1902 vfitab
= _mm_cvttps_epi32(rt
);
1903 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1904 vfitab
= _mm_slli_epi32(vfitab
,2);
1906 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1907 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1908 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1909 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1910 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1911 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1912 Heps
= _mm_mul_ps(vfeps
,H
);
1913 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1914 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1915 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
1919 /* Calculate temporary vectorial force */
1920 tx
= _mm_mul_ps(fscal
,dx32
);
1921 ty
= _mm_mul_ps(fscal
,dy32
);
1922 tz
= _mm_mul_ps(fscal
,dz32
);
1924 /* Update vectorial force */
1925 fix3
= _mm_add_ps(fix3
,tx
);
1926 fiy3
= _mm_add_ps(fiy3
,ty
);
1927 fiz3
= _mm_add_ps(fiz3
,tz
);
1929 fjx2
= _mm_add_ps(fjx2
,tx
);
1930 fjy2
= _mm_add_ps(fjy2
,ty
);
1931 fjz2
= _mm_add_ps(fjz2
,tz
);
1933 /**************************
1934 * CALCULATE INTERACTIONS *
1935 **************************/
1937 r33
= _mm_mul_ps(rsq33
,rinv33
);
1939 /* Calculate table index by multiplying r with table scale and truncate to integer */
1940 rt
= _mm_mul_ps(r33
,vftabscale
);
1941 vfitab
= _mm_cvttps_epi32(rt
);
1942 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
1943 vfitab
= _mm_slli_epi32(vfitab
,2);
1945 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1946 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1947 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1948 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
1949 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
1950 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
1951 Heps
= _mm_mul_ps(vfeps
,H
);
1952 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
1953 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
1954 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
1958 /* Calculate temporary vectorial force */
1959 tx
= _mm_mul_ps(fscal
,dx33
);
1960 ty
= _mm_mul_ps(fscal
,dy33
);
1961 tz
= _mm_mul_ps(fscal
,dz33
);
1963 /* Update vectorial force */
1964 fix3
= _mm_add_ps(fix3
,tx
);
1965 fiy3
= _mm_add_ps(fiy3
,ty
);
1966 fiz3
= _mm_add_ps(fiz3
,tz
);
1968 fjx3
= _mm_add_ps(fjx3
,tx
);
1969 fjy3
= _mm_add_ps(fjy3
,ty
);
1970 fjz3
= _mm_add_ps(fjz3
,tz
);
1972 fjptrA
= f
+j_coord_offsetA
;
1973 fjptrB
= f
+j_coord_offsetB
;
1974 fjptrC
= f
+j_coord_offsetC
;
1975 fjptrD
= f
+j_coord_offsetD
;
1977 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1978 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1979 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1981 /* Inner loop uses 381 flops */
1984 if(jidx
<j_index_end
)
1987 /* Get j neighbor index, and coordinate index */
1988 jnrlistA
= jjnr
[jidx
];
1989 jnrlistB
= jjnr
[jidx
+1];
1990 jnrlistC
= jjnr
[jidx
+2];
1991 jnrlistD
= jjnr
[jidx
+3];
1992 /* Sign of each element will be negative for non-real atoms.
1993 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1994 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1996 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1997 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1998 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1999 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
2000 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
2001 j_coord_offsetA
= DIM
*jnrA
;
2002 j_coord_offsetB
= DIM
*jnrB
;
2003 j_coord_offsetC
= DIM
*jnrC
;
2004 j_coord_offsetD
= DIM
*jnrD
;
2006 /* load j atom coordinates */
2007 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
2008 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
2009 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
2010 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
2012 /* Calculate displacement vector */
2013 dx00
= _mm_sub_ps(ix0
,jx0
);
2014 dy00
= _mm_sub_ps(iy0
,jy0
);
2015 dz00
= _mm_sub_ps(iz0
,jz0
);
2016 dx11
= _mm_sub_ps(ix1
,jx1
);
2017 dy11
= _mm_sub_ps(iy1
,jy1
);
2018 dz11
= _mm_sub_ps(iz1
,jz1
);
2019 dx12
= _mm_sub_ps(ix1
,jx2
);
2020 dy12
= _mm_sub_ps(iy1
,jy2
);
2021 dz12
= _mm_sub_ps(iz1
,jz2
);
2022 dx13
= _mm_sub_ps(ix1
,jx3
);
2023 dy13
= _mm_sub_ps(iy1
,jy3
);
2024 dz13
= _mm_sub_ps(iz1
,jz3
);
2025 dx21
= _mm_sub_ps(ix2
,jx1
);
2026 dy21
= _mm_sub_ps(iy2
,jy1
);
2027 dz21
= _mm_sub_ps(iz2
,jz1
);
2028 dx22
= _mm_sub_ps(ix2
,jx2
);
2029 dy22
= _mm_sub_ps(iy2
,jy2
);
2030 dz22
= _mm_sub_ps(iz2
,jz2
);
2031 dx23
= _mm_sub_ps(ix2
,jx3
);
2032 dy23
= _mm_sub_ps(iy2
,jy3
);
2033 dz23
= _mm_sub_ps(iz2
,jz3
);
2034 dx31
= _mm_sub_ps(ix3
,jx1
);
2035 dy31
= _mm_sub_ps(iy3
,jy1
);
2036 dz31
= _mm_sub_ps(iz3
,jz1
);
2037 dx32
= _mm_sub_ps(ix3
,jx2
);
2038 dy32
= _mm_sub_ps(iy3
,jy2
);
2039 dz32
= _mm_sub_ps(iz3
,jz2
);
2040 dx33
= _mm_sub_ps(ix3
,jx3
);
2041 dy33
= _mm_sub_ps(iy3
,jy3
);
2042 dz33
= _mm_sub_ps(iz3
,jz3
);
2044 /* Calculate squared distance and things based on it */
2045 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
2046 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
2047 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
2048 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
2049 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
2050 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
2051 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
2052 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
2053 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
2054 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
2056 rinv11
= sse2_invsqrt_f(rsq11
);
2057 rinv12
= sse2_invsqrt_f(rsq12
);
2058 rinv13
= sse2_invsqrt_f(rsq13
);
2059 rinv21
= sse2_invsqrt_f(rsq21
);
2060 rinv22
= sse2_invsqrt_f(rsq22
);
2061 rinv23
= sse2_invsqrt_f(rsq23
);
2062 rinv31
= sse2_invsqrt_f(rsq31
);
2063 rinv32
= sse2_invsqrt_f(rsq32
);
2064 rinv33
= sse2_invsqrt_f(rsq33
);
2066 rinvsq00
= sse2_inv_f(rsq00
);
2068 fjx0
= _mm_setzero_ps();
2069 fjy0
= _mm_setzero_ps();
2070 fjz0
= _mm_setzero_ps();
2071 fjx1
= _mm_setzero_ps();
2072 fjy1
= _mm_setzero_ps();
2073 fjz1
= _mm_setzero_ps();
2074 fjx2
= _mm_setzero_ps();
2075 fjy2
= _mm_setzero_ps();
2076 fjz2
= _mm_setzero_ps();
2077 fjx3
= _mm_setzero_ps();
2078 fjy3
= _mm_setzero_ps();
2079 fjz3
= _mm_setzero_ps();
2081 /**************************
2082 * CALCULATE INTERACTIONS *
2083 **************************/
2085 /* LENNARD-JONES DISPERSION/REPULSION */
2087 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
2088 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
2092 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2094 /* Calculate temporary vectorial force */
2095 tx
= _mm_mul_ps(fscal
,dx00
);
2096 ty
= _mm_mul_ps(fscal
,dy00
);
2097 tz
= _mm_mul_ps(fscal
,dz00
);
2099 /* Update vectorial force */
2100 fix0
= _mm_add_ps(fix0
,tx
);
2101 fiy0
= _mm_add_ps(fiy0
,ty
);
2102 fiz0
= _mm_add_ps(fiz0
,tz
);
2104 fjx0
= _mm_add_ps(fjx0
,tx
);
2105 fjy0
= _mm_add_ps(fjy0
,ty
);
2106 fjz0
= _mm_add_ps(fjz0
,tz
);
2108 /**************************
2109 * CALCULATE INTERACTIONS *
2110 **************************/
2112 r11
= _mm_mul_ps(rsq11
,rinv11
);
2113 r11
= _mm_andnot_ps(dummy_mask
,r11
);
2115 /* Calculate table index by multiplying r with table scale and truncate to integer */
2116 rt
= _mm_mul_ps(r11
,vftabscale
);
2117 vfitab
= _mm_cvttps_epi32(rt
);
2118 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2119 vfitab
= _mm_slli_epi32(vfitab
,2);
2121 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2122 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2123 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2124 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2125 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2126 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2127 Heps
= _mm_mul_ps(vfeps
,H
);
2128 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2129 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2130 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq11
,FF
),_mm_mul_ps(vftabscale
,rinv11
)));
2134 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2136 /* Calculate temporary vectorial force */
2137 tx
= _mm_mul_ps(fscal
,dx11
);
2138 ty
= _mm_mul_ps(fscal
,dy11
);
2139 tz
= _mm_mul_ps(fscal
,dz11
);
2141 /* Update vectorial force */
2142 fix1
= _mm_add_ps(fix1
,tx
);
2143 fiy1
= _mm_add_ps(fiy1
,ty
);
2144 fiz1
= _mm_add_ps(fiz1
,tz
);
2146 fjx1
= _mm_add_ps(fjx1
,tx
);
2147 fjy1
= _mm_add_ps(fjy1
,ty
);
2148 fjz1
= _mm_add_ps(fjz1
,tz
);
2150 /**************************
2151 * CALCULATE INTERACTIONS *
2152 **************************/
2154 r12
= _mm_mul_ps(rsq12
,rinv12
);
2155 r12
= _mm_andnot_ps(dummy_mask
,r12
);
2157 /* Calculate table index by multiplying r with table scale and truncate to integer */
2158 rt
= _mm_mul_ps(r12
,vftabscale
);
2159 vfitab
= _mm_cvttps_epi32(rt
);
2160 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2161 vfitab
= _mm_slli_epi32(vfitab
,2);
2163 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2164 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2165 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2166 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2167 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2168 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2169 Heps
= _mm_mul_ps(vfeps
,H
);
2170 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2171 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2172 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq12
,FF
),_mm_mul_ps(vftabscale
,rinv12
)));
2176 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2178 /* Calculate temporary vectorial force */
2179 tx
= _mm_mul_ps(fscal
,dx12
);
2180 ty
= _mm_mul_ps(fscal
,dy12
);
2181 tz
= _mm_mul_ps(fscal
,dz12
);
2183 /* Update vectorial force */
2184 fix1
= _mm_add_ps(fix1
,tx
);
2185 fiy1
= _mm_add_ps(fiy1
,ty
);
2186 fiz1
= _mm_add_ps(fiz1
,tz
);
2188 fjx2
= _mm_add_ps(fjx2
,tx
);
2189 fjy2
= _mm_add_ps(fjy2
,ty
);
2190 fjz2
= _mm_add_ps(fjz2
,tz
);
2192 /**************************
2193 * CALCULATE INTERACTIONS *
2194 **************************/
2196 r13
= _mm_mul_ps(rsq13
,rinv13
);
2197 r13
= _mm_andnot_ps(dummy_mask
,r13
);
2199 /* Calculate table index by multiplying r with table scale and truncate to integer */
2200 rt
= _mm_mul_ps(r13
,vftabscale
);
2201 vfitab
= _mm_cvttps_epi32(rt
);
2202 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2203 vfitab
= _mm_slli_epi32(vfitab
,2);
2205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2206 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2207 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2208 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2209 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2210 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2211 Heps
= _mm_mul_ps(vfeps
,H
);
2212 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2213 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2214 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq13
,FF
),_mm_mul_ps(vftabscale
,rinv13
)));
2218 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2220 /* Calculate temporary vectorial force */
2221 tx
= _mm_mul_ps(fscal
,dx13
);
2222 ty
= _mm_mul_ps(fscal
,dy13
);
2223 tz
= _mm_mul_ps(fscal
,dz13
);
2225 /* Update vectorial force */
2226 fix1
= _mm_add_ps(fix1
,tx
);
2227 fiy1
= _mm_add_ps(fiy1
,ty
);
2228 fiz1
= _mm_add_ps(fiz1
,tz
);
2230 fjx3
= _mm_add_ps(fjx3
,tx
);
2231 fjy3
= _mm_add_ps(fjy3
,ty
);
2232 fjz3
= _mm_add_ps(fjz3
,tz
);
2234 /**************************
2235 * CALCULATE INTERACTIONS *
2236 **************************/
2238 r21
= _mm_mul_ps(rsq21
,rinv21
);
2239 r21
= _mm_andnot_ps(dummy_mask
,r21
);
2241 /* Calculate table index by multiplying r with table scale and truncate to integer */
2242 rt
= _mm_mul_ps(r21
,vftabscale
);
2243 vfitab
= _mm_cvttps_epi32(rt
);
2244 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2245 vfitab
= _mm_slli_epi32(vfitab
,2);
2247 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2248 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2249 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2250 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2251 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2252 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2253 Heps
= _mm_mul_ps(vfeps
,H
);
2254 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2255 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2256 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq21
,FF
),_mm_mul_ps(vftabscale
,rinv21
)));
2260 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2262 /* Calculate temporary vectorial force */
2263 tx
= _mm_mul_ps(fscal
,dx21
);
2264 ty
= _mm_mul_ps(fscal
,dy21
);
2265 tz
= _mm_mul_ps(fscal
,dz21
);
2267 /* Update vectorial force */
2268 fix2
= _mm_add_ps(fix2
,tx
);
2269 fiy2
= _mm_add_ps(fiy2
,ty
);
2270 fiz2
= _mm_add_ps(fiz2
,tz
);
2272 fjx1
= _mm_add_ps(fjx1
,tx
);
2273 fjy1
= _mm_add_ps(fjy1
,ty
);
2274 fjz1
= _mm_add_ps(fjz1
,tz
);
2276 /**************************
2277 * CALCULATE INTERACTIONS *
2278 **************************/
2280 r22
= _mm_mul_ps(rsq22
,rinv22
);
2281 r22
= _mm_andnot_ps(dummy_mask
,r22
);
2283 /* Calculate table index by multiplying r with table scale and truncate to integer */
2284 rt
= _mm_mul_ps(r22
,vftabscale
);
2285 vfitab
= _mm_cvttps_epi32(rt
);
2286 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2287 vfitab
= _mm_slli_epi32(vfitab
,2);
2289 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2290 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2291 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2292 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2293 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2294 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2295 Heps
= _mm_mul_ps(vfeps
,H
);
2296 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2297 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2298 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq22
,FF
),_mm_mul_ps(vftabscale
,rinv22
)));
2302 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2304 /* Calculate temporary vectorial force */
2305 tx
= _mm_mul_ps(fscal
,dx22
);
2306 ty
= _mm_mul_ps(fscal
,dy22
);
2307 tz
= _mm_mul_ps(fscal
,dz22
);
2309 /* Update vectorial force */
2310 fix2
= _mm_add_ps(fix2
,tx
);
2311 fiy2
= _mm_add_ps(fiy2
,ty
);
2312 fiz2
= _mm_add_ps(fiz2
,tz
);
2314 fjx2
= _mm_add_ps(fjx2
,tx
);
2315 fjy2
= _mm_add_ps(fjy2
,ty
);
2316 fjz2
= _mm_add_ps(fjz2
,tz
);
2318 /**************************
2319 * CALCULATE INTERACTIONS *
2320 **************************/
2322 r23
= _mm_mul_ps(rsq23
,rinv23
);
2323 r23
= _mm_andnot_ps(dummy_mask
,r23
);
2325 /* Calculate table index by multiplying r with table scale and truncate to integer */
2326 rt
= _mm_mul_ps(r23
,vftabscale
);
2327 vfitab
= _mm_cvttps_epi32(rt
);
2328 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2329 vfitab
= _mm_slli_epi32(vfitab
,2);
2331 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2332 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2333 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2334 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2335 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2336 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2337 Heps
= _mm_mul_ps(vfeps
,H
);
2338 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2339 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2340 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq23
,FF
),_mm_mul_ps(vftabscale
,rinv23
)));
2344 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2346 /* Calculate temporary vectorial force */
2347 tx
= _mm_mul_ps(fscal
,dx23
);
2348 ty
= _mm_mul_ps(fscal
,dy23
);
2349 tz
= _mm_mul_ps(fscal
,dz23
);
2351 /* Update vectorial force */
2352 fix2
= _mm_add_ps(fix2
,tx
);
2353 fiy2
= _mm_add_ps(fiy2
,ty
);
2354 fiz2
= _mm_add_ps(fiz2
,tz
);
2356 fjx3
= _mm_add_ps(fjx3
,tx
);
2357 fjy3
= _mm_add_ps(fjy3
,ty
);
2358 fjz3
= _mm_add_ps(fjz3
,tz
);
2360 /**************************
2361 * CALCULATE INTERACTIONS *
2362 **************************/
2364 r31
= _mm_mul_ps(rsq31
,rinv31
);
2365 r31
= _mm_andnot_ps(dummy_mask
,r31
);
2367 /* Calculate table index by multiplying r with table scale and truncate to integer */
2368 rt
= _mm_mul_ps(r31
,vftabscale
);
2369 vfitab
= _mm_cvttps_epi32(rt
);
2370 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2371 vfitab
= _mm_slli_epi32(vfitab
,2);
2373 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2374 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2375 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2376 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2377 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2378 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2379 Heps
= _mm_mul_ps(vfeps
,H
);
2380 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2381 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2382 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq31
,FF
),_mm_mul_ps(vftabscale
,rinv31
)));
2386 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2388 /* Calculate temporary vectorial force */
2389 tx
= _mm_mul_ps(fscal
,dx31
);
2390 ty
= _mm_mul_ps(fscal
,dy31
);
2391 tz
= _mm_mul_ps(fscal
,dz31
);
2393 /* Update vectorial force */
2394 fix3
= _mm_add_ps(fix3
,tx
);
2395 fiy3
= _mm_add_ps(fiy3
,ty
);
2396 fiz3
= _mm_add_ps(fiz3
,tz
);
2398 fjx1
= _mm_add_ps(fjx1
,tx
);
2399 fjy1
= _mm_add_ps(fjy1
,ty
);
2400 fjz1
= _mm_add_ps(fjz1
,tz
);
2402 /**************************
2403 * CALCULATE INTERACTIONS *
2404 **************************/
2406 r32
= _mm_mul_ps(rsq32
,rinv32
);
2407 r32
= _mm_andnot_ps(dummy_mask
,r32
);
2409 /* Calculate table index by multiplying r with table scale and truncate to integer */
2410 rt
= _mm_mul_ps(r32
,vftabscale
);
2411 vfitab
= _mm_cvttps_epi32(rt
);
2412 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2413 vfitab
= _mm_slli_epi32(vfitab
,2);
2415 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2416 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2417 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2418 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2419 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2420 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2421 Heps
= _mm_mul_ps(vfeps
,H
);
2422 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2423 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2424 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq32
,FF
),_mm_mul_ps(vftabscale
,rinv32
)));
2428 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2430 /* Calculate temporary vectorial force */
2431 tx
= _mm_mul_ps(fscal
,dx32
);
2432 ty
= _mm_mul_ps(fscal
,dy32
);
2433 tz
= _mm_mul_ps(fscal
,dz32
);
2435 /* Update vectorial force */
2436 fix3
= _mm_add_ps(fix3
,tx
);
2437 fiy3
= _mm_add_ps(fiy3
,ty
);
2438 fiz3
= _mm_add_ps(fiz3
,tz
);
2440 fjx2
= _mm_add_ps(fjx2
,tx
);
2441 fjy2
= _mm_add_ps(fjy2
,ty
);
2442 fjz2
= _mm_add_ps(fjz2
,tz
);
2444 /**************************
2445 * CALCULATE INTERACTIONS *
2446 **************************/
2448 r33
= _mm_mul_ps(rsq33
,rinv33
);
2449 r33
= _mm_andnot_ps(dummy_mask
,r33
);
2451 /* Calculate table index by multiplying r with table scale and truncate to integer */
2452 rt
= _mm_mul_ps(r33
,vftabscale
);
2453 vfitab
= _mm_cvttps_epi32(rt
);
2454 vfeps
= _mm_sub_ps(rt
,_mm_cvtepi32_ps(vfitab
));
2455 vfitab
= _mm_slli_epi32(vfitab
,2);
2457 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2458 Y
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2459 F
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
2460 G
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,2) );
2461 H
= _mm_load_ps( vftab
+ gmx_mm_extract_epi32(vfitab
,3) );
2462 _MM_TRANSPOSE4_PS(Y
,F
,G
,H
);
2463 Heps
= _mm_mul_ps(vfeps
,H
);
2464 Fp
= _mm_add_ps(F
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,Heps
)));
2465 FF
= _mm_add_ps(Fp
,_mm_mul_ps(vfeps
,_mm_add_ps(G
,_mm_add_ps(Heps
,Heps
))));
2466 felec
= _mm_xor_ps(signbit
,_mm_mul_ps(_mm_mul_ps(qq33
,FF
),_mm_mul_ps(vftabscale
,rinv33
)));
2470 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
2472 /* Calculate temporary vectorial force */
2473 tx
= _mm_mul_ps(fscal
,dx33
);
2474 ty
= _mm_mul_ps(fscal
,dy33
);
2475 tz
= _mm_mul_ps(fscal
,dz33
);
2477 /* Update vectorial force */
2478 fix3
= _mm_add_ps(fix3
,tx
);
2479 fiy3
= _mm_add_ps(fiy3
,ty
);
2480 fiz3
= _mm_add_ps(fiz3
,tz
);
2482 fjx3
= _mm_add_ps(fjx3
,tx
);
2483 fjy3
= _mm_add_ps(fjy3
,ty
);
2484 fjz3
= _mm_add_ps(fjz3
,tz
);
2486 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
2487 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
2488 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
2489 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
2491 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
2492 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
2493 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2495 /* Inner loop uses 390 flops */
2498 /* End of innermost loop */
2500 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2501 f
+i_coord_offset
,fshift
+i_shift_offset
);
2503 /* Increment number of inner iterations */
2504 inneriter
+= j_index_end
- j_index_start
;
2506 /* Outer loop uses 24 flops */
2509 /* Increment number of outer iterations */
2512 /* Update outer/inner flops */
2514 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*390);