Remove nb-parameters from t_forcerec
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_128_fma_single.c
blob07d622bcd50819b4064804b83d5a55087fbc824a
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to the j-loop unrolling done with AVX_128, i.e. the four different
69 * jnr indices whose data is placed in the four positions of the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real rcutoff_scalar;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80 real scratch[4*DIM];
81 __m128 fscal,rcutoff,rcutoff2,jidxall;
82 int vdwioffset0;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84 int vdwioffset1;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 int vdwioffset2;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwioffset3;
89 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
109 real *charge;
110 int nvdwtype;
111 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 int *vdwtype;
113 real *vdwparam;
114 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
115 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
116 __m128i vfitab;
117 __m128i ifour = _mm_set1_epi32(4);
118 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
119 real *vftab;
120 __m128 dummy_mask,cutoff_mask;
121 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122 __m128 one = _mm_set1_ps(1.0);
123 __m128 two = _mm_set1_ps(2.0);
124 x = xx[0];
125 f = ff[0];
127 nri = nlist->nri;
128 iinr = nlist->iinr;
129 jindex = nlist->jindex;
130 jjnr = nlist->jjnr;
131 shiftidx = nlist->shift;
132 gid = nlist->gid;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm_set1_ps(fr->ic->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
138 vdwparam = fr->nbfp;
139 vdwtype = mdatoms->typeA;
141 vftab = kernel_data->table_elec_vdw->data;
142 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
149 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
151 jq1 = _mm_set1_ps(charge[inr+1]);
152 jq2 = _mm_set1_ps(charge[inr+2]);
153 jq3 = _mm_set1_ps(charge[inr+3]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157 qq11 = _mm_mul_ps(iq1,jq1);
158 qq12 = _mm_mul_ps(iq1,jq2);
159 qq13 = _mm_mul_ps(iq1,jq3);
160 qq21 = _mm_mul_ps(iq2,jq1);
161 qq22 = _mm_mul_ps(iq2,jq2);
162 qq23 = _mm_mul_ps(iq2,jq3);
163 qq31 = _mm_mul_ps(iq3,jq1);
164 qq32 = _mm_mul_ps(iq3,jq2);
165 qq33 = _mm_mul_ps(iq3,jq3);
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = 0;
169 j_coord_offsetA = 0;
170 j_coord_offsetB = 0;
171 j_coord_offsetC = 0;
172 j_coord_offsetD = 0;
174 outeriter = 0;
175 inneriter = 0;
177 for(iidx=0;iidx<4*DIM;iidx++)
179 scratch[iidx] = 0.0;
182 /* Start outer loop over neighborlists */
183 for(iidx=0; iidx<nri; iidx++)
185 /* Load shift vector for this list */
186 i_shift_offset = DIM*shiftidx[iidx];
188 /* Load limits for loop over neighbors */
189 j_index_start = jindex[iidx];
190 j_index_end = jindex[iidx+1];
192 /* Get outer coordinate index */
193 inr = iinr[iidx];
194 i_coord_offset = DIM*inr;
196 /* Load i particle coords and add shift vector */
197 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
200 fix0 = _mm_setzero_ps();
201 fiy0 = _mm_setzero_ps();
202 fiz0 = _mm_setzero_ps();
203 fix1 = _mm_setzero_ps();
204 fiy1 = _mm_setzero_ps();
205 fiz1 = _mm_setzero_ps();
206 fix2 = _mm_setzero_ps();
207 fiy2 = _mm_setzero_ps();
208 fiz2 = _mm_setzero_ps();
209 fix3 = _mm_setzero_ps();
210 fiy3 = _mm_setzero_ps();
211 fiz3 = _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum = _mm_setzero_ps();
215 vvdwsum = _mm_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221 /* Get j neighbor index, and coordinate index */
222 jnrA = jjnr[jidx];
223 jnrB = jjnr[jidx+1];
224 jnrC = jjnr[jidx+2];
225 jnrD = jjnr[jidx+3];
226 j_coord_offsetA = DIM*jnrA;
227 j_coord_offsetB = DIM*jnrB;
228 j_coord_offsetC = DIM*jnrC;
229 j_coord_offsetD = DIM*jnrD;
231 /* load j atom coordinates */
232 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233 x+j_coord_offsetC,x+j_coord_offsetD,
234 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235 &jy2,&jz2,&jx3,&jy3,&jz3);
237 /* Calculate displacement vector */
238 dx00 = _mm_sub_ps(ix0,jx0);
239 dy00 = _mm_sub_ps(iy0,jy0);
240 dz00 = _mm_sub_ps(iz0,jz0);
241 dx11 = _mm_sub_ps(ix1,jx1);
242 dy11 = _mm_sub_ps(iy1,jy1);
243 dz11 = _mm_sub_ps(iz1,jz1);
244 dx12 = _mm_sub_ps(ix1,jx2);
245 dy12 = _mm_sub_ps(iy1,jy2);
246 dz12 = _mm_sub_ps(iz1,jz2);
247 dx13 = _mm_sub_ps(ix1,jx3);
248 dy13 = _mm_sub_ps(iy1,jy3);
249 dz13 = _mm_sub_ps(iz1,jz3);
250 dx21 = _mm_sub_ps(ix2,jx1);
251 dy21 = _mm_sub_ps(iy2,jy1);
252 dz21 = _mm_sub_ps(iz2,jz1);
253 dx22 = _mm_sub_ps(ix2,jx2);
254 dy22 = _mm_sub_ps(iy2,jy2);
255 dz22 = _mm_sub_ps(iz2,jz2);
256 dx23 = _mm_sub_ps(ix2,jx3);
257 dy23 = _mm_sub_ps(iy2,jy3);
258 dz23 = _mm_sub_ps(iz2,jz3);
259 dx31 = _mm_sub_ps(ix3,jx1);
260 dy31 = _mm_sub_ps(iy3,jy1);
261 dz31 = _mm_sub_ps(iz3,jz1);
262 dx32 = _mm_sub_ps(ix3,jx2);
263 dy32 = _mm_sub_ps(iy3,jy2);
264 dz32 = _mm_sub_ps(iz3,jz2);
265 dx33 = _mm_sub_ps(ix3,jx3);
266 dy33 = _mm_sub_ps(iy3,jy3);
267 dz33 = _mm_sub_ps(iz3,jz3);
269 /* Calculate squared distance and things based on it */
270 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
274 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
277 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
278 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
279 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
281 rinv00 = avx128fma_invsqrt_f(rsq00);
282 rinv11 = avx128fma_invsqrt_f(rsq11);
283 rinv12 = avx128fma_invsqrt_f(rsq12);
284 rinv13 = avx128fma_invsqrt_f(rsq13);
285 rinv21 = avx128fma_invsqrt_f(rsq21);
286 rinv22 = avx128fma_invsqrt_f(rsq22);
287 rinv23 = avx128fma_invsqrt_f(rsq23);
288 rinv31 = avx128fma_invsqrt_f(rsq31);
289 rinv32 = avx128fma_invsqrt_f(rsq32);
290 rinv33 = avx128fma_invsqrt_f(rsq33);
292 fjx0 = _mm_setzero_ps();
293 fjy0 = _mm_setzero_ps();
294 fjz0 = _mm_setzero_ps();
295 fjx1 = _mm_setzero_ps();
296 fjy1 = _mm_setzero_ps();
297 fjz1 = _mm_setzero_ps();
298 fjx2 = _mm_setzero_ps();
299 fjy2 = _mm_setzero_ps();
300 fjz2 = _mm_setzero_ps();
301 fjx3 = _mm_setzero_ps();
302 fjy3 = _mm_setzero_ps();
303 fjz3 = _mm_setzero_ps();
305 /**************************
306 * CALCULATE INTERACTIONS *
307 **************************/
309 r00 = _mm_mul_ps(rsq00,rinv00);
311 /* Calculate table index by multiplying r with table scale and truncate to integer */
312 rt = _mm_mul_ps(r00,vftabscale);
313 vfitab = _mm_cvttps_epi32(rt);
314 #ifdef __XOP__
315 vfeps = _mm_frcz_ps(rt);
316 #else
317 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
318 #endif
319 twovfeps = _mm_add_ps(vfeps,vfeps);
320 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
322 /* CUBIC SPLINE TABLE DISPERSION */
323 vfitab = _mm_add_epi32(vfitab,ifour);
324 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
325 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
326 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
327 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
328 _MM_TRANSPOSE4_PS(Y,F,G,H);
329 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
330 VV = _mm_macc_ps(vfeps,Fp,Y);
331 vvdw6 = _mm_mul_ps(c6_00,VV);
332 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
333 fvdw6 = _mm_mul_ps(c6_00,FF);
335 /* CUBIC SPLINE TABLE REPULSION */
336 vfitab = _mm_add_epi32(vfitab,ifour);
337 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
338 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
339 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
340 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
341 _MM_TRANSPOSE4_PS(Y,F,G,H);
342 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
343 VV = _mm_macc_ps(vfeps,Fp,Y);
344 vvdw12 = _mm_mul_ps(c12_00,VV);
345 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
346 fvdw12 = _mm_mul_ps(c12_00,FF);
347 vvdw = _mm_add_ps(vvdw12,vvdw6);
348 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
353 fscal = fvdw;
355 /* Update vectorial force */
356 fix0 = _mm_macc_ps(dx00,fscal,fix0);
357 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
358 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
360 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
361 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
362 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
364 /**************************
365 * CALCULATE INTERACTIONS *
366 **************************/
368 r11 = _mm_mul_ps(rsq11,rinv11);
370 /* Calculate table index by multiplying r with table scale and truncate to integer */
371 rt = _mm_mul_ps(r11,vftabscale);
372 vfitab = _mm_cvttps_epi32(rt);
373 #ifdef __XOP__
374 vfeps = _mm_frcz_ps(rt);
375 #else
376 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
377 #endif
378 twovfeps = _mm_add_ps(vfeps,vfeps);
379 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
381 /* CUBIC SPLINE TABLE ELECTROSTATICS */
382 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
383 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
384 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
385 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
386 _MM_TRANSPOSE4_PS(Y,F,G,H);
387 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
388 VV = _mm_macc_ps(vfeps,Fp,Y);
389 velec = _mm_mul_ps(qq11,VV);
390 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
391 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm_add_ps(velecsum,velec);
396 fscal = felec;
398 /* Update vectorial force */
399 fix1 = _mm_macc_ps(dx11,fscal,fix1);
400 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
401 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
403 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
404 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
405 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 r12 = _mm_mul_ps(rsq12,rinv12);
413 /* Calculate table index by multiplying r with table scale and truncate to integer */
414 rt = _mm_mul_ps(r12,vftabscale);
415 vfitab = _mm_cvttps_epi32(rt);
416 #ifdef __XOP__
417 vfeps = _mm_frcz_ps(rt);
418 #else
419 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
420 #endif
421 twovfeps = _mm_add_ps(vfeps,vfeps);
422 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
426 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
427 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
428 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
429 _MM_TRANSPOSE4_PS(Y,F,G,H);
430 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
431 VV = _mm_macc_ps(vfeps,Fp,Y);
432 velec = _mm_mul_ps(qq12,VV);
433 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
434 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
436 /* Update potential sum for this i atom from the interaction with this j atom. */
437 velecsum = _mm_add_ps(velecsum,velec);
439 fscal = felec;
441 /* Update vectorial force */
442 fix1 = _mm_macc_ps(dx12,fscal,fix1);
443 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
444 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
446 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
447 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
448 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 r13 = _mm_mul_ps(rsq13,rinv13);
456 /* Calculate table index by multiplying r with table scale and truncate to integer */
457 rt = _mm_mul_ps(r13,vftabscale);
458 vfitab = _mm_cvttps_epi32(rt);
459 #ifdef __XOP__
460 vfeps = _mm_frcz_ps(rt);
461 #else
462 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
463 #endif
464 twovfeps = _mm_add_ps(vfeps,vfeps);
465 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
467 /* CUBIC SPLINE TABLE ELECTROSTATICS */
468 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
469 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
470 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
471 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
472 _MM_TRANSPOSE4_PS(Y,F,G,H);
473 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
474 VV = _mm_macc_ps(vfeps,Fp,Y);
475 velec = _mm_mul_ps(qq13,VV);
476 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
477 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
479 /* Update potential sum for this i atom from the interaction with this j atom. */
480 velecsum = _mm_add_ps(velecsum,velec);
482 fscal = felec;
484 /* Update vectorial force */
485 fix1 = _mm_macc_ps(dx13,fscal,fix1);
486 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
487 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
489 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
490 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
491 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 r21 = _mm_mul_ps(rsq21,rinv21);
499 /* Calculate table index by multiplying r with table scale and truncate to integer */
500 rt = _mm_mul_ps(r21,vftabscale);
501 vfitab = _mm_cvttps_epi32(rt);
502 #ifdef __XOP__
503 vfeps = _mm_frcz_ps(rt);
504 #else
505 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
506 #endif
507 twovfeps = _mm_add_ps(vfeps,vfeps);
508 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
510 /* CUBIC SPLINE TABLE ELECTROSTATICS */
511 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
512 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
513 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
514 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
515 _MM_TRANSPOSE4_PS(Y,F,G,H);
516 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
517 VV = _mm_macc_ps(vfeps,Fp,Y);
518 velec = _mm_mul_ps(qq21,VV);
519 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
520 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum = _mm_add_ps(velecsum,velec);
525 fscal = felec;
527 /* Update vectorial force */
528 fix2 = _mm_macc_ps(dx21,fscal,fix2);
529 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
530 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
532 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
533 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
534 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
536 /**************************
537 * CALCULATE INTERACTIONS *
538 **************************/
540 r22 = _mm_mul_ps(rsq22,rinv22);
542 /* Calculate table index by multiplying r with table scale and truncate to integer */
543 rt = _mm_mul_ps(r22,vftabscale);
544 vfitab = _mm_cvttps_epi32(rt);
545 #ifdef __XOP__
546 vfeps = _mm_frcz_ps(rt);
547 #else
548 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
549 #endif
550 twovfeps = _mm_add_ps(vfeps,vfeps);
551 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
553 /* CUBIC SPLINE TABLE ELECTROSTATICS */
554 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
555 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
556 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
557 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
558 _MM_TRANSPOSE4_PS(Y,F,G,H);
559 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
560 VV = _mm_macc_ps(vfeps,Fp,Y);
561 velec = _mm_mul_ps(qq22,VV);
562 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
563 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum = _mm_add_ps(velecsum,velec);
568 fscal = felec;
570 /* Update vectorial force */
571 fix2 = _mm_macc_ps(dx22,fscal,fix2);
572 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
573 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
575 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
576 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
577 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r23 = _mm_mul_ps(rsq23,rinv23);
585 /* Calculate table index by multiplying r with table scale and truncate to integer */
586 rt = _mm_mul_ps(r23,vftabscale);
587 vfitab = _mm_cvttps_epi32(rt);
588 #ifdef __XOP__
589 vfeps = _mm_frcz_ps(rt);
590 #else
591 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
592 #endif
593 twovfeps = _mm_add_ps(vfeps,vfeps);
594 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
596 /* CUBIC SPLINE TABLE ELECTROSTATICS */
597 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
598 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
599 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
600 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
601 _MM_TRANSPOSE4_PS(Y,F,G,H);
602 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
603 VV = _mm_macc_ps(vfeps,Fp,Y);
604 velec = _mm_mul_ps(qq23,VV);
605 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
606 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velecsum = _mm_add_ps(velecsum,velec);
611 fscal = felec;
613 /* Update vectorial force */
614 fix2 = _mm_macc_ps(dx23,fscal,fix2);
615 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
616 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
618 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
619 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
620 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 r31 = _mm_mul_ps(rsq31,rinv31);
628 /* Calculate table index by multiplying r with table scale and truncate to integer */
629 rt = _mm_mul_ps(r31,vftabscale);
630 vfitab = _mm_cvttps_epi32(rt);
631 #ifdef __XOP__
632 vfeps = _mm_frcz_ps(rt);
633 #else
634 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
635 #endif
636 twovfeps = _mm_add_ps(vfeps,vfeps);
637 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
639 /* CUBIC SPLINE TABLE ELECTROSTATICS */
640 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
641 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
642 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
643 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
644 _MM_TRANSPOSE4_PS(Y,F,G,H);
645 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
646 VV = _mm_macc_ps(vfeps,Fp,Y);
647 velec = _mm_mul_ps(qq31,VV);
648 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
649 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm_add_ps(velecsum,velec);
654 fscal = felec;
656 /* Update vectorial force */
657 fix3 = _mm_macc_ps(dx31,fscal,fix3);
658 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
659 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
661 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
662 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
663 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
665 /**************************
666 * CALCULATE INTERACTIONS *
667 **************************/
669 r32 = _mm_mul_ps(rsq32,rinv32);
671 /* Calculate table index by multiplying r with table scale and truncate to integer */
672 rt = _mm_mul_ps(r32,vftabscale);
673 vfitab = _mm_cvttps_epi32(rt);
674 #ifdef __XOP__
675 vfeps = _mm_frcz_ps(rt);
676 #else
677 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
678 #endif
679 twovfeps = _mm_add_ps(vfeps,vfeps);
680 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
682 /* CUBIC SPLINE TABLE ELECTROSTATICS */
683 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
684 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
685 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
686 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
687 _MM_TRANSPOSE4_PS(Y,F,G,H);
688 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
689 VV = _mm_macc_ps(vfeps,Fp,Y);
690 velec = _mm_mul_ps(qq32,VV);
691 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
692 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum = _mm_add_ps(velecsum,velec);
697 fscal = felec;
699 /* Update vectorial force */
700 fix3 = _mm_macc_ps(dx32,fscal,fix3);
701 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
702 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
704 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
705 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
706 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
708 /**************************
709 * CALCULATE INTERACTIONS *
710 **************************/
712 r33 = _mm_mul_ps(rsq33,rinv33);
714 /* Calculate table index by multiplying r with table scale and truncate to integer */
715 rt = _mm_mul_ps(r33,vftabscale);
716 vfitab = _mm_cvttps_epi32(rt);
717 #ifdef __XOP__
718 vfeps = _mm_frcz_ps(rt);
719 #else
720 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
721 #endif
722 twovfeps = _mm_add_ps(vfeps,vfeps);
723 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
725 /* CUBIC SPLINE TABLE ELECTROSTATICS */
726 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
727 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
728 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
729 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
730 _MM_TRANSPOSE4_PS(Y,F,G,H);
731 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
732 VV = _mm_macc_ps(vfeps,Fp,Y);
733 velec = _mm_mul_ps(qq33,VV);
734 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
735 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
737 /* Update potential sum for this i atom from the interaction with this j atom. */
738 velecsum = _mm_add_ps(velecsum,velec);
740 fscal = felec;
742 /* Update vectorial force */
743 fix3 = _mm_macc_ps(dx33,fscal,fix3);
744 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
745 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
747 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
748 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
749 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
751 fjptrA = f+j_coord_offsetA;
752 fjptrB = f+j_coord_offsetB;
753 fjptrC = f+j_coord_offsetC;
754 fjptrD = f+j_coord_offsetD;
756 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
757 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
758 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
760 /* Inner loop uses 476 flops */
763 if(jidx<j_index_end)
766 /* Get j neighbor index, and coordinate index */
767 jnrlistA = jjnr[jidx];
768 jnrlistB = jjnr[jidx+1];
769 jnrlistC = jjnr[jidx+2];
770 jnrlistD = jjnr[jidx+3];
771 /* Sign of each element will be negative for non-real atoms.
772 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
773 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
775 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
776 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
777 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
778 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
779 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
780 j_coord_offsetA = DIM*jnrA;
781 j_coord_offsetB = DIM*jnrB;
782 j_coord_offsetC = DIM*jnrC;
783 j_coord_offsetD = DIM*jnrD;
785 /* load j atom coordinates */
786 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
787 x+j_coord_offsetC,x+j_coord_offsetD,
788 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
789 &jy2,&jz2,&jx3,&jy3,&jz3);
791 /* Calculate displacement vector */
792 dx00 = _mm_sub_ps(ix0,jx0);
793 dy00 = _mm_sub_ps(iy0,jy0);
794 dz00 = _mm_sub_ps(iz0,jz0);
795 dx11 = _mm_sub_ps(ix1,jx1);
796 dy11 = _mm_sub_ps(iy1,jy1);
797 dz11 = _mm_sub_ps(iz1,jz1);
798 dx12 = _mm_sub_ps(ix1,jx2);
799 dy12 = _mm_sub_ps(iy1,jy2);
800 dz12 = _mm_sub_ps(iz1,jz2);
801 dx13 = _mm_sub_ps(ix1,jx3);
802 dy13 = _mm_sub_ps(iy1,jy3);
803 dz13 = _mm_sub_ps(iz1,jz3);
804 dx21 = _mm_sub_ps(ix2,jx1);
805 dy21 = _mm_sub_ps(iy2,jy1);
806 dz21 = _mm_sub_ps(iz2,jz1);
807 dx22 = _mm_sub_ps(ix2,jx2);
808 dy22 = _mm_sub_ps(iy2,jy2);
809 dz22 = _mm_sub_ps(iz2,jz2);
810 dx23 = _mm_sub_ps(ix2,jx3);
811 dy23 = _mm_sub_ps(iy2,jy3);
812 dz23 = _mm_sub_ps(iz2,jz3);
813 dx31 = _mm_sub_ps(ix3,jx1);
814 dy31 = _mm_sub_ps(iy3,jy1);
815 dz31 = _mm_sub_ps(iz3,jz1);
816 dx32 = _mm_sub_ps(ix3,jx2);
817 dy32 = _mm_sub_ps(iy3,jy2);
818 dz32 = _mm_sub_ps(iz3,jz2);
819 dx33 = _mm_sub_ps(ix3,jx3);
820 dy33 = _mm_sub_ps(iy3,jy3);
821 dz33 = _mm_sub_ps(iz3,jz3);
823 /* Calculate squared distance and things based on it */
824 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
825 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
826 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
827 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
828 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
829 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
830 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
831 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
832 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
833 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
835 rinv00 = avx128fma_invsqrt_f(rsq00);
836 rinv11 = avx128fma_invsqrt_f(rsq11);
837 rinv12 = avx128fma_invsqrt_f(rsq12);
838 rinv13 = avx128fma_invsqrt_f(rsq13);
839 rinv21 = avx128fma_invsqrt_f(rsq21);
840 rinv22 = avx128fma_invsqrt_f(rsq22);
841 rinv23 = avx128fma_invsqrt_f(rsq23);
842 rinv31 = avx128fma_invsqrt_f(rsq31);
843 rinv32 = avx128fma_invsqrt_f(rsq32);
844 rinv33 = avx128fma_invsqrt_f(rsq33);
846 fjx0 = _mm_setzero_ps();
847 fjy0 = _mm_setzero_ps();
848 fjz0 = _mm_setzero_ps();
849 fjx1 = _mm_setzero_ps();
850 fjy1 = _mm_setzero_ps();
851 fjz1 = _mm_setzero_ps();
852 fjx2 = _mm_setzero_ps();
853 fjy2 = _mm_setzero_ps();
854 fjz2 = _mm_setzero_ps();
855 fjx3 = _mm_setzero_ps();
856 fjy3 = _mm_setzero_ps();
857 fjz3 = _mm_setzero_ps();
859 /**************************
860 * CALCULATE INTERACTIONS *
861 **************************/
863 r00 = _mm_mul_ps(rsq00,rinv00);
864 r00 = _mm_andnot_ps(dummy_mask,r00);
866 /* Calculate table index by multiplying r with table scale and truncate to integer */
867 rt = _mm_mul_ps(r00,vftabscale);
868 vfitab = _mm_cvttps_epi32(rt);
869 #ifdef __XOP__
870 vfeps = _mm_frcz_ps(rt);
871 #else
872 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
873 #endif
874 twovfeps = _mm_add_ps(vfeps,vfeps);
875 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
877 /* CUBIC SPLINE TABLE DISPERSION */
878 vfitab = _mm_add_epi32(vfitab,ifour);
879 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
880 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
881 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
882 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
883 _MM_TRANSPOSE4_PS(Y,F,G,H);
884 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
885 VV = _mm_macc_ps(vfeps,Fp,Y);
886 vvdw6 = _mm_mul_ps(c6_00,VV);
887 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
888 fvdw6 = _mm_mul_ps(c6_00,FF);
890 /* CUBIC SPLINE TABLE REPULSION */
891 vfitab = _mm_add_epi32(vfitab,ifour);
892 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
893 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
894 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
895 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
896 _MM_TRANSPOSE4_PS(Y,F,G,H);
897 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
898 VV = _mm_macc_ps(vfeps,Fp,Y);
899 vvdw12 = _mm_mul_ps(c12_00,VV);
900 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
901 fvdw12 = _mm_mul_ps(c12_00,FF);
902 vvdw = _mm_add_ps(vvdw12,vvdw6);
903 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
905 /* Update potential sum for this i atom from the interaction with this j atom. */
906 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
907 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
909 fscal = fvdw;
911 fscal = _mm_andnot_ps(dummy_mask,fscal);
913 /* Update vectorial force */
914 fix0 = _mm_macc_ps(dx00,fscal,fix0);
915 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
916 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
918 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
919 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
920 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 r11 = _mm_mul_ps(rsq11,rinv11);
927 r11 = _mm_andnot_ps(dummy_mask,r11);
929 /* Calculate table index by multiplying r with table scale and truncate to integer */
930 rt = _mm_mul_ps(r11,vftabscale);
931 vfitab = _mm_cvttps_epi32(rt);
932 #ifdef __XOP__
933 vfeps = _mm_frcz_ps(rt);
934 #else
935 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
936 #endif
937 twovfeps = _mm_add_ps(vfeps,vfeps);
938 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
940 /* CUBIC SPLINE TABLE ELECTROSTATICS */
941 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
942 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
943 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
944 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
945 _MM_TRANSPOSE4_PS(Y,F,G,H);
946 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
947 VV = _mm_macc_ps(vfeps,Fp,Y);
948 velec = _mm_mul_ps(qq11,VV);
949 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
950 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec = _mm_andnot_ps(dummy_mask,velec);
954 velecsum = _mm_add_ps(velecsum,velec);
956 fscal = felec;
958 fscal = _mm_andnot_ps(dummy_mask,fscal);
960 /* Update vectorial force */
961 fix1 = _mm_macc_ps(dx11,fscal,fix1);
962 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
963 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
965 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
966 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
967 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 r12 = _mm_mul_ps(rsq12,rinv12);
974 r12 = _mm_andnot_ps(dummy_mask,r12);
976 /* Calculate table index by multiplying r with table scale and truncate to integer */
977 rt = _mm_mul_ps(r12,vftabscale);
978 vfitab = _mm_cvttps_epi32(rt);
979 #ifdef __XOP__
980 vfeps = _mm_frcz_ps(rt);
981 #else
982 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
983 #endif
984 twovfeps = _mm_add_ps(vfeps,vfeps);
985 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
987 /* CUBIC SPLINE TABLE ELECTROSTATICS */
988 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
989 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
990 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
991 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
992 _MM_TRANSPOSE4_PS(Y,F,G,H);
993 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
994 VV = _mm_macc_ps(vfeps,Fp,Y);
995 velec = _mm_mul_ps(qq12,VV);
996 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
997 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
999 /* Update potential sum for this i atom from the interaction with this j atom. */
1000 velec = _mm_andnot_ps(dummy_mask,velec);
1001 velecsum = _mm_add_ps(velecsum,velec);
1003 fscal = felec;
1005 fscal = _mm_andnot_ps(dummy_mask,fscal);
1007 /* Update vectorial force */
1008 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1009 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1010 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1012 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1013 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1014 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1016 /**************************
1017 * CALCULATE INTERACTIONS *
1018 **************************/
1020 r13 = _mm_mul_ps(rsq13,rinv13);
1021 r13 = _mm_andnot_ps(dummy_mask,r13);
1023 /* Calculate table index by multiplying r with table scale and truncate to integer */
1024 rt = _mm_mul_ps(r13,vftabscale);
1025 vfitab = _mm_cvttps_epi32(rt);
1026 #ifdef __XOP__
1027 vfeps = _mm_frcz_ps(rt);
1028 #else
1029 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1030 #endif
1031 twovfeps = _mm_add_ps(vfeps,vfeps);
1032 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1034 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1035 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1036 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1037 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1038 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1039 _MM_TRANSPOSE4_PS(Y,F,G,H);
1040 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1041 VV = _mm_macc_ps(vfeps,Fp,Y);
1042 velec = _mm_mul_ps(qq13,VV);
1043 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1044 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1046 /* Update potential sum for this i atom from the interaction with this j atom. */
1047 velec = _mm_andnot_ps(dummy_mask,velec);
1048 velecsum = _mm_add_ps(velecsum,velec);
1050 fscal = felec;
1052 fscal = _mm_andnot_ps(dummy_mask,fscal);
1054 /* Update vectorial force */
1055 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1056 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1057 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1059 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1060 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1061 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1063 /**************************
1064 * CALCULATE INTERACTIONS *
1065 **************************/
1067 r21 = _mm_mul_ps(rsq21,rinv21);
1068 r21 = _mm_andnot_ps(dummy_mask,r21);
1070 /* Calculate table index by multiplying r with table scale and truncate to integer */
1071 rt = _mm_mul_ps(r21,vftabscale);
1072 vfitab = _mm_cvttps_epi32(rt);
1073 #ifdef __XOP__
1074 vfeps = _mm_frcz_ps(rt);
1075 #else
1076 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1077 #endif
1078 twovfeps = _mm_add_ps(vfeps,vfeps);
1079 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1081 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1082 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1083 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1084 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1085 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1086 _MM_TRANSPOSE4_PS(Y,F,G,H);
1087 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1088 VV = _mm_macc_ps(vfeps,Fp,Y);
1089 velec = _mm_mul_ps(qq21,VV);
1090 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1091 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1093 /* Update potential sum for this i atom from the interaction with this j atom. */
1094 velec = _mm_andnot_ps(dummy_mask,velec);
1095 velecsum = _mm_add_ps(velecsum,velec);
1097 fscal = felec;
1099 fscal = _mm_andnot_ps(dummy_mask,fscal);
1101 /* Update vectorial force */
1102 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1103 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1104 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1106 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1107 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1108 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r22 = _mm_mul_ps(rsq22,rinv22);
1115 r22 = _mm_andnot_ps(dummy_mask,r22);
1117 /* Calculate table index by multiplying r with table scale and truncate to integer */
1118 rt = _mm_mul_ps(r22,vftabscale);
1119 vfitab = _mm_cvttps_epi32(rt);
1120 #ifdef __XOP__
1121 vfeps = _mm_frcz_ps(rt);
1122 #else
1123 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1124 #endif
1125 twovfeps = _mm_add_ps(vfeps,vfeps);
1126 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1129 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1130 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1131 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1132 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1133 _MM_TRANSPOSE4_PS(Y,F,G,H);
1134 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1135 VV = _mm_macc_ps(vfeps,Fp,Y);
1136 velec = _mm_mul_ps(qq22,VV);
1137 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1138 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec = _mm_andnot_ps(dummy_mask,velec);
1142 velecsum = _mm_add_ps(velecsum,velec);
1144 fscal = felec;
1146 fscal = _mm_andnot_ps(dummy_mask,fscal);
1148 /* Update vectorial force */
1149 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1150 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1151 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1153 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1154 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1155 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1157 /**************************
1158 * CALCULATE INTERACTIONS *
1159 **************************/
1161 r23 = _mm_mul_ps(rsq23,rinv23);
1162 r23 = _mm_andnot_ps(dummy_mask,r23);
1164 /* Calculate table index by multiplying r with table scale and truncate to integer */
1165 rt = _mm_mul_ps(r23,vftabscale);
1166 vfitab = _mm_cvttps_epi32(rt);
1167 #ifdef __XOP__
1168 vfeps = _mm_frcz_ps(rt);
1169 #else
1170 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1171 #endif
1172 twovfeps = _mm_add_ps(vfeps,vfeps);
1173 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1175 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1176 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1177 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1178 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1179 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1180 _MM_TRANSPOSE4_PS(Y,F,G,H);
1181 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1182 VV = _mm_macc_ps(vfeps,Fp,Y);
1183 velec = _mm_mul_ps(qq23,VV);
1184 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1185 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1187 /* Update potential sum for this i atom from the interaction with this j atom. */
1188 velec = _mm_andnot_ps(dummy_mask,velec);
1189 velecsum = _mm_add_ps(velecsum,velec);
1191 fscal = felec;
1193 fscal = _mm_andnot_ps(dummy_mask,fscal);
1195 /* Update vectorial force */
1196 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1197 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1198 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1200 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1201 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1202 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 r31 = _mm_mul_ps(rsq31,rinv31);
1209 r31 = _mm_andnot_ps(dummy_mask,r31);
1211 /* Calculate table index by multiplying r with table scale and truncate to integer */
1212 rt = _mm_mul_ps(r31,vftabscale);
1213 vfitab = _mm_cvttps_epi32(rt);
1214 #ifdef __XOP__
1215 vfeps = _mm_frcz_ps(rt);
1216 #else
1217 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1218 #endif
1219 twovfeps = _mm_add_ps(vfeps,vfeps);
1220 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1222 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1223 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1224 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1225 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1226 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1227 _MM_TRANSPOSE4_PS(Y,F,G,H);
1228 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1229 VV = _mm_macc_ps(vfeps,Fp,Y);
1230 velec = _mm_mul_ps(qq31,VV);
1231 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1232 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1234 /* Update potential sum for this i atom from the interaction with this j atom. */
1235 velec = _mm_andnot_ps(dummy_mask,velec);
1236 velecsum = _mm_add_ps(velecsum,velec);
1238 fscal = felec;
1240 fscal = _mm_andnot_ps(dummy_mask,fscal);
1242 /* Update vectorial force */
1243 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1244 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1245 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1247 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1248 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1249 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1251 /**************************
1252 * CALCULATE INTERACTIONS *
1253 **************************/
1255 r32 = _mm_mul_ps(rsq32,rinv32);
1256 r32 = _mm_andnot_ps(dummy_mask,r32);
1258 /* Calculate table index by multiplying r with table scale and truncate to integer */
1259 rt = _mm_mul_ps(r32,vftabscale);
1260 vfitab = _mm_cvttps_epi32(rt);
1261 #ifdef __XOP__
1262 vfeps = _mm_frcz_ps(rt);
1263 #else
1264 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1265 #endif
1266 twovfeps = _mm_add_ps(vfeps,vfeps);
1267 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1269 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1270 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1271 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1272 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1273 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1274 _MM_TRANSPOSE4_PS(Y,F,G,H);
1275 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1276 VV = _mm_macc_ps(vfeps,Fp,Y);
1277 velec = _mm_mul_ps(qq32,VV);
1278 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1279 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1281 /* Update potential sum for this i atom from the interaction with this j atom. */
1282 velec = _mm_andnot_ps(dummy_mask,velec);
1283 velecsum = _mm_add_ps(velecsum,velec);
1285 fscal = felec;
1287 fscal = _mm_andnot_ps(dummy_mask,fscal);
1289 /* Update vectorial force */
1290 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1291 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1292 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1294 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1295 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1296 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1298 /**************************
1299 * CALCULATE INTERACTIONS *
1300 **************************/
1302 r33 = _mm_mul_ps(rsq33,rinv33);
1303 r33 = _mm_andnot_ps(dummy_mask,r33);
1305 /* Calculate table index by multiplying r with table scale and truncate to integer */
1306 rt = _mm_mul_ps(r33,vftabscale);
1307 vfitab = _mm_cvttps_epi32(rt);
1308 #ifdef __XOP__
1309 vfeps = _mm_frcz_ps(rt);
1310 #else
1311 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1312 #endif
1313 twovfeps = _mm_add_ps(vfeps,vfeps);
1314 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1316 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1317 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1318 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1319 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1320 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1321 _MM_TRANSPOSE4_PS(Y,F,G,H);
1322 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1323 VV = _mm_macc_ps(vfeps,Fp,Y);
1324 velec = _mm_mul_ps(qq33,VV);
1325 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1326 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec = _mm_andnot_ps(dummy_mask,velec);
1330 velecsum = _mm_add_ps(velecsum,velec);
1332 fscal = felec;
1334 fscal = _mm_andnot_ps(dummy_mask,fscal);
1336 /* Update vectorial force */
1337 fix3 = _mm_macc_ps(dx33,fscal,fix3);
1338 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
1339 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
1341 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
1342 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
1343 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
1345 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1346 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1347 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1348 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1350 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1351 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1352 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1354 /* Inner loop uses 486 flops */
1357 /* End of innermost loop */
1359 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1360 f+i_coord_offset,fshift+i_shift_offset);
1362 ggid = gid[iidx];
1363 /* Update potential energies */
1364 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1365 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1367 /* Increment number of inner iterations */
1368 inneriter += j_index_end - j_index_start;
1370 /* Outer loop uses 26 flops */
1373 /* Increment number of outer iterations */
1374 outeriter += nri;
1376 /* Update outer/inner flops */
1378 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*486);
1380 /*
1381 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1382 * Electrostatics interaction: CubicSplineTable
1383 * VdW interaction: CubicSplineTable
1384 * Geometry: Water4-Water4
1385 * Calculate force/pot: Force
1386 */
1387 void
1388 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1389 (t_nblist * gmx_restrict nlist,
1390 rvec * gmx_restrict xx,
1391 rvec * gmx_restrict ff,
1392 struct t_forcerec * gmx_restrict fr,
1393 t_mdatoms * gmx_restrict mdatoms,
1394 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1395 t_nrnb * gmx_restrict nrnb)
1397 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1398 * just 0 for non-waters.
1399 * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1400 * jnr indices corresponding to data put in the four positions in the SIMD register.
1402 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1403 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1404 int jnrA,jnrB,jnrC,jnrD;
1405 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1406 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1407 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1408 real rcutoff_scalar;
1409 real *shiftvec,*fshift,*x,*f;
1410 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1411 real scratch[4*DIM];
1412 __m128 fscal,rcutoff,rcutoff2,jidxall;
1413 int vdwioffset0;
1414 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1415 int vdwioffset1;
1416 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1417 int vdwioffset2;
1418 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1419 int vdwioffset3;
1420 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1421 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1422 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1423 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1424 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1425 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1426 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1427 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1428 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1429 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1430 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1431 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1432 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1433 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1434 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1435 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1436 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1437 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1438 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1439 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1440 real *charge;
1441 int nvdwtype;
1442 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1443 int *vdwtype;
1444 real *vdwparam;
1445 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1446 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1447 __m128i vfitab;
1448 __m128i ifour = _mm_set1_epi32(4);
1449 __m128 rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1450 real *vftab;
1451 __m128 dummy_mask,cutoff_mask;
1452 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1453 __m128 one = _mm_set1_ps(1.0);
1454 __m128 two = _mm_set1_ps(2.0);
1455 x = xx[0];
1456 f = ff[0];
1458 nri = nlist->nri;
1459 iinr = nlist->iinr;
1460 jindex = nlist->jindex;
1461 jjnr = nlist->jjnr;
1462 shiftidx = nlist->shift;
1463 gid = nlist->gid;
1464 shiftvec = fr->shift_vec[0];
1465 fshift = fr->fshift[0];
1466 facel = _mm_set1_ps(fr->ic->epsfac);
1467 charge = mdatoms->chargeA;
1468 nvdwtype = fr->ntype;
1469 vdwparam = fr->nbfp;
1470 vdwtype = mdatoms->typeA;
1472 vftab = kernel_data->table_elec_vdw->data;
1473 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1475 /* Setup water-specific parameters */
1476 inr = nlist->iinr[0];
1477 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1478 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1479 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1480 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1482 jq1 = _mm_set1_ps(charge[inr+1]);
1483 jq2 = _mm_set1_ps(charge[inr+2]);
1484 jq3 = _mm_set1_ps(charge[inr+3]);
1485 vdwjidx0A = 2*vdwtype[inr+0];
1486 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1487 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1488 qq11 = _mm_mul_ps(iq1,jq1);
1489 qq12 = _mm_mul_ps(iq1,jq2);
1490 qq13 = _mm_mul_ps(iq1,jq3);
1491 qq21 = _mm_mul_ps(iq2,jq1);
1492 qq22 = _mm_mul_ps(iq2,jq2);
1493 qq23 = _mm_mul_ps(iq2,jq3);
1494 qq31 = _mm_mul_ps(iq3,jq1);
1495 qq32 = _mm_mul_ps(iq3,jq2);
1496 qq33 = _mm_mul_ps(iq3,jq3);
1498 /* Avoid stupid compiler warnings */
1499 jnrA = jnrB = jnrC = jnrD = 0;
1500 j_coord_offsetA = 0;
1501 j_coord_offsetB = 0;
1502 j_coord_offsetC = 0;
1503 j_coord_offsetD = 0;
1505 outeriter = 0;
1506 inneriter = 0;
1508 for(iidx=0;iidx<4*DIM;iidx++)
1510 scratch[iidx] = 0.0;
1513 /* Start outer loop over neighborlists */
1514 for(iidx=0; iidx<nri; iidx++)
1516 /* Load shift vector for this list */
1517 i_shift_offset = DIM*shiftidx[iidx];
1519 /* Load limits for loop over neighbors */
1520 j_index_start = jindex[iidx];
1521 j_index_end = jindex[iidx+1];
1523 /* Get outer coordinate index */
1524 inr = iinr[iidx];
1525 i_coord_offset = DIM*inr;
1527 /* Load i particle coords and add shift vector */
1528 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1529 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1531 fix0 = _mm_setzero_ps();
1532 fiy0 = _mm_setzero_ps();
1533 fiz0 = _mm_setzero_ps();
1534 fix1 = _mm_setzero_ps();
1535 fiy1 = _mm_setzero_ps();
1536 fiz1 = _mm_setzero_ps();
1537 fix2 = _mm_setzero_ps();
1538 fiy2 = _mm_setzero_ps();
1539 fiz2 = _mm_setzero_ps();
1540 fix3 = _mm_setzero_ps();
1541 fiy3 = _mm_setzero_ps();
1542 fiz3 = _mm_setzero_ps();
1544 /* Start inner kernel loop */
1545 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1548 /* Get j neighbor index, and coordinate index */
1549 jnrA = jjnr[jidx];
1550 jnrB = jjnr[jidx+1];
1551 jnrC = jjnr[jidx+2];
1552 jnrD = jjnr[jidx+3];
1553 j_coord_offsetA = DIM*jnrA;
1554 j_coord_offsetB = DIM*jnrB;
1555 j_coord_offsetC = DIM*jnrC;
1556 j_coord_offsetD = DIM*jnrD;
1558 /* load j atom coordinates */
1559 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1560 x+j_coord_offsetC,x+j_coord_offsetD,
1561 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1562 &jy2,&jz2,&jx3,&jy3,&jz3);
1564 /* Calculate displacement vector */
1565 dx00 = _mm_sub_ps(ix0,jx0);
1566 dy00 = _mm_sub_ps(iy0,jy0);
1567 dz00 = _mm_sub_ps(iz0,jz0);
1568 dx11 = _mm_sub_ps(ix1,jx1);
1569 dy11 = _mm_sub_ps(iy1,jy1);
1570 dz11 = _mm_sub_ps(iz1,jz1);
1571 dx12 = _mm_sub_ps(ix1,jx2);
1572 dy12 = _mm_sub_ps(iy1,jy2);
1573 dz12 = _mm_sub_ps(iz1,jz2);
1574 dx13 = _mm_sub_ps(ix1,jx3);
1575 dy13 = _mm_sub_ps(iy1,jy3);
1576 dz13 = _mm_sub_ps(iz1,jz3);
1577 dx21 = _mm_sub_ps(ix2,jx1);
1578 dy21 = _mm_sub_ps(iy2,jy1);
1579 dz21 = _mm_sub_ps(iz2,jz1);
1580 dx22 = _mm_sub_ps(ix2,jx2);
1581 dy22 = _mm_sub_ps(iy2,jy2);
1582 dz22 = _mm_sub_ps(iz2,jz2);
1583 dx23 = _mm_sub_ps(ix2,jx3);
1584 dy23 = _mm_sub_ps(iy2,jy3);
1585 dz23 = _mm_sub_ps(iz2,jz3);
1586 dx31 = _mm_sub_ps(ix3,jx1);
1587 dy31 = _mm_sub_ps(iy3,jy1);
1588 dz31 = _mm_sub_ps(iz3,jz1);
1589 dx32 = _mm_sub_ps(ix3,jx2);
1590 dy32 = _mm_sub_ps(iy3,jy2);
1591 dz32 = _mm_sub_ps(iz3,jz2);
1592 dx33 = _mm_sub_ps(ix3,jx3);
1593 dy33 = _mm_sub_ps(iy3,jy3);
1594 dz33 = _mm_sub_ps(iz3,jz3);
1596 /* Calculate squared distance and things based on it */
1597 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1598 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1599 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1600 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1601 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1602 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1603 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1604 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1605 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1606 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1608 rinv00 = avx128fma_invsqrt_f(rsq00);
1609 rinv11 = avx128fma_invsqrt_f(rsq11);
1610 rinv12 = avx128fma_invsqrt_f(rsq12);
1611 rinv13 = avx128fma_invsqrt_f(rsq13);
1612 rinv21 = avx128fma_invsqrt_f(rsq21);
1613 rinv22 = avx128fma_invsqrt_f(rsq22);
1614 rinv23 = avx128fma_invsqrt_f(rsq23);
1615 rinv31 = avx128fma_invsqrt_f(rsq31);
1616 rinv32 = avx128fma_invsqrt_f(rsq32);
1617 rinv33 = avx128fma_invsqrt_f(rsq33);
1619 fjx0 = _mm_setzero_ps();
1620 fjy0 = _mm_setzero_ps();
1621 fjz0 = _mm_setzero_ps();
1622 fjx1 = _mm_setzero_ps();
1623 fjy1 = _mm_setzero_ps();
1624 fjz1 = _mm_setzero_ps();
1625 fjx2 = _mm_setzero_ps();
1626 fjy2 = _mm_setzero_ps();
1627 fjz2 = _mm_setzero_ps();
1628 fjx3 = _mm_setzero_ps();
1629 fjy3 = _mm_setzero_ps();
1630 fjz3 = _mm_setzero_ps();
1632 /**************************
1633 * CALCULATE INTERACTIONS *
1634 **************************/
1636 r00 = _mm_mul_ps(rsq00,rinv00);
1638 /* Calculate table index by multiplying r with table scale and truncate to integer */
1639 rt = _mm_mul_ps(r00,vftabscale);
1640 vfitab = _mm_cvttps_epi32(rt);
1641 #ifdef __XOP__
1642 vfeps = _mm_frcz_ps(rt);
1643 #else
1644 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1645 #endif
1646 twovfeps = _mm_add_ps(vfeps,vfeps);
1647 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1649 /* CUBIC SPLINE TABLE DISPERSION */
1650 vfitab = _mm_add_epi32(vfitab,ifour);
1651 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1652 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1653 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1654 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1655 _MM_TRANSPOSE4_PS(Y,F,G,H);
1656 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1657 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1658 fvdw6 = _mm_mul_ps(c6_00,FF);
1660 /* CUBIC SPLINE TABLE REPULSION */
1661 vfitab = _mm_add_epi32(vfitab,ifour);
1662 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1663 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1664 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1665 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1666 _MM_TRANSPOSE4_PS(Y,F,G,H);
1667 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1668 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1669 fvdw12 = _mm_mul_ps(c12_00,FF);
1670 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1672 fscal = fvdw;
1674 /* Update vectorial force */
1675 fix0 = _mm_macc_ps(dx00,fscal,fix0);
1676 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
1677 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
1679 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
1680 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
1681 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
1683 /**************************
1684 * CALCULATE INTERACTIONS *
1685 **************************/
1687 r11 = _mm_mul_ps(rsq11,rinv11);
1689 /* Calculate table index by multiplying r with table scale and truncate to integer */
1690 rt = _mm_mul_ps(r11,vftabscale);
1691 vfitab = _mm_cvttps_epi32(rt);
1692 #ifdef __XOP__
1693 vfeps = _mm_frcz_ps(rt);
1694 #else
1695 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1696 #endif
1697 twovfeps = _mm_add_ps(vfeps,vfeps);
1698 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1702 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1703 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1704 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1705 _MM_TRANSPOSE4_PS(Y,F,G,H);
1706 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1707 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1708 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1710 fscal = felec;
1712 /* Update vectorial force */
1713 fix1 = _mm_macc_ps(dx11,fscal,fix1);
1714 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
1715 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
1717 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
1718 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
1719 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
1721 /**************************
1722 * CALCULATE INTERACTIONS *
1723 **************************/
1725 r12 = _mm_mul_ps(rsq12,rinv12);
1727 /* Calculate table index by multiplying r with table scale and truncate to integer */
1728 rt = _mm_mul_ps(r12,vftabscale);
1729 vfitab = _mm_cvttps_epi32(rt);
1730 #ifdef __XOP__
1731 vfeps = _mm_frcz_ps(rt);
1732 #else
1733 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1734 #endif
1735 twovfeps = _mm_add_ps(vfeps,vfeps);
1736 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1738 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1739 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1740 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1741 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1742 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1743 _MM_TRANSPOSE4_PS(Y,F,G,H);
1744 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1745 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1746 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1748 fscal = felec;
1750 /* Update vectorial force */
1751 fix1 = _mm_macc_ps(dx12,fscal,fix1);
1752 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
1753 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
1755 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
1756 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
1757 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
1759 /**************************
1760 * CALCULATE INTERACTIONS *
1761 **************************/
1763 r13 = _mm_mul_ps(rsq13,rinv13);
1765 /* Calculate table index by multiplying r with table scale and truncate to integer */
1766 rt = _mm_mul_ps(r13,vftabscale);
1767 vfitab = _mm_cvttps_epi32(rt);
1768 #ifdef __XOP__
1769 vfeps = _mm_frcz_ps(rt);
1770 #else
1771 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1772 #endif
1773 twovfeps = _mm_add_ps(vfeps,vfeps);
1774 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1776 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1777 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1778 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1779 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1780 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1781 _MM_TRANSPOSE4_PS(Y,F,G,H);
1782 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1783 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1784 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1786 fscal = felec;
1788 /* Update vectorial force */
1789 fix1 = _mm_macc_ps(dx13,fscal,fix1);
1790 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
1791 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
1793 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
1794 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
1795 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
1797 /**************************
1798 * CALCULATE INTERACTIONS *
1799 **************************/
1801 r21 = _mm_mul_ps(rsq21,rinv21);
1803 /* Calculate table index by multiplying r with table scale and truncate to integer */
1804 rt = _mm_mul_ps(r21,vftabscale);
1805 vfitab = _mm_cvttps_epi32(rt);
1806 #ifdef __XOP__
1807 vfeps = _mm_frcz_ps(rt);
1808 #else
1809 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1810 #endif
1811 twovfeps = _mm_add_ps(vfeps,vfeps);
1812 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1814 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1815 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1816 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1817 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1818 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1819 _MM_TRANSPOSE4_PS(Y,F,G,H);
1820 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1821 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1822 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1824 fscal = felec;
1826 /* Update vectorial force */
1827 fix2 = _mm_macc_ps(dx21,fscal,fix2);
1828 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
1829 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
1831 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
1832 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
1833 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
1835 /**************************
1836 * CALCULATE INTERACTIONS *
1837 **************************/
1839 r22 = _mm_mul_ps(rsq22,rinv22);
1841 /* Calculate table index by multiplying r with table scale and truncate to integer */
1842 rt = _mm_mul_ps(r22,vftabscale);
1843 vfitab = _mm_cvttps_epi32(rt);
1844 #ifdef __XOP__
1845 vfeps = _mm_frcz_ps(rt);
1846 #else
1847 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1848 #endif
1849 twovfeps = _mm_add_ps(vfeps,vfeps);
1850 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1852 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1853 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1854 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1855 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1856 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1857 _MM_TRANSPOSE4_PS(Y,F,G,H);
1858 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1859 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1860 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1862 fscal = felec;
1864 /* Update vectorial force */
1865 fix2 = _mm_macc_ps(dx22,fscal,fix2);
1866 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
1867 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
1869 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
1870 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
1871 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 r23 = _mm_mul_ps(rsq23,rinv23);
1879 /* Calculate table index by multiplying r with table scale and truncate to integer */
1880 rt = _mm_mul_ps(r23,vftabscale);
1881 vfitab = _mm_cvttps_epi32(rt);
1882 #ifdef __XOP__
1883 vfeps = _mm_frcz_ps(rt);
1884 #else
1885 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1886 #endif
1887 twovfeps = _mm_add_ps(vfeps,vfeps);
1888 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1890 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1891 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1892 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1893 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1894 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1895 _MM_TRANSPOSE4_PS(Y,F,G,H);
1896 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1897 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1898 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1900 fscal = felec;
1902 /* Update vectorial force */
1903 fix2 = _mm_macc_ps(dx23,fscal,fix2);
1904 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
1905 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
1907 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
1908 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
1909 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 r31 = _mm_mul_ps(rsq31,rinv31);
1917 /* Calculate table index by multiplying r with table scale and truncate to integer */
1918 rt = _mm_mul_ps(r31,vftabscale);
1919 vfitab = _mm_cvttps_epi32(rt);
1920 #ifdef __XOP__
1921 vfeps = _mm_frcz_ps(rt);
1922 #else
1923 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1924 #endif
1925 twovfeps = _mm_add_ps(vfeps,vfeps);
1926 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1928 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1929 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1930 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1931 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1932 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1933 _MM_TRANSPOSE4_PS(Y,F,G,H);
1934 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1935 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1936 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1938 fscal = felec;
1940 /* Update vectorial force */
1941 fix3 = _mm_macc_ps(dx31,fscal,fix3);
1942 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
1943 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
1945 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
1946 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
1947 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r32 = _mm_mul_ps(rsq32,rinv32);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt = _mm_mul_ps(r32,vftabscale);
1957 vfitab = _mm_cvttps_epi32(rt);
1958 #ifdef __XOP__
1959 vfeps = _mm_frcz_ps(rt);
1960 #else
1961 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1962 #endif
1963 twovfeps = _mm_add_ps(vfeps,vfeps);
1964 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1966 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1967 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1968 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1969 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1970 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1971 _MM_TRANSPOSE4_PS(Y,F,G,H);
1972 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1973 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1974 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1976 fscal = felec;
1978 /* Update vectorial force */
1979 fix3 = _mm_macc_ps(dx32,fscal,fix3);
1980 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
1981 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
1983 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
1984 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
1985 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r33 = _mm_mul_ps(rsq33,rinv33);
1993 /* Calculate table index by multiplying r with table scale and truncate to integer */
1994 rt = _mm_mul_ps(r33,vftabscale);
1995 vfitab = _mm_cvttps_epi32(rt);
1996 #ifdef __XOP__
1997 vfeps = _mm_frcz_ps(rt);
1998 #else
1999 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2000 #endif
2001 twovfeps = _mm_add_ps(vfeps,vfeps);
2002 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2006 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2007 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2008 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2009 _MM_TRANSPOSE4_PS(Y,F,G,H);
2010 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2011 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2012 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2014 fscal = felec;
2016 /* Update vectorial force */
2017 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2018 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2019 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2021 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2022 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2023 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2025 fjptrA = f+j_coord_offsetA;
2026 fjptrB = f+j_coord_offsetB;
2027 fjptrC = f+j_coord_offsetC;
2028 fjptrD = f+j_coord_offsetD;
2030 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2031 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2032 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2034 /* Inner loop uses 432 flops */
2037 if(jidx<j_index_end)
2040 /* Get j neighbor index, and coordinate index */
2041 jnrlistA = jjnr[jidx];
2042 jnrlistB = jjnr[jidx+1];
2043 jnrlistC = jjnr[jidx+2];
2044 jnrlistD = jjnr[jidx+3];
2045 /* Sign of each element will be negative for non-real atoms.
2046 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2047 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2049 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2050 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2051 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2052 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2053 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2054 j_coord_offsetA = DIM*jnrA;
2055 j_coord_offsetB = DIM*jnrB;
2056 j_coord_offsetC = DIM*jnrC;
2057 j_coord_offsetD = DIM*jnrD;
2059 /* load j atom coordinates */
2060 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2061 x+j_coord_offsetC,x+j_coord_offsetD,
2062 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2063 &jy2,&jz2,&jx3,&jy3,&jz3);
2065 /* Calculate displacement vector */
2066 dx00 = _mm_sub_ps(ix0,jx0);
2067 dy00 = _mm_sub_ps(iy0,jy0);
2068 dz00 = _mm_sub_ps(iz0,jz0);
2069 dx11 = _mm_sub_ps(ix1,jx1);
2070 dy11 = _mm_sub_ps(iy1,jy1);
2071 dz11 = _mm_sub_ps(iz1,jz1);
2072 dx12 = _mm_sub_ps(ix1,jx2);
2073 dy12 = _mm_sub_ps(iy1,jy2);
2074 dz12 = _mm_sub_ps(iz1,jz2);
2075 dx13 = _mm_sub_ps(ix1,jx3);
2076 dy13 = _mm_sub_ps(iy1,jy3);
2077 dz13 = _mm_sub_ps(iz1,jz3);
2078 dx21 = _mm_sub_ps(ix2,jx1);
2079 dy21 = _mm_sub_ps(iy2,jy1);
2080 dz21 = _mm_sub_ps(iz2,jz1);
2081 dx22 = _mm_sub_ps(ix2,jx2);
2082 dy22 = _mm_sub_ps(iy2,jy2);
2083 dz22 = _mm_sub_ps(iz2,jz2);
2084 dx23 = _mm_sub_ps(ix2,jx3);
2085 dy23 = _mm_sub_ps(iy2,jy3);
2086 dz23 = _mm_sub_ps(iz2,jz3);
2087 dx31 = _mm_sub_ps(ix3,jx1);
2088 dy31 = _mm_sub_ps(iy3,jy1);
2089 dz31 = _mm_sub_ps(iz3,jz1);
2090 dx32 = _mm_sub_ps(ix3,jx2);
2091 dy32 = _mm_sub_ps(iy3,jy2);
2092 dz32 = _mm_sub_ps(iz3,jz2);
2093 dx33 = _mm_sub_ps(ix3,jx3);
2094 dy33 = _mm_sub_ps(iy3,jy3);
2095 dz33 = _mm_sub_ps(iz3,jz3);
2097 /* Calculate squared distance and things based on it */
2098 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2099 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2100 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2101 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2102 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2103 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2104 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2105 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2106 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2107 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2109 rinv00 = avx128fma_invsqrt_f(rsq00);
2110 rinv11 = avx128fma_invsqrt_f(rsq11);
2111 rinv12 = avx128fma_invsqrt_f(rsq12);
2112 rinv13 = avx128fma_invsqrt_f(rsq13);
2113 rinv21 = avx128fma_invsqrt_f(rsq21);
2114 rinv22 = avx128fma_invsqrt_f(rsq22);
2115 rinv23 = avx128fma_invsqrt_f(rsq23);
2116 rinv31 = avx128fma_invsqrt_f(rsq31);
2117 rinv32 = avx128fma_invsqrt_f(rsq32);
2118 rinv33 = avx128fma_invsqrt_f(rsq33);
2120 fjx0 = _mm_setzero_ps();
2121 fjy0 = _mm_setzero_ps();
2122 fjz0 = _mm_setzero_ps();
2123 fjx1 = _mm_setzero_ps();
2124 fjy1 = _mm_setzero_ps();
2125 fjz1 = _mm_setzero_ps();
2126 fjx2 = _mm_setzero_ps();
2127 fjy2 = _mm_setzero_ps();
2128 fjz2 = _mm_setzero_ps();
2129 fjx3 = _mm_setzero_ps();
2130 fjy3 = _mm_setzero_ps();
2131 fjz3 = _mm_setzero_ps();
2133 /**************************
2134 * CALCULATE INTERACTIONS *
2135 **************************/
2137 r00 = _mm_mul_ps(rsq00,rinv00);
2138 r00 = _mm_andnot_ps(dummy_mask,r00);
2140 /* Calculate table index by multiplying r with table scale and truncate to integer */
2141 rt = _mm_mul_ps(r00,vftabscale);
2142 vfitab = _mm_cvttps_epi32(rt);
2143 #ifdef __XOP__
2144 vfeps = _mm_frcz_ps(rt);
2145 #else
2146 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2147 #endif
2148 twovfeps = _mm_add_ps(vfeps,vfeps);
2149 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2151 /* CUBIC SPLINE TABLE DISPERSION */
2152 vfitab = _mm_add_epi32(vfitab,ifour);
2153 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2154 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2155 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2156 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2157 _MM_TRANSPOSE4_PS(Y,F,G,H);
2158 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2159 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2160 fvdw6 = _mm_mul_ps(c6_00,FF);
2162 /* CUBIC SPLINE TABLE REPULSION */
2163 vfitab = _mm_add_epi32(vfitab,ifour);
2164 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2165 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2166 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2167 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2168 _MM_TRANSPOSE4_PS(Y,F,G,H);
2169 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2170 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2171 fvdw12 = _mm_mul_ps(c12_00,FF);
2172 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2174 fscal = fvdw;
2176 fscal = _mm_andnot_ps(dummy_mask,fscal);
2178 /* Update vectorial force */
2179 fix0 = _mm_macc_ps(dx00,fscal,fix0);
2180 fiy0 = _mm_macc_ps(dy00,fscal,fiy0);
2181 fiz0 = _mm_macc_ps(dz00,fscal,fiz0);
2183 fjx0 = _mm_macc_ps(dx00,fscal,fjx0);
2184 fjy0 = _mm_macc_ps(dy00,fscal,fjy0);
2185 fjz0 = _mm_macc_ps(dz00,fscal,fjz0);
2187 /**************************
2188 * CALCULATE INTERACTIONS *
2189 **************************/
2191 r11 = _mm_mul_ps(rsq11,rinv11);
2192 r11 = _mm_andnot_ps(dummy_mask,r11);
2194 /* Calculate table index by multiplying r with table scale and truncate to integer */
2195 rt = _mm_mul_ps(r11,vftabscale);
2196 vfitab = _mm_cvttps_epi32(rt);
2197 #ifdef __XOP__
2198 vfeps = _mm_frcz_ps(rt);
2199 #else
2200 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2201 #endif
2202 twovfeps = _mm_add_ps(vfeps,vfeps);
2203 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2206 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2207 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2208 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2209 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2210 _MM_TRANSPOSE4_PS(Y,F,G,H);
2211 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2212 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2213 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2215 fscal = felec;
2217 fscal = _mm_andnot_ps(dummy_mask,fscal);
2219 /* Update vectorial force */
2220 fix1 = _mm_macc_ps(dx11,fscal,fix1);
2221 fiy1 = _mm_macc_ps(dy11,fscal,fiy1);
2222 fiz1 = _mm_macc_ps(dz11,fscal,fiz1);
2224 fjx1 = _mm_macc_ps(dx11,fscal,fjx1);
2225 fjy1 = _mm_macc_ps(dy11,fscal,fjy1);
2226 fjz1 = _mm_macc_ps(dz11,fscal,fjz1);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 r12 = _mm_mul_ps(rsq12,rinv12);
2233 r12 = _mm_andnot_ps(dummy_mask,r12);
2235 /* Calculate table index by multiplying r with table scale and truncate to integer */
2236 rt = _mm_mul_ps(r12,vftabscale);
2237 vfitab = _mm_cvttps_epi32(rt);
2238 #ifdef __XOP__
2239 vfeps = _mm_frcz_ps(rt);
2240 #else
2241 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2242 #endif
2243 twovfeps = _mm_add_ps(vfeps,vfeps);
2244 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2246 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2247 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2248 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2249 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2250 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2251 _MM_TRANSPOSE4_PS(Y,F,G,H);
2252 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2253 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2254 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2256 fscal = felec;
2258 fscal = _mm_andnot_ps(dummy_mask,fscal);
2260 /* Update vectorial force */
2261 fix1 = _mm_macc_ps(dx12,fscal,fix1);
2262 fiy1 = _mm_macc_ps(dy12,fscal,fiy1);
2263 fiz1 = _mm_macc_ps(dz12,fscal,fiz1);
2265 fjx2 = _mm_macc_ps(dx12,fscal,fjx2);
2266 fjy2 = _mm_macc_ps(dy12,fscal,fjy2);
2267 fjz2 = _mm_macc_ps(dz12,fscal,fjz2);
2269 /**************************
2270 * CALCULATE INTERACTIONS *
2271 **************************/
2273 r13 = _mm_mul_ps(rsq13,rinv13);
2274 r13 = _mm_andnot_ps(dummy_mask,r13);
2276 /* Calculate table index by multiplying r with table scale and truncate to integer */
2277 rt = _mm_mul_ps(r13,vftabscale);
2278 vfitab = _mm_cvttps_epi32(rt);
2279 #ifdef __XOP__
2280 vfeps = _mm_frcz_ps(rt);
2281 #else
2282 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2283 #endif
2284 twovfeps = _mm_add_ps(vfeps,vfeps);
2285 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2287 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2288 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2289 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2290 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2291 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2292 _MM_TRANSPOSE4_PS(Y,F,G,H);
2293 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2294 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2295 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2297 fscal = felec;
2299 fscal = _mm_andnot_ps(dummy_mask,fscal);
2301 /* Update vectorial force */
2302 fix1 = _mm_macc_ps(dx13,fscal,fix1);
2303 fiy1 = _mm_macc_ps(dy13,fscal,fiy1);
2304 fiz1 = _mm_macc_ps(dz13,fscal,fiz1);
2306 fjx3 = _mm_macc_ps(dx13,fscal,fjx3);
2307 fjy3 = _mm_macc_ps(dy13,fscal,fjy3);
2308 fjz3 = _mm_macc_ps(dz13,fscal,fjz3);
2310 /**************************
2311 * CALCULATE INTERACTIONS *
2312 **************************/
2314 r21 = _mm_mul_ps(rsq21,rinv21);
2315 r21 = _mm_andnot_ps(dummy_mask,r21);
2317 /* Calculate table index by multiplying r with table scale and truncate to integer */
2318 rt = _mm_mul_ps(r21,vftabscale);
2319 vfitab = _mm_cvttps_epi32(rt);
2320 #ifdef __XOP__
2321 vfeps = _mm_frcz_ps(rt);
2322 #else
2323 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2324 #endif
2325 twovfeps = _mm_add_ps(vfeps,vfeps);
2326 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2328 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2329 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2330 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2331 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2332 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2333 _MM_TRANSPOSE4_PS(Y,F,G,H);
2334 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2335 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2336 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2338 fscal = felec;
2340 fscal = _mm_andnot_ps(dummy_mask,fscal);
2342 /* Update vectorial force */
2343 fix2 = _mm_macc_ps(dx21,fscal,fix2);
2344 fiy2 = _mm_macc_ps(dy21,fscal,fiy2);
2345 fiz2 = _mm_macc_ps(dz21,fscal,fiz2);
2347 fjx1 = _mm_macc_ps(dx21,fscal,fjx1);
2348 fjy1 = _mm_macc_ps(dy21,fscal,fjy1);
2349 fjz1 = _mm_macc_ps(dz21,fscal,fjz1);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 r22 = _mm_mul_ps(rsq22,rinv22);
2356 r22 = _mm_andnot_ps(dummy_mask,r22);
2358 /* Calculate table index by multiplying r with table scale and truncate to integer */
2359 rt = _mm_mul_ps(r22,vftabscale);
2360 vfitab = _mm_cvttps_epi32(rt);
2361 #ifdef __XOP__
2362 vfeps = _mm_frcz_ps(rt);
2363 #else
2364 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2365 #endif
2366 twovfeps = _mm_add_ps(vfeps,vfeps);
2367 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2369 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2370 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2371 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2372 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2373 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2374 _MM_TRANSPOSE4_PS(Y,F,G,H);
2375 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2376 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2377 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2379 fscal = felec;
2381 fscal = _mm_andnot_ps(dummy_mask,fscal);
2383 /* Update vectorial force */
2384 fix2 = _mm_macc_ps(dx22,fscal,fix2);
2385 fiy2 = _mm_macc_ps(dy22,fscal,fiy2);
2386 fiz2 = _mm_macc_ps(dz22,fscal,fiz2);
2388 fjx2 = _mm_macc_ps(dx22,fscal,fjx2);
2389 fjy2 = _mm_macc_ps(dy22,fscal,fjy2);
2390 fjz2 = _mm_macc_ps(dz22,fscal,fjz2);
2392 /**************************
2393 * CALCULATE INTERACTIONS *
2394 **************************/
2396 r23 = _mm_mul_ps(rsq23,rinv23);
2397 r23 = _mm_andnot_ps(dummy_mask,r23);
2399 /* Calculate table index by multiplying r with table scale and truncate to integer */
2400 rt = _mm_mul_ps(r23,vftabscale);
2401 vfitab = _mm_cvttps_epi32(rt);
2402 #ifdef __XOP__
2403 vfeps = _mm_frcz_ps(rt);
2404 #else
2405 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2406 #endif
2407 twovfeps = _mm_add_ps(vfeps,vfeps);
2408 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2410 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2411 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2412 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2413 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2414 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2415 _MM_TRANSPOSE4_PS(Y,F,G,H);
2416 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2417 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2418 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2420 fscal = felec;
2422 fscal = _mm_andnot_ps(dummy_mask,fscal);
2424 /* Update vectorial force */
2425 fix2 = _mm_macc_ps(dx23,fscal,fix2);
2426 fiy2 = _mm_macc_ps(dy23,fscal,fiy2);
2427 fiz2 = _mm_macc_ps(dz23,fscal,fiz2);
2429 fjx3 = _mm_macc_ps(dx23,fscal,fjx3);
2430 fjy3 = _mm_macc_ps(dy23,fscal,fjy3);
2431 fjz3 = _mm_macc_ps(dz23,fscal,fjz3);
2433 /**************************
2434 * CALCULATE INTERACTIONS *
2435 **************************/
2437 r31 = _mm_mul_ps(rsq31,rinv31);
2438 r31 = _mm_andnot_ps(dummy_mask,r31);
2440 /* Calculate table index by multiplying r with table scale and truncate to integer */
2441 rt = _mm_mul_ps(r31,vftabscale);
2442 vfitab = _mm_cvttps_epi32(rt);
2443 #ifdef __XOP__
2444 vfeps = _mm_frcz_ps(rt);
2445 #else
2446 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2447 #endif
2448 twovfeps = _mm_add_ps(vfeps,vfeps);
2449 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2451 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2452 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2453 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2454 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2455 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2456 _MM_TRANSPOSE4_PS(Y,F,G,H);
2457 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2458 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2459 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2461 fscal = felec;
2463 fscal = _mm_andnot_ps(dummy_mask,fscal);
2465 /* Update vectorial force */
2466 fix3 = _mm_macc_ps(dx31,fscal,fix3);
2467 fiy3 = _mm_macc_ps(dy31,fscal,fiy3);
2468 fiz3 = _mm_macc_ps(dz31,fscal,fiz3);
2470 fjx1 = _mm_macc_ps(dx31,fscal,fjx1);
2471 fjy1 = _mm_macc_ps(dy31,fscal,fjy1);
2472 fjz1 = _mm_macc_ps(dz31,fscal,fjz1);
2474 /**************************
2475 * CALCULATE INTERACTIONS *
2476 **************************/
2478 r32 = _mm_mul_ps(rsq32,rinv32);
2479 r32 = _mm_andnot_ps(dummy_mask,r32);
2481 /* Calculate table index by multiplying r with table scale and truncate to integer */
2482 rt = _mm_mul_ps(r32,vftabscale);
2483 vfitab = _mm_cvttps_epi32(rt);
2484 #ifdef __XOP__
2485 vfeps = _mm_frcz_ps(rt);
2486 #else
2487 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2488 #endif
2489 twovfeps = _mm_add_ps(vfeps,vfeps);
2490 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2492 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2493 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2494 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2495 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2496 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2497 _MM_TRANSPOSE4_PS(Y,F,G,H);
2498 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2499 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2500 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2502 fscal = felec;
2504 fscal = _mm_andnot_ps(dummy_mask,fscal);
2506 /* Update vectorial force */
2507 fix3 = _mm_macc_ps(dx32,fscal,fix3);
2508 fiy3 = _mm_macc_ps(dy32,fscal,fiy3);
2509 fiz3 = _mm_macc_ps(dz32,fscal,fiz3);
2511 fjx2 = _mm_macc_ps(dx32,fscal,fjx2);
2512 fjy2 = _mm_macc_ps(dy32,fscal,fjy2);
2513 fjz2 = _mm_macc_ps(dz32,fscal,fjz2);
2515 /**************************
2516 * CALCULATE INTERACTIONS *
2517 **************************/
2519 r33 = _mm_mul_ps(rsq33,rinv33);
2520 r33 = _mm_andnot_ps(dummy_mask,r33);
2522 /* Calculate table index by multiplying r with table scale and truncate to integer */
2523 rt = _mm_mul_ps(r33,vftabscale);
2524 vfitab = _mm_cvttps_epi32(rt);
2525 #ifdef __XOP__
2526 vfeps = _mm_frcz_ps(rt);
2527 #else
2528 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2529 #endif
2530 twovfeps = _mm_add_ps(vfeps,vfeps);
2531 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2533 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2534 Y = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2535 F = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2536 G = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2537 H = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2538 _MM_TRANSPOSE4_PS(Y,F,G,H);
2539 Fp = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2540 FF = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2541 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2543 fscal = felec;
2545 fscal = _mm_andnot_ps(dummy_mask,fscal);
2547 /* Update vectorial force */
2548 fix3 = _mm_macc_ps(dx33,fscal,fix3);
2549 fiy3 = _mm_macc_ps(dy33,fscal,fiy3);
2550 fiz3 = _mm_macc_ps(dz33,fscal,fiz3);
2552 fjx3 = _mm_macc_ps(dx33,fscal,fjx3);
2553 fjy3 = _mm_macc_ps(dy33,fscal,fjy3);
2554 fjz3 = _mm_macc_ps(dz33,fscal,fjz3);
2556 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2557 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2558 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2559 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2561 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2562 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2563 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2565 /* Inner loop uses 442 flops */
2568 /* End of innermost loop */
2570 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2571 f+i_coord_offset,fshift+i_shift_offset);
2573 /* Increment number of inner iterations */
2574 inneriter += j_index_end - j_index_start;
2576 /* Outer loop uses 24 flops */
2579 /* Increment number of outer iterations */
2580 outeriter += nri;
2582 /* Update outer/inner flops */
2584 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*442);