Remove nb-parameters from t_forcerec
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_single.c
blobc2b8fd258a0f929dcadc930ebb997b0ce9a48629
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse2_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
56 void
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real rcutoff_scalar;
78 real *shiftvec,*fshift,*x,*f;
79 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80 real scratch[4*DIM];
81 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82 int vdwioffset0;
83 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84 int vdwioffset1;
85 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86 int vdwioffset2;
87 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
104 real *charge;
105 int nvdwtype;
106 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107 int *vdwtype;
108 real *vdwparam;
109 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
111 __m128i vfitab;
112 __m128i ifour = _mm_set1_epi32(4);
113 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
114 real *vftab;
115 __m128 dummy_mask,cutoff_mask;
116 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117 __m128 one = _mm_set1_ps(1.0);
118 __m128 two = _mm_set1_ps(2.0);
119 x = xx[0];
120 f = ff[0];
122 nri = nlist->nri;
123 iinr = nlist->iinr;
124 jindex = nlist->jindex;
125 jjnr = nlist->jjnr;
126 shiftidx = nlist->shift;
127 gid = nlist->gid;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm_set1_ps(fr->ic->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
133 vdwparam = fr->nbfp;
134 vdwtype = mdatoms->typeA;
136 vftab = kernel_data->table_elec_vdw->data;
137 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
142 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
143 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
144 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
146 jq0 = _mm_set1_ps(charge[inr+0]);
147 jq1 = _mm_set1_ps(charge[inr+1]);
148 jq2 = _mm_set1_ps(charge[inr+2]);
149 vdwjidx0A = 2*vdwtype[inr+0];
150 qq00 = _mm_mul_ps(iq0,jq0);
151 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
152 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
153 qq01 = _mm_mul_ps(iq0,jq1);
154 qq02 = _mm_mul_ps(iq0,jq2);
155 qq10 = _mm_mul_ps(iq1,jq0);
156 qq11 = _mm_mul_ps(iq1,jq1);
157 qq12 = _mm_mul_ps(iq1,jq2);
158 qq20 = _mm_mul_ps(iq2,jq0);
159 qq21 = _mm_mul_ps(iq2,jq1);
160 qq22 = _mm_mul_ps(iq2,jq2);
162 /* Avoid stupid compiler warnings */
163 jnrA = jnrB = jnrC = jnrD = 0;
164 j_coord_offsetA = 0;
165 j_coord_offsetB = 0;
166 j_coord_offsetC = 0;
167 j_coord_offsetD = 0;
169 outeriter = 0;
170 inneriter = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
174 scratch[iidx] = 0.0;
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
188 inr = iinr[iidx];
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
195 fix0 = _mm_setzero_ps();
196 fiy0 = _mm_setzero_ps();
197 fiz0 = _mm_setzero_ps();
198 fix1 = _mm_setzero_ps();
199 fiy1 = _mm_setzero_ps();
200 fiz1 = _mm_setzero_ps();
201 fix2 = _mm_setzero_ps();
202 fiy2 = _mm_setzero_ps();
203 fiz2 = _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum = _mm_setzero_ps();
207 vvdwsum = _mm_setzero_ps();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
214 jnrA = jjnr[jidx];
215 jnrB = jjnr[jidx+1];
216 jnrC = jjnr[jidx+2];
217 jnrD = jjnr[jidx+3];
218 j_coord_offsetA = DIM*jnrA;
219 j_coord_offsetB = DIM*jnrB;
220 j_coord_offsetC = DIM*jnrC;
221 j_coord_offsetD = DIM*jnrD;
223 /* load j atom coordinates */
224 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
225 x+j_coord_offsetC,x+j_coord_offsetD,
226 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
228 /* Calculate displacement vector */
229 dx00 = _mm_sub_ps(ix0,jx0);
230 dy00 = _mm_sub_ps(iy0,jy0);
231 dz00 = _mm_sub_ps(iz0,jz0);
232 dx01 = _mm_sub_ps(ix0,jx1);
233 dy01 = _mm_sub_ps(iy0,jy1);
234 dz01 = _mm_sub_ps(iz0,jz1);
235 dx02 = _mm_sub_ps(ix0,jx2);
236 dy02 = _mm_sub_ps(iy0,jy2);
237 dz02 = _mm_sub_ps(iz0,jz2);
238 dx10 = _mm_sub_ps(ix1,jx0);
239 dy10 = _mm_sub_ps(iy1,jy0);
240 dz10 = _mm_sub_ps(iz1,jz0);
241 dx11 = _mm_sub_ps(ix1,jx1);
242 dy11 = _mm_sub_ps(iy1,jy1);
243 dz11 = _mm_sub_ps(iz1,jz1);
244 dx12 = _mm_sub_ps(ix1,jx2);
245 dy12 = _mm_sub_ps(iy1,jy2);
246 dz12 = _mm_sub_ps(iz1,jz2);
247 dx20 = _mm_sub_ps(ix2,jx0);
248 dy20 = _mm_sub_ps(iy2,jy0);
249 dz20 = _mm_sub_ps(iz2,jz0);
250 dx21 = _mm_sub_ps(ix2,jx1);
251 dy21 = _mm_sub_ps(iy2,jy1);
252 dz21 = _mm_sub_ps(iz2,jz1);
253 dx22 = _mm_sub_ps(ix2,jx2);
254 dy22 = _mm_sub_ps(iy2,jy2);
255 dz22 = _mm_sub_ps(iz2,jz2);
257 /* Calculate squared distance and things based on it */
258 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
259 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
260 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
261 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
262 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
263 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
264 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
265 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
266 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
268 rinv00 = sse2_invsqrt_f(rsq00);
269 rinv01 = sse2_invsqrt_f(rsq01);
270 rinv02 = sse2_invsqrt_f(rsq02);
271 rinv10 = sse2_invsqrt_f(rsq10);
272 rinv11 = sse2_invsqrt_f(rsq11);
273 rinv12 = sse2_invsqrt_f(rsq12);
274 rinv20 = sse2_invsqrt_f(rsq20);
275 rinv21 = sse2_invsqrt_f(rsq21);
276 rinv22 = sse2_invsqrt_f(rsq22);
278 fjx0 = _mm_setzero_ps();
279 fjy0 = _mm_setzero_ps();
280 fjz0 = _mm_setzero_ps();
281 fjx1 = _mm_setzero_ps();
282 fjy1 = _mm_setzero_ps();
283 fjz1 = _mm_setzero_ps();
284 fjx2 = _mm_setzero_ps();
285 fjy2 = _mm_setzero_ps();
286 fjz2 = _mm_setzero_ps();
288 /**************************
289 * CALCULATE INTERACTIONS *
290 **************************/
292 r00 = _mm_mul_ps(rsq00,rinv00);
294 /* Calculate table index by multiplying r with table scale and truncate to integer */
295 rt = _mm_mul_ps(r00,vftabscale);
296 vfitab = _mm_cvttps_epi32(rt);
297 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
298 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
300 /* CUBIC SPLINE TABLE ELECTROSTATICS */
301 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
302 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
303 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
304 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
305 _MM_TRANSPOSE4_PS(Y,F,G,H);
306 Heps = _mm_mul_ps(vfeps,H);
307 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
308 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
309 velec = _mm_mul_ps(qq00,VV);
310 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
311 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
313 /* CUBIC SPLINE TABLE DISPERSION */
314 vfitab = _mm_add_epi32(vfitab,ifour);
315 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
316 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
317 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
318 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
319 _MM_TRANSPOSE4_PS(Y,F,G,H);
320 Heps = _mm_mul_ps(vfeps,H);
321 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
322 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
323 vvdw6 = _mm_mul_ps(c6_00,VV);
324 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
325 fvdw6 = _mm_mul_ps(c6_00,FF);
327 /* CUBIC SPLINE TABLE REPULSION */
328 vfitab = _mm_add_epi32(vfitab,ifour);
329 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
330 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
331 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
332 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
333 _MM_TRANSPOSE4_PS(Y,F,G,H);
334 Heps = _mm_mul_ps(vfeps,H);
335 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
336 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
337 vvdw12 = _mm_mul_ps(c12_00,VV);
338 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
339 fvdw12 = _mm_mul_ps(c12_00,FF);
340 vvdw = _mm_add_ps(vvdw12,vvdw6);
341 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum = _mm_add_ps(velecsum,velec);
345 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
347 fscal = _mm_add_ps(felec,fvdw);
349 /* Calculate temporary vectorial force */
350 tx = _mm_mul_ps(fscal,dx00);
351 ty = _mm_mul_ps(fscal,dy00);
352 tz = _mm_mul_ps(fscal,dz00);
354 /* Update vectorial force */
355 fix0 = _mm_add_ps(fix0,tx);
356 fiy0 = _mm_add_ps(fiy0,ty);
357 fiz0 = _mm_add_ps(fiz0,tz);
359 fjx0 = _mm_add_ps(fjx0,tx);
360 fjy0 = _mm_add_ps(fjy0,ty);
361 fjz0 = _mm_add_ps(fjz0,tz);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 r01 = _mm_mul_ps(rsq01,rinv01);
369 /* Calculate table index by multiplying r with table scale and truncate to integer */
370 rt = _mm_mul_ps(r01,vftabscale);
371 vfitab = _mm_cvttps_epi32(rt);
372 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
373 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
375 /* CUBIC SPLINE TABLE ELECTROSTATICS */
376 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
377 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
378 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
379 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
380 _MM_TRANSPOSE4_PS(Y,F,G,H);
381 Heps = _mm_mul_ps(vfeps,H);
382 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
383 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
384 velec = _mm_mul_ps(qq01,VV);
385 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
386 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velecsum = _mm_add_ps(velecsum,velec);
391 fscal = felec;
393 /* Calculate temporary vectorial force */
394 tx = _mm_mul_ps(fscal,dx01);
395 ty = _mm_mul_ps(fscal,dy01);
396 tz = _mm_mul_ps(fscal,dz01);
398 /* Update vectorial force */
399 fix0 = _mm_add_ps(fix0,tx);
400 fiy0 = _mm_add_ps(fiy0,ty);
401 fiz0 = _mm_add_ps(fiz0,tz);
403 fjx1 = _mm_add_ps(fjx1,tx);
404 fjy1 = _mm_add_ps(fjy1,ty);
405 fjz1 = _mm_add_ps(fjz1,tz);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 r02 = _mm_mul_ps(rsq02,rinv02);
413 /* Calculate table index by multiplying r with table scale and truncate to integer */
414 rt = _mm_mul_ps(r02,vftabscale);
415 vfitab = _mm_cvttps_epi32(rt);
416 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
417 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
419 /* CUBIC SPLINE TABLE ELECTROSTATICS */
420 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
421 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
422 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
423 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
424 _MM_TRANSPOSE4_PS(Y,F,G,H);
425 Heps = _mm_mul_ps(vfeps,H);
426 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
427 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
428 velec = _mm_mul_ps(qq02,VV);
429 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
430 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum = _mm_add_ps(velecsum,velec);
435 fscal = felec;
437 /* Calculate temporary vectorial force */
438 tx = _mm_mul_ps(fscal,dx02);
439 ty = _mm_mul_ps(fscal,dy02);
440 tz = _mm_mul_ps(fscal,dz02);
442 /* Update vectorial force */
443 fix0 = _mm_add_ps(fix0,tx);
444 fiy0 = _mm_add_ps(fiy0,ty);
445 fiz0 = _mm_add_ps(fiz0,tz);
447 fjx2 = _mm_add_ps(fjx2,tx);
448 fjy2 = _mm_add_ps(fjy2,ty);
449 fjz2 = _mm_add_ps(fjz2,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 r10 = _mm_mul_ps(rsq10,rinv10);
457 /* Calculate table index by multiplying r with table scale and truncate to integer */
458 rt = _mm_mul_ps(r10,vftabscale);
459 vfitab = _mm_cvttps_epi32(rt);
460 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
461 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
463 /* CUBIC SPLINE TABLE ELECTROSTATICS */
464 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
465 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
466 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
467 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
468 _MM_TRANSPOSE4_PS(Y,F,G,H);
469 Heps = _mm_mul_ps(vfeps,H);
470 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
471 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
472 velec = _mm_mul_ps(qq10,VV);
473 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
474 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
476 /* Update potential sum for this i atom from the interaction with this j atom. */
477 velecsum = _mm_add_ps(velecsum,velec);
479 fscal = felec;
481 /* Calculate temporary vectorial force */
482 tx = _mm_mul_ps(fscal,dx10);
483 ty = _mm_mul_ps(fscal,dy10);
484 tz = _mm_mul_ps(fscal,dz10);
486 /* Update vectorial force */
487 fix1 = _mm_add_ps(fix1,tx);
488 fiy1 = _mm_add_ps(fiy1,ty);
489 fiz1 = _mm_add_ps(fiz1,tz);
491 fjx0 = _mm_add_ps(fjx0,tx);
492 fjy0 = _mm_add_ps(fjy0,ty);
493 fjz0 = _mm_add_ps(fjz0,tz);
495 /**************************
496 * CALCULATE INTERACTIONS *
497 **************************/
499 r11 = _mm_mul_ps(rsq11,rinv11);
501 /* Calculate table index by multiplying r with table scale and truncate to integer */
502 rt = _mm_mul_ps(r11,vftabscale);
503 vfitab = _mm_cvttps_epi32(rt);
504 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
505 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
507 /* CUBIC SPLINE TABLE ELECTROSTATICS */
508 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
509 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
510 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
511 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
512 _MM_TRANSPOSE4_PS(Y,F,G,H);
513 Heps = _mm_mul_ps(vfeps,H);
514 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
515 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
516 velec = _mm_mul_ps(qq11,VV);
517 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
518 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velecsum = _mm_add_ps(velecsum,velec);
523 fscal = felec;
525 /* Calculate temporary vectorial force */
526 tx = _mm_mul_ps(fscal,dx11);
527 ty = _mm_mul_ps(fscal,dy11);
528 tz = _mm_mul_ps(fscal,dz11);
530 /* Update vectorial force */
531 fix1 = _mm_add_ps(fix1,tx);
532 fiy1 = _mm_add_ps(fiy1,ty);
533 fiz1 = _mm_add_ps(fiz1,tz);
535 fjx1 = _mm_add_ps(fjx1,tx);
536 fjy1 = _mm_add_ps(fjy1,ty);
537 fjz1 = _mm_add_ps(fjz1,tz);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 r12 = _mm_mul_ps(rsq12,rinv12);
545 /* Calculate table index by multiplying r with table scale and truncate to integer */
546 rt = _mm_mul_ps(r12,vftabscale);
547 vfitab = _mm_cvttps_epi32(rt);
548 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
549 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
551 /* CUBIC SPLINE TABLE ELECTROSTATICS */
552 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
553 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
554 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
555 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
556 _MM_TRANSPOSE4_PS(Y,F,G,H);
557 Heps = _mm_mul_ps(vfeps,H);
558 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
559 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
560 velec = _mm_mul_ps(qq12,VV);
561 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
562 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
564 /* Update potential sum for this i atom from the interaction with this j atom. */
565 velecsum = _mm_add_ps(velecsum,velec);
567 fscal = felec;
569 /* Calculate temporary vectorial force */
570 tx = _mm_mul_ps(fscal,dx12);
571 ty = _mm_mul_ps(fscal,dy12);
572 tz = _mm_mul_ps(fscal,dz12);
574 /* Update vectorial force */
575 fix1 = _mm_add_ps(fix1,tx);
576 fiy1 = _mm_add_ps(fiy1,ty);
577 fiz1 = _mm_add_ps(fiz1,tz);
579 fjx2 = _mm_add_ps(fjx2,tx);
580 fjy2 = _mm_add_ps(fjy2,ty);
581 fjz2 = _mm_add_ps(fjz2,tz);
583 /**************************
584 * CALCULATE INTERACTIONS *
585 **************************/
587 r20 = _mm_mul_ps(rsq20,rinv20);
589 /* Calculate table index by multiplying r with table scale and truncate to integer */
590 rt = _mm_mul_ps(r20,vftabscale);
591 vfitab = _mm_cvttps_epi32(rt);
592 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
593 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
595 /* CUBIC SPLINE TABLE ELECTROSTATICS */
596 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
597 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
598 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
599 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
600 _MM_TRANSPOSE4_PS(Y,F,G,H);
601 Heps = _mm_mul_ps(vfeps,H);
602 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
603 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
604 velec = _mm_mul_ps(qq20,VV);
605 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
606 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
608 /* Update potential sum for this i atom from the interaction with this j atom. */
609 velecsum = _mm_add_ps(velecsum,velec);
611 fscal = felec;
613 /* Calculate temporary vectorial force */
614 tx = _mm_mul_ps(fscal,dx20);
615 ty = _mm_mul_ps(fscal,dy20);
616 tz = _mm_mul_ps(fscal,dz20);
618 /* Update vectorial force */
619 fix2 = _mm_add_ps(fix2,tx);
620 fiy2 = _mm_add_ps(fiy2,ty);
621 fiz2 = _mm_add_ps(fiz2,tz);
623 fjx0 = _mm_add_ps(fjx0,tx);
624 fjy0 = _mm_add_ps(fjy0,ty);
625 fjz0 = _mm_add_ps(fjz0,tz);
627 /**************************
628 * CALCULATE INTERACTIONS *
629 **************************/
631 r21 = _mm_mul_ps(rsq21,rinv21);
633 /* Calculate table index by multiplying r with table scale and truncate to integer */
634 rt = _mm_mul_ps(r21,vftabscale);
635 vfitab = _mm_cvttps_epi32(rt);
636 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
637 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
639 /* CUBIC SPLINE TABLE ELECTROSTATICS */
640 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
641 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
642 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
643 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
644 _MM_TRANSPOSE4_PS(Y,F,G,H);
645 Heps = _mm_mul_ps(vfeps,H);
646 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
647 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
648 velec = _mm_mul_ps(qq21,VV);
649 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
650 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
652 /* Update potential sum for this i atom from the interaction with this j atom. */
653 velecsum = _mm_add_ps(velecsum,velec);
655 fscal = felec;
657 /* Calculate temporary vectorial force */
658 tx = _mm_mul_ps(fscal,dx21);
659 ty = _mm_mul_ps(fscal,dy21);
660 tz = _mm_mul_ps(fscal,dz21);
662 /* Update vectorial force */
663 fix2 = _mm_add_ps(fix2,tx);
664 fiy2 = _mm_add_ps(fiy2,ty);
665 fiz2 = _mm_add_ps(fiz2,tz);
667 fjx1 = _mm_add_ps(fjx1,tx);
668 fjy1 = _mm_add_ps(fjy1,ty);
669 fjz1 = _mm_add_ps(fjz1,tz);
671 /**************************
672 * CALCULATE INTERACTIONS *
673 **************************/
675 r22 = _mm_mul_ps(rsq22,rinv22);
677 /* Calculate table index by multiplying r with table scale and truncate to integer */
678 rt = _mm_mul_ps(r22,vftabscale);
679 vfitab = _mm_cvttps_epi32(rt);
680 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
681 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
683 /* CUBIC SPLINE TABLE ELECTROSTATICS */
684 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
685 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
686 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
687 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
688 _MM_TRANSPOSE4_PS(Y,F,G,H);
689 Heps = _mm_mul_ps(vfeps,H);
690 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
691 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
692 velec = _mm_mul_ps(qq22,VV);
693 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
694 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
696 /* Update potential sum for this i atom from the interaction with this j atom. */
697 velecsum = _mm_add_ps(velecsum,velec);
699 fscal = felec;
701 /* Calculate temporary vectorial force */
702 tx = _mm_mul_ps(fscal,dx22);
703 ty = _mm_mul_ps(fscal,dy22);
704 tz = _mm_mul_ps(fscal,dz22);
706 /* Update vectorial force */
707 fix2 = _mm_add_ps(fix2,tx);
708 fiy2 = _mm_add_ps(fiy2,ty);
709 fiz2 = _mm_add_ps(fiz2,tz);
711 fjx2 = _mm_add_ps(fjx2,tx);
712 fjy2 = _mm_add_ps(fjy2,ty);
713 fjz2 = _mm_add_ps(fjz2,tz);
715 fjptrA = f+j_coord_offsetA;
716 fjptrB = f+j_coord_offsetB;
717 fjptrC = f+j_coord_offsetC;
718 fjptrD = f+j_coord_offsetD;
720 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
721 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
723 /* Inner loop uses 417 flops */
726 if(jidx<j_index_end)
729 /* Get j neighbor index, and coordinate index */
730 jnrlistA = jjnr[jidx];
731 jnrlistB = jjnr[jidx+1];
732 jnrlistC = jjnr[jidx+2];
733 jnrlistD = jjnr[jidx+3];
734 /* Sign of each element will be negative for non-real atoms.
735 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
736 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
738 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
739 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
740 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
741 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
742 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
743 j_coord_offsetA = DIM*jnrA;
744 j_coord_offsetB = DIM*jnrB;
745 j_coord_offsetC = DIM*jnrC;
746 j_coord_offsetD = DIM*jnrD;
748 /* load j atom coordinates */
749 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
750 x+j_coord_offsetC,x+j_coord_offsetD,
751 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
753 /* Calculate displacement vector */
754 dx00 = _mm_sub_ps(ix0,jx0);
755 dy00 = _mm_sub_ps(iy0,jy0);
756 dz00 = _mm_sub_ps(iz0,jz0);
757 dx01 = _mm_sub_ps(ix0,jx1);
758 dy01 = _mm_sub_ps(iy0,jy1);
759 dz01 = _mm_sub_ps(iz0,jz1);
760 dx02 = _mm_sub_ps(ix0,jx2);
761 dy02 = _mm_sub_ps(iy0,jy2);
762 dz02 = _mm_sub_ps(iz0,jz2);
763 dx10 = _mm_sub_ps(ix1,jx0);
764 dy10 = _mm_sub_ps(iy1,jy0);
765 dz10 = _mm_sub_ps(iz1,jz0);
766 dx11 = _mm_sub_ps(ix1,jx1);
767 dy11 = _mm_sub_ps(iy1,jy1);
768 dz11 = _mm_sub_ps(iz1,jz1);
769 dx12 = _mm_sub_ps(ix1,jx2);
770 dy12 = _mm_sub_ps(iy1,jy2);
771 dz12 = _mm_sub_ps(iz1,jz2);
772 dx20 = _mm_sub_ps(ix2,jx0);
773 dy20 = _mm_sub_ps(iy2,jy0);
774 dz20 = _mm_sub_ps(iz2,jz0);
775 dx21 = _mm_sub_ps(ix2,jx1);
776 dy21 = _mm_sub_ps(iy2,jy1);
777 dz21 = _mm_sub_ps(iz2,jz1);
778 dx22 = _mm_sub_ps(ix2,jx2);
779 dy22 = _mm_sub_ps(iy2,jy2);
780 dz22 = _mm_sub_ps(iz2,jz2);
782 /* Calculate squared distance and things based on it */
783 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
784 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
785 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
786 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
787 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
788 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
789 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
790 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
791 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
793 rinv00 = sse2_invsqrt_f(rsq00);
794 rinv01 = sse2_invsqrt_f(rsq01);
795 rinv02 = sse2_invsqrt_f(rsq02);
796 rinv10 = sse2_invsqrt_f(rsq10);
797 rinv11 = sse2_invsqrt_f(rsq11);
798 rinv12 = sse2_invsqrt_f(rsq12);
799 rinv20 = sse2_invsqrt_f(rsq20);
800 rinv21 = sse2_invsqrt_f(rsq21);
801 rinv22 = sse2_invsqrt_f(rsq22);
803 fjx0 = _mm_setzero_ps();
804 fjy0 = _mm_setzero_ps();
805 fjz0 = _mm_setzero_ps();
806 fjx1 = _mm_setzero_ps();
807 fjy1 = _mm_setzero_ps();
808 fjz1 = _mm_setzero_ps();
809 fjx2 = _mm_setzero_ps();
810 fjy2 = _mm_setzero_ps();
811 fjz2 = _mm_setzero_ps();
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
817 r00 = _mm_mul_ps(rsq00,rinv00);
818 r00 = _mm_andnot_ps(dummy_mask,r00);
820 /* Calculate table index by multiplying r with table scale and truncate to integer */
821 rt = _mm_mul_ps(r00,vftabscale);
822 vfitab = _mm_cvttps_epi32(rt);
823 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
824 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
826 /* CUBIC SPLINE TABLE ELECTROSTATICS */
827 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
828 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
829 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
830 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
831 _MM_TRANSPOSE4_PS(Y,F,G,H);
832 Heps = _mm_mul_ps(vfeps,H);
833 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
834 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
835 velec = _mm_mul_ps(qq00,VV);
836 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
837 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
839 /* CUBIC SPLINE TABLE DISPERSION */
840 vfitab = _mm_add_epi32(vfitab,ifour);
841 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
842 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
843 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
844 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
845 _MM_TRANSPOSE4_PS(Y,F,G,H);
846 Heps = _mm_mul_ps(vfeps,H);
847 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
848 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
849 vvdw6 = _mm_mul_ps(c6_00,VV);
850 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
851 fvdw6 = _mm_mul_ps(c6_00,FF);
853 /* CUBIC SPLINE TABLE REPULSION */
854 vfitab = _mm_add_epi32(vfitab,ifour);
855 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
856 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
857 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
858 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
859 _MM_TRANSPOSE4_PS(Y,F,G,H);
860 Heps = _mm_mul_ps(vfeps,H);
861 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
862 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
863 vvdw12 = _mm_mul_ps(c12_00,VV);
864 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
865 fvdw12 = _mm_mul_ps(c12_00,FF);
866 vvdw = _mm_add_ps(vvdw12,vvdw6);
867 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
869 /* Update potential sum for this i atom from the interaction with this j atom. */
870 velec = _mm_andnot_ps(dummy_mask,velec);
871 velecsum = _mm_add_ps(velecsum,velec);
872 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
873 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
875 fscal = _mm_add_ps(felec,fvdw);
877 fscal = _mm_andnot_ps(dummy_mask,fscal);
879 /* Calculate temporary vectorial force */
880 tx = _mm_mul_ps(fscal,dx00);
881 ty = _mm_mul_ps(fscal,dy00);
882 tz = _mm_mul_ps(fscal,dz00);
884 /* Update vectorial force */
885 fix0 = _mm_add_ps(fix0,tx);
886 fiy0 = _mm_add_ps(fiy0,ty);
887 fiz0 = _mm_add_ps(fiz0,tz);
889 fjx0 = _mm_add_ps(fjx0,tx);
890 fjy0 = _mm_add_ps(fjy0,ty);
891 fjz0 = _mm_add_ps(fjz0,tz);
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 r01 = _mm_mul_ps(rsq01,rinv01);
898 r01 = _mm_andnot_ps(dummy_mask,r01);
900 /* Calculate table index by multiplying r with table scale and truncate to integer */
901 rt = _mm_mul_ps(r01,vftabscale);
902 vfitab = _mm_cvttps_epi32(rt);
903 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
904 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
906 /* CUBIC SPLINE TABLE ELECTROSTATICS */
907 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
908 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
909 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
910 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
911 _MM_TRANSPOSE4_PS(Y,F,G,H);
912 Heps = _mm_mul_ps(vfeps,H);
913 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
914 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
915 velec = _mm_mul_ps(qq01,VV);
916 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
917 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm_andnot_ps(dummy_mask,velec);
921 velecsum = _mm_add_ps(velecsum,velec);
923 fscal = felec;
925 fscal = _mm_andnot_ps(dummy_mask,fscal);
927 /* Calculate temporary vectorial force */
928 tx = _mm_mul_ps(fscal,dx01);
929 ty = _mm_mul_ps(fscal,dy01);
930 tz = _mm_mul_ps(fscal,dz01);
932 /* Update vectorial force */
933 fix0 = _mm_add_ps(fix0,tx);
934 fiy0 = _mm_add_ps(fiy0,ty);
935 fiz0 = _mm_add_ps(fiz0,tz);
937 fjx1 = _mm_add_ps(fjx1,tx);
938 fjy1 = _mm_add_ps(fjy1,ty);
939 fjz1 = _mm_add_ps(fjz1,tz);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 r02 = _mm_mul_ps(rsq02,rinv02);
946 r02 = _mm_andnot_ps(dummy_mask,r02);
948 /* Calculate table index by multiplying r with table scale and truncate to integer */
949 rt = _mm_mul_ps(r02,vftabscale);
950 vfitab = _mm_cvttps_epi32(rt);
951 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
952 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
954 /* CUBIC SPLINE TABLE ELECTROSTATICS */
955 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
956 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
957 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
958 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
959 _MM_TRANSPOSE4_PS(Y,F,G,H);
960 Heps = _mm_mul_ps(vfeps,H);
961 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
962 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
963 velec = _mm_mul_ps(qq02,VV);
964 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
965 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
967 /* Update potential sum for this i atom from the interaction with this j atom. */
968 velec = _mm_andnot_ps(dummy_mask,velec);
969 velecsum = _mm_add_ps(velecsum,velec);
971 fscal = felec;
973 fscal = _mm_andnot_ps(dummy_mask,fscal);
975 /* Calculate temporary vectorial force */
976 tx = _mm_mul_ps(fscal,dx02);
977 ty = _mm_mul_ps(fscal,dy02);
978 tz = _mm_mul_ps(fscal,dz02);
980 /* Update vectorial force */
981 fix0 = _mm_add_ps(fix0,tx);
982 fiy0 = _mm_add_ps(fiy0,ty);
983 fiz0 = _mm_add_ps(fiz0,tz);
985 fjx2 = _mm_add_ps(fjx2,tx);
986 fjy2 = _mm_add_ps(fjy2,ty);
987 fjz2 = _mm_add_ps(fjz2,tz);
989 /**************************
990 * CALCULATE INTERACTIONS *
991 **************************/
993 r10 = _mm_mul_ps(rsq10,rinv10);
994 r10 = _mm_andnot_ps(dummy_mask,r10);
996 /* Calculate table index by multiplying r with table scale and truncate to integer */
997 rt = _mm_mul_ps(r10,vftabscale);
998 vfitab = _mm_cvttps_epi32(rt);
999 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1000 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1002 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1003 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1004 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1005 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1006 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1007 _MM_TRANSPOSE4_PS(Y,F,G,H);
1008 Heps = _mm_mul_ps(vfeps,H);
1009 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1010 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1011 velec = _mm_mul_ps(qq10,VV);
1012 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1013 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1015 /* Update potential sum for this i atom from the interaction with this j atom. */
1016 velec = _mm_andnot_ps(dummy_mask,velec);
1017 velecsum = _mm_add_ps(velecsum,velec);
1019 fscal = felec;
1021 fscal = _mm_andnot_ps(dummy_mask,fscal);
1023 /* Calculate temporary vectorial force */
1024 tx = _mm_mul_ps(fscal,dx10);
1025 ty = _mm_mul_ps(fscal,dy10);
1026 tz = _mm_mul_ps(fscal,dz10);
1028 /* Update vectorial force */
1029 fix1 = _mm_add_ps(fix1,tx);
1030 fiy1 = _mm_add_ps(fiy1,ty);
1031 fiz1 = _mm_add_ps(fiz1,tz);
1033 fjx0 = _mm_add_ps(fjx0,tx);
1034 fjy0 = _mm_add_ps(fjy0,ty);
1035 fjz0 = _mm_add_ps(fjz0,tz);
1037 /**************************
1038 * CALCULATE INTERACTIONS *
1039 **************************/
1041 r11 = _mm_mul_ps(rsq11,rinv11);
1042 r11 = _mm_andnot_ps(dummy_mask,r11);
1044 /* Calculate table index by multiplying r with table scale and truncate to integer */
1045 rt = _mm_mul_ps(r11,vftabscale);
1046 vfitab = _mm_cvttps_epi32(rt);
1047 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1048 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1050 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1051 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1052 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1053 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1054 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1055 _MM_TRANSPOSE4_PS(Y,F,G,H);
1056 Heps = _mm_mul_ps(vfeps,H);
1057 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1058 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1059 velec = _mm_mul_ps(qq11,VV);
1060 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1061 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1063 /* Update potential sum for this i atom from the interaction with this j atom. */
1064 velec = _mm_andnot_ps(dummy_mask,velec);
1065 velecsum = _mm_add_ps(velecsum,velec);
1067 fscal = felec;
1069 fscal = _mm_andnot_ps(dummy_mask,fscal);
1071 /* Calculate temporary vectorial force */
1072 tx = _mm_mul_ps(fscal,dx11);
1073 ty = _mm_mul_ps(fscal,dy11);
1074 tz = _mm_mul_ps(fscal,dz11);
1076 /* Update vectorial force */
1077 fix1 = _mm_add_ps(fix1,tx);
1078 fiy1 = _mm_add_ps(fiy1,ty);
1079 fiz1 = _mm_add_ps(fiz1,tz);
1081 fjx1 = _mm_add_ps(fjx1,tx);
1082 fjy1 = _mm_add_ps(fjy1,ty);
1083 fjz1 = _mm_add_ps(fjz1,tz);
1085 /**************************
1086 * CALCULATE INTERACTIONS *
1087 **************************/
1089 r12 = _mm_mul_ps(rsq12,rinv12);
1090 r12 = _mm_andnot_ps(dummy_mask,r12);
1092 /* Calculate table index by multiplying r with table scale and truncate to integer */
1093 rt = _mm_mul_ps(r12,vftabscale);
1094 vfitab = _mm_cvttps_epi32(rt);
1095 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1096 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1098 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1099 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1100 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1101 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1102 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1103 _MM_TRANSPOSE4_PS(Y,F,G,H);
1104 Heps = _mm_mul_ps(vfeps,H);
1105 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1106 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1107 velec = _mm_mul_ps(qq12,VV);
1108 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1109 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1111 /* Update potential sum for this i atom from the interaction with this j atom. */
1112 velec = _mm_andnot_ps(dummy_mask,velec);
1113 velecsum = _mm_add_ps(velecsum,velec);
1115 fscal = felec;
1117 fscal = _mm_andnot_ps(dummy_mask,fscal);
1119 /* Calculate temporary vectorial force */
1120 tx = _mm_mul_ps(fscal,dx12);
1121 ty = _mm_mul_ps(fscal,dy12);
1122 tz = _mm_mul_ps(fscal,dz12);
1124 /* Update vectorial force */
1125 fix1 = _mm_add_ps(fix1,tx);
1126 fiy1 = _mm_add_ps(fiy1,ty);
1127 fiz1 = _mm_add_ps(fiz1,tz);
1129 fjx2 = _mm_add_ps(fjx2,tx);
1130 fjy2 = _mm_add_ps(fjy2,ty);
1131 fjz2 = _mm_add_ps(fjz2,tz);
1133 /**************************
1134 * CALCULATE INTERACTIONS *
1135 **************************/
1137 r20 = _mm_mul_ps(rsq20,rinv20);
1138 r20 = _mm_andnot_ps(dummy_mask,r20);
1140 /* Calculate table index by multiplying r with table scale and truncate to integer */
1141 rt = _mm_mul_ps(r20,vftabscale);
1142 vfitab = _mm_cvttps_epi32(rt);
1143 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1144 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1146 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1147 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1148 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1149 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1150 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1151 _MM_TRANSPOSE4_PS(Y,F,G,H);
1152 Heps = _mm_mul_ps(vfeps,H);
1153 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1154 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1155 velec = _mm_mul_ps(qq20,VV);
1156 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1157 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1159 /* Update potential sum for this i atom from the interaction with this j atom. */
1160 velec = _mm_andnot_ps(dummy_mask,velec);
1161 velecsum = _mm_add_ps(velecsum,velec);
1163 fscal = felec;
1165 fscal = _mm_andnot_ps(dummy_mask,fscal);
1167 /* Calculate temporary vectorial force */
1168 tx = _mm_mul_ps(fscal,dx20);
1169 ty = _mm_mul_ps(fscal,dy20);
1170 tz = _mm_mul_ps(fscal,dz20);
1172 /* Update vectorial force */
1173 fix2 = _mm_add_ps(fix2,tx);
1174 fiy2 = _mm_add_ps(fiy2,ty);
1175 fiz2 = _mm_add_ps(fiz2,tz);
1177 fjx0 = _mm_add_ps(fjx0,tx);
1178 fjy0 = _mm_add_ps(fjy0,ty);
1179 fjz0 = _mm_add_ps(fjz0,tz);
1181 /**************************
1182 * CALCULATE INTERACTIONS *
1183 **************************/
1185 r21 = _mm_mul_ps(rsq21,rinv21);
1186 r21 = _mm_andnot_ps(dummy_mask,r21);
1188 /* Calculate table index by multiplying r with table scale and truncate to integer */
1189 rt = _mm_mul_ps(r21,vftabscale);
1190 vfitab = _mm_cvttps_epi32(rt);
1191 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1192 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1194 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1195 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1196 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1197 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1198 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1199 _MM_TRANSPOSE4_PS(Y,F,G,H);
1200 Heps = _mm_mul_ps(vfeps,H);
1201 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1202 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1203 velec = _mm_mul_ps(qq21,VV);
1204 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1205 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1207 /* Update potential sum for this i atom from the interaction with this j atom. */
1208 velec = _mm_andnot_ps(dummy_mask,velec);
1209 velecsum = _mm_add_ps(velecsum,velec);
1211 fscal = felec;
1213 fscal = _mm_andnot_ps(dummy_mask,fscal);
1215 /* Calculate temporary vectorial force */
1216 tx = _mm_mul_ps(fscal,dx21);
1217 ty = _mm_mul_ps(fscal,dy21);
1218 tz = _mm_mul_ps(fscal,dz21);
1220 /* Update vectorial force */
1221 fix2 = _mm_add_ps(fix2,tx);
1222 fiy2 = _mm_add_ps(fiy2,ty);
1223 fiz2 = _mm_add_ps(fiz2,tz);
1225 fjx1 = _mm_add_ps(fjx1,tx);
1226 fjy1 = _mm_add_ps(fjy1,ty);
1227 fjz1 = _mm_add_ps(fjz1,tz);
1229 /**************************
1230 * CALCULATE INTERACTIONS *
1231 **************************/
1233 r22 = _mm_mul_ps(rsq22,rinv22);
1234 r22 = _mm_andnot_ps(dummy_mask,r22);
1236 /* Calculate table index by multiplying r with table scale and truncate to integer */
1237 rt = _mm_mul_ps(r22,vftabscale);
1238 vfitab = _mm_cvttps_epi32(rt);
1239 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1240 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1242 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1243 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1244 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1245 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1246 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1247 _MM_TRANSPOSE4_PS(Y,F,G,H);
1248 Heps = _mm_mul_ps(vfeps,H);
1249 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1250 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1251 velec = _mm_mul_ps(qq22,VV);
1252 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1253 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1255 /* Update potential sum for this i atom from the interaction with this j atom. */
1256 velec = _mm_andnot_ps(dummy_mask,velec);
1257 velecsum = _mm_add_ps(velecsum,velec);
1259 fscal = felec;
1261 fscal = _mm_andnot_ps(dummy_mask,fscal);
1263 /* Calculate temporary vectorial force */
1264 tx = _mm_mul_ps(fscal,dx22);
1265 ty = _mm_mul_ps(fscal,dy22);
1266 tz = _mm_mul_ps(fscal,dz22);
1268 /* Update vectorial force */
1269 fix2 = _mm_add_ps(fix2,tx);
1270 fiy2 = _mm_add_ps(fiy2,ty);
1271 fiz2 = _mm_add_ps(fiz2,tz);
1273 fjx2 = _mm_add_ps(fjx2,tx);
1274 fjy2 = _mm_add_ps(fjy2,ty);
1275 fjz2 = _mm_add_ps(fjz2,tz);
1277 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1278 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1279 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1280 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1282 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1283 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1285 /* Inner loop uses 426 flops */
1288 /* End of innermost loop */
1290 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1291 f+i_coord_offset,fshift+i_shift_offset);
1293 ggid = gid[iidx];
1294 /* Update potential energies */
1295 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1296 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1298 /* Increment number of inner iterations */
1299 inneriter += j_index_end - j_index_start;
1301 /* Outer loop uses 20 flops */
1304 /* Increment number of outer iterations */
1305 outeriter += nri;
1307 /* Update outer/inner flops */
1309 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1311 /*
1312 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1313 * Electrostatics interaction: CubicSplineTable
1314 * VdW interaction: CubicSplineTable
1315 * Geometry: Water3-Water3
1316 * Calculate force/pot: Force
1317 */
1318 void
1319 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1320 (t_nblist * gmx_restrict nlist,
1321 rvec * gmx_restrict xx,
1322 rvec * gmx_restrict ff,
1323 struct t_forcerec * gmx_restrict fr,
1324 t_mdatoms * gmx_restrict mdatoms,
1325 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1326 t_nrnb * gmx_restrict nrnb)
1328 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1329 * just 0 for non-waters.
1330 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1331 * jnr indices corresponding to data put in the four positions in the SIMD register.
1333 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1334 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1335 int jnrA,jnrB,jnrC,jnrD;
1336 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1337 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1338 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1339 real rcutoff_scalar;
1340 real *shiftvec,*fshift,*x,*f;
1341 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1342 real scratch[4*DIM];
1343 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1344 int vdwioffset0;
1345 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1346 int vdwioffset1;
1347 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1348 int vdwioffset2;
1349 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1350 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1351 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1352 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1353 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1354 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1355 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1356 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1357 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1358 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1359 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1360 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1361 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1362 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1363 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1364 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1365 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1366 real *charge;
1367 int nvdwtype;
1368 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1369 int *vdwtype;
1370 real *vdwparam;
1371 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1372 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1373 __m128i vfitab;
1374 __m128i ifour = _mm_set1_epi32(4);
1375 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1376 real *vftab;
1377 __m128 dummy_mask,cutoff_mask;
1378 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1379 __m128 one = _mm_set1_ps(1.0);
1380 __m128 two = _mm_set1_ps(2.0);
1381 x = xx[0];
1382 f = ff[0];
1384 nri = nlist->nri;
1385 iinr = nlist->iinr;
1386 jindex = nlist->jindex;
1387 jjnr = nlist->jjnr;
1388 shiftidx = nlist->shift;
1389 gid = nlist->gid;
1390 shiftvec = fr->shift_vec[0];
1391 fshift = fr->fshift[0];
1392 facel = _mm_set1_ps(fr->ic->epsfac);
1393 charge = mdatoms->chargeA;
1394 nvdwtype = fr->ntype;
1395 vdwparam = fr->nbfp;
1396 vdwtype = mdatoms->typeA;
1398 vftab = kernel_data->table_elec_vdw->data;
1399 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1401 /* Setup water-specific parameters */
1402 inr = nlist->iinr[0];
1403 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1404 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1405 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1406 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1408 jq0 = _mm_set1_ps(charge[inr+0]);
1409 jq1 = _mm_set1_ps(charge[inr+1]);
1410 jq2 = _mm_set1_ps(charge[inr+2]);
1411 vdwjidx0A = 2*vdwtype[inr+0];
1412 qq00 = _mm_mul_ps(iq0,jq0);
1413 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1414 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1415 qq01 = _mm_mul_ps(iq0,jq1);
1416 qq02 = _mm_mul_ps(iq0,jq2);
1417 qq10 = _mm_mul_ps(iq1,jq0);
1418 qq11 = _mm_mul_ps(iq1,jq1);
1419 qq12 = _mm_mul_ps(iq1,jq2);
1420 qq20 = _mm_mul_ps(iq2,jq0);
1421 qq21 = _mm_mul_ps(iq2,jq1);
1422 qq22 = _mm_mul_ps(iq2,jq2);
1424 /* Avoid stupid compiler warnings */
1425 jnrA = jnrB = jnrC = jnrD = 0;
1426 j_coord_offsetA = 0;
1427 j_coord_offsetB = 0;
1428 j_coord_offsetC = 0;
1429 j_coord_offsetD = 0;
1431 outeriter = 0;
1432 inneriter = 0;
1434 for(iidx=0;iidx<4*DIM;iidx++)
1436 scratch[iidx] = 0.0;
1439 /* Start outer loop over neighborlists */
1440 for(iidx=0; iidx<nri; iidx++)
1442 /* Load shift vector for this list */
1443 i_shift_offset = DIM*shiftidx[iidx];
1445 /* Load limits for loop over neighbors */
1446 j_index_start = jindex[iidx];
1447 j_index_end = jindex[iidx+1];
1449 /* Get outer coordinate index */
1450 inr = iinr[iidx];
1451 i_coord_offset = DIM*inr;
1453 /* Load i particle coords and add shift vector */
1454 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1455 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1457 fix0 = _mm_setzero_ps();
1458 fiy0 = _mm_setzero_ps();
1459 fiz0 = _mm_setzero_ps();
1460 fix1 = _mm_setzero_ps();
1461 fiy1 = _mm_setzero_ps();
1462 fiz1 = _mm_setzero_ps();
1463 fix2 = _mm_setzero_ps();
1464 fiy2 = _mm_setzero_ps();
1465 fiz2 = _mm_setzero_ps();
1467 /* Start inner kernel loop */
1468 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1471 /* Get j neighbor index, and coordinate index */
1472 jnrA = jjnr[jidx];
1473 jnrB = jjnr[jidx+1];
1474 jnrC = jjnr[jidx+2];
1475 jnrD = jjnr[jidx+3];
1476 j_coord_offsetA = DIM*jnrA;
1477 j_coord_offsetB = DIM*jnrB;
1478 j_coord_offsetC = DIM*jnrC;
1479 j_coord_offsetD = DIM*jnrD;
1481 /* load j atom coordinates */
1482 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1483 x+j_coord_offsetC,x+j_coord_offsetD,
1484 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1486 /* Calculate displacement vector */
1487 dx00 = _mm_sub_ps(ix0,jx0);
1488 dy00 = _mm_sub_ps(iy0,jy0);
1489 dz00 = _mm_sub_ps(iz0,jz0);
1490 dx01 = _mm_sub_ps(ix0,jx1);
1491 dy01 = _mm_sub_ps(iy0,jy1);
1492 dz01 = _mm_sub_ps(iz0,jz1);
1493 dx02 = _mm_sub_ps(ix0,jx2);
1494 dy02 = _mm_sub_ps(iy0,jy2);
1495 dz02 = _mm_sub_ps(iz0,jz2);
1496 dx10 = _mm_sub_ps(ix1,jx0);
1497 dy10 = _mm_sub_ps(iy1,jy0);
1498 dz10 = _mm_sub_ps(iz1,jz0);
1499 dx11 = _mm_sub_ps(ix1,jx1);
1500 dy11 = _mm_sub_ps(iy1,jy1);
1501 dz11 = _mm_sub_ps(iz1,jz1);
1502 dx12 = _mm_sub_ps(ix1,jx2);
1503 dy12 = _mm_sub_ps(iy1,jy2);
1504 dz12 = _mm_sub_ps(iz1,jz2);
1505 dx20 = _mm_sub_ps(ix2,jx0);
1506 dy20 = _mm_sub_ps(iy2,jy0);
1507 dz20 = _mm_sub_ps(iz2,jz0);
1508 dx21 = _mm_sub_ps(ix2,jx1);
1509 dy21 = _mm_sub_ps(iy2,jy1);
1510 dz21 = _mm_sub_ps(iz2,jz1);
1511 dx22 = _mm_sub_ps(ix2,jx2);
1512 dy22 = _mm_sub_ps(iy2,jy2);
1513 dz22 = _mm_sub_ps(iz2,jz2);
1515 /* Calculate squared distance and things based on it */
1516 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1517 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1518 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1519 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1520 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1521 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1522 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1523 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1524 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1526 rinv00 = sse2_invsqrt_f(rsq00);
1527 rinv01 = sse2_invsqrt_f(rsq01);
1528 rinv02 = sse2_invsqrt_f(rsq02);
1529 rinv10 = sse2_invsqrt_f(rsq10);
1530 rinv11 = sse2_invsqrt_f(rsq11);
1531 rinv12 = sse2_invsqrt_f(rsq12);
1532 rinv20 = sse2_invsqrt_f(rsq20);
1533 rinv21 = sse2_invsqrt_f(rsq21);
1534 rinv22 = sse2_invsqrt_f(rsq22);
1536 fjx0 = _mm_setzero_ps();
1537 fjy0 = _mm_setzero_ps();
1538 fjz0 = _mm_setzero_ps();
1539 fjx1 = _mm_setzero_ps();
1540 fjy1 = _mm_setzero_ps();
1541 fjz1 = _mm_setzero_ps();
1542 fjx2 = _mm_setzero_ps();
1543 fjy2 = _mm_setzero_ps();
1544 fjz2 = _mm_setzero_ps();
1546 /**************************
1547 * CALCULATE INTERACTIONS *
1548 **************************/
1550 r00 = _mm_mul_ps(rsq00,rinv00);
1552 /* Calculate table index by multiplying r with table scale and truncate to integer */
1553 rt = _mm_mul_ps(r00,vftabscale);
1554 vfitab = _mm_cvttps_epi32(rt);
1555 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1556 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1558 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1559 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1560 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1561 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1562 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1563 _MM_TRANSPOSE4_PS(Y,F,G,H);
1564 Heps = _mm_mul_ps(vfeps,H);
1565 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1566 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1567 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1569 /* CUBIC SPLINE TABLE DISPERSION */
1570 vfitab = _mm_add_epi32(vfitab,ifour);
1571 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1572 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1573 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1574 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1575 _MM_TRANSPOSE4_PS(Y,F,G,H);
1576 Heps = _mm_mul_ps(vfeps,H);
1577 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1578 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1579 fvdw6 = _mm_mul_ps(c6_00,FF);
1581 /* CUBIC SPLINE TABLE REPULSION */
1582 vfitab = _mm_add_epi32(vfitab,ifour);
1583 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1584 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1585 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1586 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1587 _MM_TRANSPOSE4_PS(Y,F,G,H);
1588 Heps = _mm_mul_ps(vfeps,H);
1589 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1590 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1591 fvdw12 = _mm_mul_ps(c12_00,FF);
1592 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1594 fscal = _mm_add_ps(felec,fvdw);
1596 /* Calculate temporary vectorial force */
1597 tx = _mm_mul_ps(fscal,dx00);
1598 ty = _mm_mul_ps(fscal,dy00);
1599 tz = _mm_mul_ps(fscal,dz00);
1601 /* Update vectorial force */
1602 fix0 = _mm_add_ps(fix0,tx);
1603 fiy0 = _mm_add_ps(fiy0,ty);
1604 fiz0 = _mm_add_ps(fiz0,tz);
1606 fjx0 = _mm_add_ps(fjx0,tx);
1607 fjy0 = _mm_add_ps(fjy0,ty);
1608 fjz0 = _mm_add_ps(fjz0,tz);
1610 /**************************
1611 * CALCULATE INTERACTIONS *
1612 **************************/
1614 r01 = _mm_mul_ps(rsq01,rinv01);
1616 /* Calculate table index by multiplying r with table scale and truncate to integer */
1617 rt = _mm_mul_ps(r01,vftabscale);
1618 vfitab = _mm_cvttps_epi32(rt);
1619 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1620 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1622 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1623 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1624 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1625 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1626 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1627 _MM_TRANSPOSE4_PS(Y,F,G,H);
1628 Heps = _mm_mul_ps(vfeps,H);
1629 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1630 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1631 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1633 fscal = felec;
1635 /* Calculate temporary vectorial force */
1636 tx = _mm_mul_ps(fscal,dx01);
1637 ty = _mm_mul_ps(fscal,dy01);
1638 tz = _mm_mul_ps(fscal,dz01);
1640 /* Update vectorial force */
1641 fix0 = _mm_add_ps(fix0,tx);
1642 fiy0 = _mm_add_ps(fiy0,ty);
1643 fiz0 = _mm_add_ps(fiz0,tz);
1645 fjx1 = _mm_add_ps(fjx1,tx);
1646 fjy1 = _mm_add_ps(fjy1,ty);
1647 fjz1 = _mm_add_ps(fjz1,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r02 = _mm_mul_ps(rsq02,rinv02);
1655 /* Calculate table index by multiplying r with table scale and truncate to integer */
1656 rt = _mm_mul_ps(r02,vftabscale);
1657 vfitab = _mm_cvttps_epi32(rt);
1658 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1659 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1661 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1662 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1663 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1664 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1665 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1666 _MM_TRANSPOSE4_PS(Y,F,G,H);
1667 Heps = _mm_mul_ps(vfeps,H);
1668 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1669 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1670 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1672 fscal = felec;
1674 /* Calculate temporary vectorial force */
1675 tx = _mm_mul_ps(fscal,dx02);
1676 ty = _mm_mul_ps(fscal,dy02);
1677 tz = _mm_mul_ps(fscal,dz02);
1679 /* Update vectorial force */
1680 fix0 = _mm_add_ps(fix0,tx);
1681 fiy0 = _mm_add_ps(fiy0,ty);
1682 fiz0 = _mm_add_ps(fiz0,tz);
1684 fjx2 = _mm_add_ps(fjx2,tx);
1685 fjy2 = _mm_add_ps(fjy2,ty);
1686 fjz2 = _mm_add_ps(fjz2,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 r10 = _mm_mul_ps(rsq10,rinv10);
1694 /* Calculate table index by multiplying r with table scale and truncate to integer */
1695 rt = _mm_mul_ps(r10,vftabscale);
1696 vfitab = _mm_cvttps_epi32(rt);
1697 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1698 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1700 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1701 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1702 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1703 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1704 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1705 _MM_TRANSPOSE4_PS(Y,F,G,H);
1706 Heps = _mm_mul_ps(vfeps,H);
1707 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1708 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1709 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1711 fscal = felec;
1713 /* Calculate temporary vectorial force */
1714 tx = _mm_mul_ps(fscal,dx10);
1715 ty = _mm_mul_ps(fscal,dy10);
1716 tz = _mm_mul_ps(fscal,dz10);
1718 /* Update vectorial force */
1719 fix1 = _mm_add_ps(fix1,tx);
1720 fiy1 = _mm_add_ps(fiy1,ty);
1721 fiz1 = _mm_add_ps(fiz1,tz);
1723 fjx0 = _mm_add_ps(fjx0,tx);
1724 fjy0 = _mm_add_ps(fjy0,ty);
1725 fjz0 = _mm_add_ps(fjz0,tz);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r11 = _mm_mul_ps(rsq11,rinv11);
1733 /* Calculate table index by multiplying r with table scale and truncate to integer */
1734 rt = _mm_mul_ps(r11,vftabscale);
1735 vfitab = _mm_cvttps_epi32(rt);
1736 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1737 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1739 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1740 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1741 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1742 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1743 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1744 _MM_TRANSPOSE4_PS(Y,F,G,H);
1745 Heps = _mm_mul_ps(vfeps,H);
1746 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1747 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1748 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1750 fscal = felec;
1752 /* Calculate temporary vectorial force */
1753 tx = _mm_mul_ps(fscal,dx11);
1754 ty = _mm_mul_ps(fscal,dy11);
1755 tz = _mm_mul_ps(fscal,dz11);
1757 /* Update vectorial force */
1758 fix1 = _mm_add_ps(fix1,tx);
1759 fiy1 = _mm_add_ps(fiy1,ty);
1760 fiz1 = _mm_add_ps(fiz1,tz);
1762 fjx1 = _mm_add_ps(fjx1,tx);
1763 fjy1 = _mm_add_ps(fjy1,ty);
1764 fjz1 = _mm_add_ps(fjz1,tz);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 r12 = _mm_mul_ps(rsq12,rinv12);
1772 /* Calculate table index by multiplying r with table scale and truncate to integer */
1773 rt = _mm_mul_ps(r12,vftabscale);
1774 vfitab = _mm_cvttps_epi32(rt);
1775 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1776 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1778 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1779 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1780 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1781 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1782 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1783 _MM_TRANSPOSE4_PS(Y,F,G,H);
1784 Heps = _mm_mul_ps(vfeps,H);
1785 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1786 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1787 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1789 fscal = felec;
1791 /* Calculate temporary vectorial force */
1792 tx = _mm_mul_ps(fscal,dx12);
1793 ty = _mm_mul_ps(fscal,dy12);
1794 tz = _mm_mul_ps(fscal,dz12);
1796 /* Update vectorial force */
1797 fix1 = _mm_add_ps(fix1,tx);
1798 fiy1 = _mm_add_ps(fiy1,ty);
1799 fiz1 = _mm_add_ps(fiz1,tz);
1801 fjx2 = _mm_add_ps(fjx2,tx);
1802 fjy2 = _mm_add_ps(fjy2,ty);
1803 fjz2 = _mm_add_ps(fjz2,tz);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 r20 = _mm_mul_ps(rsq20,rinv20);
1811 /* Calculate table index by multiplying r with table scale and truncate to integer */
1812 rt = _mm_mul_ps(r20,vftabscale);
1813 vfitab = _mm_cvttps_epi32(rt);
1814 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1815 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1817 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1818 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1819 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1820 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1821 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1822 _MM_TRANSPOSE4_PS(Y,F,G,H);
1823 Heps = _mm_mul_ps(vfeps,H);
1824 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1825 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1826 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1828 fscal = felec;
1830 /* Calculate temporary vectorial force */
1831 tx = _mm_mul_ps(fscal,dx20);
1832 ty = _mm_mul_ps(fscal,dy20);
1833 tz = _mm_mul_ps(fscal,dz20);
1835 /* Update vectorial force */
1836 fix2 = _mm_add_ps(fix2,tx);
1837 fiy2 = _mm_add_ps(fiy2,ty);
1838 fiz2 = _mm_add_ps(fiz2,tz);
1840 fjx0 = _mm_add_ps(fjx0,tx);
1841 fjy0 = _mm_add_ps(fjy0,ty);
1842 fjz0 = _mm_add_ps(fjz0,tz);
1844 /**************************
1845 * CALCULATE INTERACTIONS *
1846 **************************/
1848 r21 = _mm_mul_ps(rsq21,rinv21);
1850 /* Calculate table index by multiplying r with table scale and truncate to integer */
1851 rt = _mm_mul_ps(r21,vftabscale);
1852 vfitab = _mm_cvttps_epi32(rt);
1853 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1854 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1856 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1857 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1858 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1859 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1860 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1861 _MM_TRANSPOSE4_PS(Y,F,G,H);
1862 Heps = _mm_mul_ps(vfeps,H);
1863 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1864 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1865 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1867 fscal = felec;
1869 /* Calculate temporary vectorial force */
1870 tx = _mm_mul_ps(fscal,dx21);
1871 ty = _mm_mul_ps(fscal,dy21);
1872 tz = _mm_mul_ps(fscal,dz21);
1874 /* Update vectorial force */
1875 fix2 = _mm_add_ps(fix2,tx);
1876 fiy2 = _mm_add_ps(fiy2,ty);
1877 fiz2 = _mm_add_ps(fiz2,tz);
1879 fjx1 = _mm_add_ps(fjx1,tx);
1880 fjy1 = _mm_add_ps(fjy1,ty);
1881 fjz1 = _mm_add_ps(fjz1,tz);
1883 /**************************
1884 * CALCULATE INTERACTIONS *
1885 **************************/
1887 r22 = _mm_mul_ps(rsq22,rinv22);
1889 /* Calculate table index by multiplying r with table scale and truncate to integer */
1890 rt = _mm_mul_ps(r22,vftabscale);
1891 vfitab = _mm_cvttps_epi32(rt);
1892 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1893 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1895 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1896 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1897 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1898 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1899 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1900 _MM_TRANSPOSE4_PS(Y,F,G,H);
1901 Heps = _mm_mul_ps(vfeps,H);
1902 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1903 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1904 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1906 fscal = felec;
1908 /* Calculate temporary vectorial force */
1909 tx = _mm_mul_ps(fscal,dx22);
1910 ty = _mm_mul_ps(fscal,dy22);
1911 tz = _mm_mul_ps(fscal,dz22);
1913 /* Update vectorial force */
1914 fix2 = _mm_add_ps(fix2,tx);
1915 fiy2 = _mm_add_ps(fiy2,ty);
1916 fiz2 = _mm_add_ps(fiz2,tz);
1918 fjx2 = _mm_add_ps(fjx2,tx);
1919 fjy2 = _mm_add_ps(fjy2,ty);
1920 fjz2 = _mm_add_ps(fjz2,tz);
1922 fjptrA = f+j_coord_offsetA;
1923 fjptrB = f+j_coord_offsetB;
1924 fjptrC = f+j_coord_offsetC;
1925 fjptrD = f+j_coord_offsetD;
1927 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1928 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1930 /* Inner loop uses 373 flops */
1933 if(jidx<j_index_end)
1936 /* Get j neighbor index, and coordinate index */
1937 jnrlistA = jjnr[jidx];
1938 jnrlistB = jjnr[jidx+1];
1939 jnrlistC = jjnr[jidx+2];
1940 jnrlistD = jjnr[jidx+3];
1941 /* Sign of each element will be negative for non-real atoms.
1942 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1943 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries. */
1945 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1946 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1947 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1948 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1949 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1950 j_coord_offsetA = DIM*jnrA;
1951 j_coord_offsetB = DIM*jnrB;
1952 j_coord_offsetC = DIM*jnrC;
1953 j_coord_offsetD = DIM*jnrD;
1955 /* load j atom coordinates */
1956 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1957 x+j_coord_offsetC,x+j_coord_offsetD,
1958 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1960 /* Calculate displacement vector */
1961 dx00 = _mm_sub_ps(ix0,jx0);
1962 dy00 = _mm_sub_ps(iy0,jy0);
1963 dz00 = _mm_sub_ps(iz0,jz0);
1964 dx01 = _mm_sub_ps(ix0,jx1);
1965 dy01 = _mm_sub_ps(iy0,jy1);
1966 dz01 = _mm_sub_ps(iz0,jz1);
1967 dx02 = _mm_sub_ps(ix0,jx2);
1968 dy02 = _mm_sub_ps(iy0,jy2);
1969 dz02 = _mm_sub_ps(iz0,jz2);
1970 dx10 = _mm_sub_ps(ix1,jx0);
1971 dy10 = _mm_sub_ps(iy1,jy0);
1972 dz10 = _mm_sub_ps(iz1,jz0);
1973 dx11 = _mm_sub_ps(ix1,jx1);
1974 dy11 = _mm_sub_ps(iy1,jy1);
1975 dz11 = _mm_sub_ps(iz1,jz1);
1976 dx12 = _mm_sub_ps(ix1,jx2);
1977 dy12 = _mm_sub_ps(iy1,jy2);
1978 dz12 = _mm_sub_ps(iz1,jz2);
1979 dx20 = _mm_sub_ps(ix2,jx0);
1980 dy20 = _mm_sub_ps(iy2,jy0);
1981 dz20 = _mm_sub_ps(iz2,jz0);
1982 dx21 = _mm_sub_ps(ix2,jx1);
1983 dy21 = _mm_sub_ps(iy2,jy1);
1984 dz21 = _mm_sub_ps(iz2,jz1);
1985 dx22 = _mm_sub_ps(ix2,jx2);
1986 dy22 = _mm_sub_ps(iy2,jy2);
1987 dz22 = _mm_sub_ps(iz2,jz2);
1989 /* Calculate squared distance and things based on it */
1990 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1991 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1992 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1993 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1994 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1995 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1996 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1997 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1998 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2000 rinv00 = sse2_invsqrt_f(rsq00);
2001 rinv01 = sse2_invsqrt_f(rsq01);
2002 rinv02 = sse2_invsqrt_f(rsq02);
2003 rinv10 = sse2_invsqrt_f(rsq10);
2004 rinv11 = sse2_invsqrt_f(rsq11);
2005 rinv12 = sse2_invsqrt_f(rsq12);
2006 rinv20 = sse2_invsqrt_f(rsq20);
2007 rinv21 = sse2_invsqrt_f(rsq21);
2008 rinv22 = sse2_invsqrt_f(rsq22);
2010 fjx0 = _mm_setzero_ps();
2011 fjy0 = _mm_setzero_ps();
2012 fjz0 = _mm_setzero_ps();
2013 fjx1 = _mm_setzero_ps();
2014 fjy1 = _mm_setzero_ps();
2015 fjz1 = _mm_setzero_ps();
2016 fjx2 = _mm_setzero_ps();
2017 fjy2 = _mm_setzero_ps();
2018 fjz2 = _mm_setzero_ps();
2020 /**************************
2021 * CALCULATE INTERACTIONS *
2022 **************************/
2024 r00 = _mm_mul_ps(rsq00,rinv00);
2025 r00 = _mm_andnot_ps(dummy_mask,r00);
2027 /* Calculate table index by multiplying r with table scale and truncate to integer */
2028 rt = _mm_mul_ps(r00,vftabscale);
2029 vfitab = _mm_cvttps_epi32(rt);
2030 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2031 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2033 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2034 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2035 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2036 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2037 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2038 _MM_TRANSPOSE4_PS(Y,F,G,H);
2039 Heps = _mm_mul_ps(vfeps,H);
2040 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2041 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2042 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2044 /* CUBIC SPLINE TABLE DISPERSION */
2045 vfitab = _mm_add_epi32(vfitab,ifour);
2046 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2047 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2048 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2049 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2050 _MM_TRANSPOSE4_PS(Y,F,G,H);
2051 Heps = _mm_mul_ps(vfeps,H);
2052 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2053 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2054 fvdw6 = _mm_mul_ps(c6_00,FF);
2056 /* CUBIC SPLINE TABLE REPULSION */
2057 vfitab = _mm_add_epi32(vfitab,ifour);
2058 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2059 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2060 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2061 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2062 _MM_TRANSPOSE4_PS(Y,F,G,H);
2063 Heps = _mm_mul_ps(vfeps,H);
2064 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2065 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2066 fvdw12 = _mm_mul_ps(c12_00,FF);
2067 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2069 fscal = _mm_add_ps(felec,fvdw);
2071 fscal = _mm_andnot_ps(dummy_mask,fscal);
2073 /* Calculate temporary vectorial force */
2074 tx = _mm_mul_ps(fscal,dx00);
2075 ty = _mm_mul_ps(fscal,dy00);
2076 tz = _mm_mul_ps(fscal,dz00);
2078 /* Update vectorial force */
2079 fix0 = _mm_add_ps(fix0,tx);
2080 fiy0 = _mm_add_ps(fiy0,ty);
2081 fiz0 = _mm_add_ps(fiz0,tz);
2083 fjx0 = _mm_add_ps(fjx0,tx);
2084 fjy0 = _mm_add_ps(fjy0,ty);
2085 fjz0 = _mm_add_ps(fjz0,tz);
2087 /**************************
2088 * CALCULATE INTERACTIONS *
2089 **************************/
2091 r01 = _mm_mul_ps(rsq01,rinv01);
2092 r01 = _mm_andnot_ps(dummy_mask,r01);
2094 /* Calculate table index by multiplying r with table scale and truncate to integer */
2095 rt = _mm_mul_ps(r01,vftabscale);
2096 vfitab = _mm_cvttps_epi32(rt);
2097 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2098 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2101 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2102 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2103 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2104 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2105 _MM_TRANSPOSE4_PS(Y,F,G,H);
2106 Heps = _mm_mul_ps(vfeps,H);
2107 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2108 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2109 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2111 fscal = felec;
2113 fscal = _mm_andnot_ps(dummy_mask,fscal);
2115 /* Calculate temporary vectorial force */
2116 tx = _mm_mul_ps(fscal,dx01);
2117 ty = _mm_mul_ps(fscal,dy01);
2118 tz = _mm_mul_ps(fscal,dz01);
2120 /* Update vectorial force */
2121 fix0 = _mm_add_ps(fix0,tx);
2122 fiy0 = _mm_add_ps(fiy0,ty);
2123 fiz0 = _mm_add_ps(fiz0,tz);
2125 fjx1 = _mm_add_ps(fjx1,tx);
2126 fjy1 = _mm_add_ps(fjy1,ty);
2127 fjz1 = _mm_add_ps(fjz1,tz);
2129 /**************************
2130 * CALCULATE INTERACTIONS *
2131 **************************/
2133 r02 = _mm_mul_ps(rsq02,rinv02);
2134 r02 = _mm_andnot_ps(dummy_mask,r02);
2136 /* Calculate table index by multiplying r with table scale and truncate to integer */
2137 rt = _mm_mul_ps(r02,vftabscale);
2138 vfitab = _mm_cvttps_epi32(rt);
2139 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2140 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2142 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2143 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2144 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2145 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2146 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2147 _MM_TRANSPOSE4_PS(Y,F,G,H);
2148 Heps = _mm_mul_ps(vfeps,H);
2149 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2150 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2151 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2153 fscal = felec;
2155 fscal = _mm_andnot_ps(dummy_mask,fscal);
2157 /* Calculate temporary vectorial force */
2158 tx = _mm_mul_ps(fscal,dx02);
2159 ty = _mm_mul_ps(fscal,dy02);
2160 tz = _mm_mul_ps(fscal,dz02);
2162 /* Update vectorial force */
2163 fix0 = _mm_add_ps(fix0,tx);
2164 fiy0 = _mm_add_ps(fiy0,ty);
2165 fiz0 = _mm_add_ps(fiz0,tz);
2167 fjx2 = _mm_add_ps(fjx2,tx);
2168 fjy2 = _mm_add_ps(fjy2,ty);
2169 fjz2 = _mm_add_ps(fjz2,tz);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 r10 = _mm_mul_ps(rsq10,rinv10);
2176 r10 = _mm_andnot_ps(dummy_mask,r10);
2178 /* Calculate table index by multiplying r with table scale and truncate to integer */
2179 rt = _mm_mul_ps(r10,vftabscale);
2180 vfitab = _mm_cvttps_epi32(rt);
2181 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2182 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2184 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2185 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2186 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2187 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2188 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2189 _MM_TRANSPOSE4_PS(Y,F,G,H);
2190 Heps = _mm_mul_ps(vfeps,H);
2191 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2192 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2193 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2195 fscal = felec;
2197 fscal = _mm_andnot_ps(dummy_mask,fscal);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_ps(fscal,dx10);
2201 ty = _mm_mul_ps(fscal,dy10);
2202 tz = _mm_mul_ps(fscal,dz10);
2204 /* Update vectorial force */
2205 fix1 = _mm_add_ps(fix1,tx);
2206 fiy1 = _mm_add_ps(fiy1,ty);
2207 fiz1 = _mm_add_ps(fiz1,tz);
2209 fjx0 = _mm_add_ps(fjx0,tx);
2210 fjy0 = _mm_add_ps(fjy0,ty);
2211 fjz0 = _mm_add_ps(fjz0,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 r11 = _mm_mul_ps(rsq11,rinv11);
2218 r11 = _mm_andnot_ps(dummy_mask,r11);
2220 /* Calculate table index by multiplying r with table scale and truncate to integer */
2221 rt = _mm_mul_ps(r11,vftabscale);
2222 vfitab = _mm_cvttps_epi32(rt);
2223 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2224 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2226 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2227 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2228 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2229 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2230 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2231 _MM_TRANSPOSE4_PS(Y,F,G,H);
2232 Heps = _mm_mul_ps(vfeps,H);
2233 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2234 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2235 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2237 fscal = felec;
2239 fscal = _mm_andnot_ps(dummy_mask,fscal);
2241 /* Calculate temporary vectorial force */
2242 tx = _mm_mul_ps(fscal,dx11);
2243 ty = _mm_mul_ps(fscal,dy11);
2244 tz = _mm_mul_ps(fscal,dz11);
2246 /* Update vectorial force */
2247 fix1 = _mm_add_ps(fix1,tx);
2248 fiy1 = _mm_add_ps(fiy1,ty);
2249 fiz1 = _mm_add_ps(fiz1,tz);
2251 fjx1 = _mm_add_ps(fjx1,tx);
2252 fjy1 = _mm_add_ps(fjy1,ty);
2253 fjz1 = _mm_add_ps(fjz1,tz);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 r12 = _mm_mul_ps(rsq12,rinv12);
2260 r12 = _mm_andnot_ps(dummy_mask,r12);
2262 /* Calculate table index by multiplying r with table scale and truncate to integer */
2263 rt = _mm_mul_ps(r12,vftabscale);
2264 vfitab = _mm_cvttps_epi32(rt);
2265 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2266 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2268 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2269 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2270 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2271 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2272 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2273 _MM_TRANSPOSE4_PS(Y,F,G,H);
2274 Heps = _mm_mul_ps(vfeps,H);
2275 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2276 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2277 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2279 fscal = felec;
2281 fscal = _mm_andnot_ps(dummy_mask,fscal);
2283 /* Calculate temporary vectorial force */
2284 tx = _mm_mul_ps(fscal,dx12);
2285 ty = _mm_mul_ps(fscal,dy12);
2286 tz = _mm_mul_ps(fscal,dz12);
2288 /* Update vectorial force */
2289 fix1 = _mm_add_ps(fix1,tx);
2290 fiy1 = _mm_add_ps(fiy1,ty);
2291 fiz1 = _mm_add_ps(fiz1,tz);
2293 fjx2 = _mm_add_ps(fjx2,tx);
2294 fjy2 = _mm_add_ps(fjy2,ty);
2295 fjz2 = _mm_add_ps(fjz2,tz);
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 r20 = _mm_mul_ps(rsq20,rinv20);
2302 r20 = _mm_andnot_ps(dummy_mask,r20);
2304 /* Calculate table index by multiplying r with table scale and truncate to integer */
2305 rt = _mm_mul_ps(r20,vftabscale);
2306 vfitab = _mm_cvttps_epi32(rt);
2307 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2308 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2310 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2311 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2312 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2313 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2314 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2315 _MM_TRANSPOSE4_PS(Y,F,G,H);
2316 Heps = _mm_mul_ps(vfeps,H);
2317 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2318 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2319 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2321 fscal = felec;
2323 fscal = _mm_andnot_ps(dummy_mask,fscal);
2325 /* Calculate temporary vectorial force */
2326 tx = _mm_mul_ps(fscal,dx20);
2327 ty = _mm_mul_ps(fscal,dy20);
2328 tz = _mm_mul_ps(fscal,dz20);
2330 /* Update vectorial force */
2331 fix2 = _mm_add_ps(fix2,tx);
2332 fiy2 = _mm_add_ps(fiy2,ty);
2333 fiz2 = _mm_add_ps(fiz2,tz);
2335 fjx0 = _mm_add_ps(fjx0,tx);
2336 fjy0 = _mm_add_ps(fjy0,ty);
2337 fjz0 = _mm_add_ps(fjz0,tz);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 r21 = _mm_mul_ps(rsq21,rinv21);
2344 r21 = _mm_andnot_ps(dummy_mask,r21);
2346 /* Calculate table index by multiplying r with table scale and truncate to integer */
2347 rt = _mm_mul_ps(r21,vftabscale);
2348 vfitab = _mm_cvttps_epi32(rt);
2349 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2350 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2352 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2353 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2354 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2355 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2356 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2357 _MM_TRANSPOSE4_PS(Y,F,G,H);
2358 Heps = _mm_mul_ps(vfeps,H);
2359 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2360 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2361 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2363 fscal = felec;
2365 fscal = _mm_andnot_ps(dummy_mask,fscal);
2367 /* Calculate temporary vectorial force */
2368 tx = _mm_mul_ps(fscal,dx21);
2369 ty = _mm_mul_ps(fscal,dy21);
2370 tz = _mm_mul_ps(fscal,dz21);
2372 /* Update vectorial force */
2373 fix2 = _mm_add_ps(fix2,tx);
2374 fiy2 = _mm_add_ps(fiy2,ty);
2375 fiz2 = _mm_add_ps(fiz2,tz);
2377 fjx1 = _mm_add_ps(fjx1,tx);
2378 fjy1 = _mm_add_ps(fjy1,ty);
2379 fjz1 = _mm_add_ps(fjz1,tz);
2381 /**************************
2382 * CALCULATE INTERACTIONS *
2383 **************************/
2385 r22 = _mm_mul_ps(rsq22,rinv22);
2386 r22 = _mm_andnot_ps(dummy_mask,r22);
2388 /* Calculate table index by multiplying r with table scale and truncate to integer */
2389 rt = _mm_mul_ps(r22,vftabscale);
2390 vfitab = _mm_cvttps_epi32(rt);
2391 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2392 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2394 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2395 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2396 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2397 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2398 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2399 _MM_TRANSPOSE4_PS(Y,F,G,H);
2400 Heps = _mm_mul_ps(vfeps,H);
2401 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2402 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2403 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2405 fscal = felec;
2407 fscal = _mm_andnot_ps(dummy_mask,fscal);
2409 /* Calculate temporary vectorial force */
2410 tx = _mm_mul_ps(fscal,dx22);
2411 ty = _mm_mul_ps(fscal,dy22);
2412 tz = _mm_mul_ps(fscal,dz22);
2414 /* Update vectorial force */
2415 fix2 = _mm_add_ps(fix2,tx);
2416 fiy2 = _mm_add_ps(fiy2,ty);
2417 fiz2 = _mm_add_ps(fiz2,tz);
2419 fjx2 = _mm_add_ps(fjx2,tx);
2420 fjy2 = _mm_add_ps(fjy2,ty);
2421 fjz2 = _mm_add_ps(fjz2,tz);
2423 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2424 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2425 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2426 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2428 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2429 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2431 /* Inner loop uses 382 flops */
2434 /* End of innermost loop */
2436 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2437 f+i_coord_offset,fshift+i_shift_offset);
2439 /* Increment number of inner iterations */
2440 inneriter += j_index_end - j_index_start;
2442 /* Outer loop uses 18 flops */
2445 /* Increment number of outer iterations */
2446 outeriter += nri;
2448 /* Update outer/inner flops */
2450 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);