Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_single.c
blob ef68da9b6aad17bbcc90ea232432acc25109689e
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: CubicSplineTable
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB,jnrC,jnrD;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real rcutoff_scalar;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 real scratch[4*DIM];
83 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 int vdwioffset0;
85 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 int vdwioffset1;
87 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 int vdwioffset2;
89 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 real *charge;
107 int nvdwtype;
108 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 int *vdwtype;
110 real *vdwparam;
111 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
112 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128i vfitab;
114 __m128i ifour = _mm_set1_epi32(4);
115 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116 real *vftab;
117 __m128 dummy_mask,cutoff_mask;
118 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119 __m128 one = _mm_set1_ps(1.0);
120 __m128 two = _mm_set1_ps(2.0);
121 x = xx[0];
122 f = ff[0];
124 nri = nlist->nri;
125 iinr = nlist->iinr;
126 jindex = nlist->jindex;
127 jjnr = nlist->jjnr;
128 shiftidx = nlist->shift;
129 gid = nlist->gid;
130 shiftvec = fr->shift_vec[0];
131 fshift = fr->fshift[0];
132 facel = _mm_set1_ps(fr->epsfac);
133 charge = mdatoms->chargeA;
134 nvdwtype = fr->ntype;
135 vdwparam = fr->nbfp;
136 vdwtype = mdatoms->typeA;
138 vftab = kernel_data->table_elec_vdw->data;
139 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
141 /* Setup water-specific parameters */
142 inr = nlist->iinr[0];
143 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
144 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
145 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
146 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
148 jq0 = _mm_set1_ps(charge[inr+0]);
149 jq1 = _mm_set1_ps(charge[inr+1]);
150 jq2 = _mm_set1_ps(charge[inr+2]);
151 vdwjidx0A = 2*vdwtype[inr+0];
152 qq00 = _mm_mul_ps(iq0,jq0);
153 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
154 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
155 qq01 = _mm_mul_ps(iq0,jq1);
156 qq02 = _mm_mul_ps(iq0,jq2);
157 qq10 = _mm_mul_ps(iq1,jq0);
158 qq11 = _mm_mul_ps(iq1,jq1);
159 qq12 = _mm_mul_ps(iq1,jq2);
160 qq20 = _mm_mul_ps(iq2,jq0);
161 qq21 = _mm_mul_ps(iq2,jq1);
162 qq22 = _mm_mul_ps(iq2,jq2);
164 /* Avoid stupid compiler warnings */
165 jnrA = jnrB = jnrC = jnrD = 0;
166 j_coord_offsetA = 0;
167 j_coord_offsetB = 0;
168 j_coord_offsetC = 0;
169 j_coord_offsetD = 0;
171 outeriter = 0;
172 inneriter = 0;
174 for(iidx=0;iidx<4*DIM;iidx++)
176 scratch[iidx] = 0.0;
179 /* Start outer loop over neighborlists */
180 for(iidx=0; iidx<nri; iidx++)
182 /* Load shift vector for this list */
183 i_shift_offset = DIM*shiftidx[iidx];
185 /* Load limits for loop over neighbors */
186 j_index_start = jindex[iidx];
187 j_index_end = jindex[iidx+1];
189 /* Get outer coordinate index */
190 inr = iinr[iidx];
191 i_coord_offset = DIM*inr;
193 /* Load i particle coords and add shift vector */
194 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
195 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197 fix0 = _mm_setzero_ps();
198 fiy0 = _mm_setzero_ps();
199 fiz0 = _mm_setzero_ps();
200 fix1 = _mm_setzero_ps();
201 fiy1 = _mm_setzero_ps();
202 fiz1 = _mm_setzero_ps();
203 fix2 = _mm_setzero_ps();
204 fiy2 = _mm_setzero_ps();
205 fiz2 = _mm_setzero_ps();
207 /* Reset potential sums */
208 velecsum = _mm_setzero_ps();
209 vvdwsum = _mm_setzero_ps();
211 /* Start inner kernel loop */
212 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
215 /* Get j neighbor index, and coordinate index */
216 jnrA = jjnr[jidx];
217 jnrB = jjnr[jidx+1];
218 jnrC = jjnr[jidx+2];
219 jnrD = jjnr[jidx+3];
220 j_coord_offsetA = DIM*jnrA;
221 j_coord_offsetB = DIM*jnrB;
222 j_coord_offsetC = DIM*jnrC;
223 j_coord_offsetD = DIM*jnrD;
225 /* load j atom coordinates */
226 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
227 x+j_coord_offsetC,x+j_coord_offsetD,
228 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx01 = _mm_sub_ps(ix0,jx1);
235 dy01 = _mm_sub_ps(iy0,jy1);
236 dz01 = _mm_sub_ps(iz0,jz1);
237 dx02 = _mm_sub_ps(ix0,jx2);
238 dy02 = _mm_sub_ps(iy0,jy2);
239 dz02 = _mm_sub_ps(iz0,jz2);
240 dx10 = _mm_sub_ps(ix1,jx0);
241 dy10 = _mm_sub_ps(iy1,jy0);
242 dz10 = _mm_sub_ps(iz1,jz0);
243 dx11 = _mm_sub_ps(ix1,jx1);
244 dy11 = _mm_sub_ps(iy1,jy1);
245 dz11 = _mm_sub_ps(iz1,jz1);
246 dx12 = _mm_sub_ps(ix1,jx2);
247 dy12 = _mm_sub_ps(iy1,jy2);
248 dz12 = _mm_sub_ps(iz1,jz2);
249 dx20 = _mm_sub_ps(ix2,jx0);
250 dy20 = _mm_sub_ps(iy2,jy0);
251 dz20 = _mm_sub_ps(iz2,jz0);
252 dx21 = _mm_sub_ps(ix2,jx1);
253 dy21 = _mm_sub_ps(iy2,jy1);
254 dz21 = _mm_sub_ps(iz2,jz1);
255 dx22 = _mm_sub_ps(ix2,jx2);
256 dy22 = _mm_sub_ps(iy2,jy2);
257 dz22 = _mm_sub_ps(iz2,jz2);
259 /* Calculate squared distance and things based on it */
260 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
261 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
262 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
263 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270 rinv00 = gmx_mm_invsqrt_ps(rsq00);
271 rinv01 = gmx_mm_invsqrt_ps(rsq01);
272 rinv02 = gmx_mm_invsqrt_ps(rsq02);
273 rinv10 = gmx_mm_invsqrt_ps(rsq10);
274 rinv11 = gmx_mm_invsqrt_ps(rsq11);
275 rinv12 = gmx_mm_invsqrt_ps(rsq12);
276 rinv20 = gmx_mm_invsqrt_ps(rsq20);
277 rinv21 = gmx_mm_invsqrt_ps(rsq21);
278 rinv22 = gmx_mm_invsqrt_ps(rsq22);
280 fjx0 = _mm_setzero_ps();
281 fjy0 = _mm_setzero_ps();
282 fjz0 = _mm_setzero_ps();
283 fjx1 = _mm_setzero_ps();
284 fjy1 = _mm_setzero_ps();
285 fjz1 = _mm_setzero_ps();
286 fjx2 = _mm_setzero_ps();
287 fjy2 = _mm_setzero_ps();
288 fjz2 = _mm_setzero_ps();
290 /**************************
291 * CALCULATE INTERACTIONS *
292 **************************/
294 r00 = _mm_mul_ps(rsq00,rinv00);
296 /* Calculate table index by multiplying r with table scale and truncate to integer */
297 rt = _mm_mul_ps(r00,vftabscale);
298 vfitab = _mm_cvttps_epi32(rt);
299 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
300 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
302 /* CUBIC SPLINE TABLE ELECTROSTATICS */
303 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
304 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
305 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
306 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
307 _MM_TRANSPOSE4_PS(Y,F,G,H);
308 Heps = _mm_mul_ps(vfeps,H);
309 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
310 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
311 velec = _mm_mul_ps(qq00,VV);
312 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
313 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
315 /* CUBIC SPLINE TABLE DISPERSION */
316 vfitab = _mm_add_epi32(vfitab,ifour);
317 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
318 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
319 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
320 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
321 _MM_TRANSPOSE4_PS(Y,F,G,H);
322 Heps = _mm_mul_ps(vfeps,H);
323 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
324 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
325 vvdw6 = _mm_mul_ps(c6_00,VV);
326 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
327 fvdw6 = _mm_mul_ps(c6_00,FF);
329 /* CUBIC SPLINE TABLE REPULSION */
330 vfitab = _mm_add_epi32(vfitab,ifour);
331 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
332 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
333 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
334 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
335 _MM_TRANSPOSE4_PS(Y,F,G,H);
336 Heps = _mm_mul_ps(vfeps,H);
337 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
338 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
339 vvdw12 = _mm_mul_ps(c12_00,VV);
340 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
341 fvdw12 = _mm_mul_ps(c12_00,FF);
342 vvdw = _mm_add_ps(vvdw12,vvdw6);
343 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velecsum = _mm_add_ps(velecsum,velec);
347 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
349 fscal = _mm_add_ps(felec,fvdw);
351 /* Calculate temporary vectorial force */
352 tx = _mm_mul_ps(fscal,dx00);
353 ty = _mm_mul_ps(fscal,dy00);
354 tz = _mm_mul_ps(fscal,dz00);
356 /* Update vectorial force */
357 fix0 = _mm_add_ps(fix0,tx);
358 fiy0 = _mm_add_ps(fiy0,ty);
359 fiz0 = _mm_add_ps(fiz0,tz);
361 fjx0 = _mm_add_ps(fjx0,tx);
362 fjy0 = _mm_add_ps(fjy0,ty);
363 fjz0 = _mm_add_ps(fjz0,tz);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 r01 = _mm_mul_ps(rsq01,rinv01);
371 /* Calculate table index by multiplying r with table scale and truncate to integer */
372 rt = _mm_mul_ps(r01,vftabscale);
373 vfitab = _mm_cvttps_epi32(rt);
374 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
375 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
377 /* CUBIC SPLINE TABLE ELECTROSTATICS */
378 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
379 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
380 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
381 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
382 _MM_TRANSPOSE4_PS(Y,F,G,H);
383 Heps = _mm_mul_ps(vfeps,H);
384 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
385 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
386 velec = _mm_mul_ps(qq01,VV);
387 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
388 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velecsum = _mm_add_ps(velecsum,velec);
393 fscal = felec;
395 /* Calculate temporary vectorial force */
396 tx = _mm_mul_ps(fscal,dx01);
397 ty = _mm_mul_ps(fscal,dy01);
398 tz = _mm_mul_ps(fscal,dz01);
400 /* Update vectorial force */
401 fix0 = _mm_add_ps(fix0,tx);
402 fiy0 = _mm_add_ps(fiy0,ty);
403 fiz0 = _mm_add_ps(fiz0,tz);
405 fjx1 = _mm_add_ps(fjx1,tx);
406 fjy1 = _mm_add_ps(fjy1,ty);
407 fjz1 = _mm_add_ps(fjz1,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r02 = _mm_mul_ps(rsq02,rinv02);
415 /* Calculate table index by multiplying r with table scale and truncate to integer */
416 rt = _mm_mul_ps(r02,vftabscale);
417 vfitab = _mm_cvttps_epi32(rt);
418 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
419 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
421 /* CUBIC SPLINE TABLE ELECTROSTATICS */
422 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
423 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
424 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
425 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
426 _MM_TRANSPOSE4_PS(Y,F,G,H);
427 Heps = _mm_mul_ps(vfeps,H);
428 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
429 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
430 velec = _mm_mul_ps(qq02,VV);
431 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
432 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velecsum = _mm_add_ps(velecsum,velec);
437 fscal = felec;
439 /* Calculate temporary vectorial force */
440 tx = _mm_mul_ps(fscal,dx02);
441 ty = _mm_mul_ps(fscal,dy02);
442 tz = _mm_mul_ps(fscal,dz02);
444 /* Update vectorial force */
445 fix0 = _mm_add_ps(fix0,tx);
446 fiy0 = _mm_add_ps(fiy0,ty);
447 fiz0 = _mm_add_ps(fiz0,tz);
449 fjx2 = _mm_add_ps(fjx2,tx);
450 fjy2 = _mm_add_ps(fjy2,ty);
451 fjz2 = _mm_add_ps(fjz2,tz);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 r10 = _mm_mul_ps(rsq10,rinv10);
459 /* Calculate table index by multiplying r with table scale and truncate to integer */
460 rt = _mm_mul_ps(r10,vftabscale);
461 vfitab = _mm_cvttps_epi32(rt);
462 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
463 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
465 /* CUBIC SPLINE TABLE ELECTROSTATICS */
466 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
467 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
468 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
469 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
470 _MM_TRANSPOSE4_PS(Y,F,G,H);
471 Heps = _mm_mul_ps(vfeps,H);
472 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
473 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
474 velec = _mm_mul_ps(qq10,VV);
475 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
476 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velecsum = _mm_add_ps(velecsum,velec);
481 fscal = felec;
483 /* Calculate temporary vectorial force */
484 tx = _mm_mul_ps(fscal,dx10);
485 ty = _mm_mul_ps(fscal,dy10);
486 tz = _mm_mul_ps(fscal,dz10);
488 /* Update vectorial force */
489 fix1 = _mm_add_ps(fix1,tx);
490 fiy1 = _mm_add_ps(fiy1,ty);
491 fiz1 = _mm_add_ps(fiz1,tz);
493 fjx0 = _mm_add_ps(fjx0,tx);
494 fjy0 = _mm_add_ps(fjy0,ty);
495 fjz0 = _mm_add_ps(fjz0,tz);
497 /**************************
498 * CALCULATE INTERACTIONS *
499 **************************/
501 r11 = _mm_mul_ps(rsq11,rinv11);
503 /* Calculate table index by multiplying r with table scale and truncate to integer */
504 rt = _mm_mul_ps(r11,vftabscale);
505 vfitab = _mm_cvttps_epi32(rt);
506 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
507 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
509 /* CUBIC SPLINE TABLE ELECTROSTATICS */
510 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
511 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
512 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
513 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
514 _MM_TRANSPOSE4_PS(Y,F,G,H);
515 Heps = _mm_mul_ps(vfeps,H);
516 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
517 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
518 velec = _mm_mul_ps(qq11,VV);
519 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
520 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum = _mm_add_ps(velecsum,velec);
525 fscal = felec;
527 /* Calculate temporary vectorial force */
528 tx = _mm_mul_ps(fscal,dx11);
529 ty = _mm_mul_ps(fscal,dy11);
530 tz = _mm_mul_ps(fscal,dz11);
532 /* Update vectorial force */
533 fix1 = _mm_add_ps(fix1,tx);
534 fiy1 = _mm_add_ps(fiy1,ty);
535 fiz1 = _mm_add_ps(fiz1,tz);
537 fjx1 = _mm_add_ps(fjx1,tx);
538 fjy1 = _mm_add_ps(fjy1,ty);
539 fjz1 = _mm_add_ps(fjz1,tz);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 r12 = _mm_mul_ps(rsq12,rinv12);
547 /* Calculate table index by multiplying r with table scale and truncate to integer */
548 rt = _mm_mul_ps(r12,vftabscale);
549 vfitab = _mm_cvttps_epi32(rt);
550 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
551 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
553 /* CUBIC SPLINE TABLE ELECTROSTATICS */
554 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
555 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
556 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
557 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
558 _MM_TRANSPOSE4_PS(Y,F,G,H);
559 Heps = _mm_mul_ps(vfeps,H);
560 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
561 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
562 velec = _mm_mul_ps(qq12,VV);
563 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
564 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
566 /* Update potential sum for this i atom from the interaction with this j atom. */
567 velecsum = _mm_add_ps(velecsum,velec);
569 fscal = felec;
571 /* Calculate temporary vectorial force */
572 tx = _mm_mul_ps(fscal,dx12);
573 ty = _mm_mul_ps(fscal,dy12);
574 tz = _mm_mul_ps(fscal,dz12);
576 /* Update vectorial force */
577 fix1 = _mm_add_ps(fix1,tx);
578 fiy1 = _mm_add_ps(fiy1,ty);
579 fiz1 = _mm_add_ps(fiz1,tz);
581 fjx2 = _mm_add_ps(fjx2,tx);
582 fjy2 = _mm_add_ps(fjy2,ty);
583 fjz2 = _mm_add_ps(fjz2,tz);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 r20 = _mm_mul_ps(rsq20,rinv20);
591 /* Calculate table index by multiplying r with table scale and truncate to integer */
592 rt = _mm_mul_ps(r20,vftabscale);
593 vfitab = _mm_cvttps_epi32(rt);
594 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
595 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
597 /* CUBIC SPLINE TABLE ELECTROSTATICS */
598 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
599 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
600 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
601 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
602 _MM_TRANSPOSE4_PS(Y,F,G,H);
603 Heps = _mm_mul_ps(vfeps,H);
604 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
605 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
606 velec = _mm_mul_ps(qq20,VV);
607 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
608 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
610 /* Update potential sum for this i atom from the interaction with this j atom. */
611 velecsum = _mm_add_ps(velecsum,velec);
613 fscal = felec;
615 /* Calculate temporary vectorial force */
616 tx = _mm_mul_ps(fscal,dx20);
617 ty = _mm_mul_ps(fscal,dy20);
618 tz = _mm_mul_ps(fscal,dz20);
620 /* Update vectorial force */
621 fix2 = _mm_add_ps(fix2,tx);
622 fiy2 = _mm_add_ps(fiy2,ty);
623 fiz2 = _mm_add_ps(fiz2,tz);
625 fjx0 = _mm_add_ps(fjx0,tx);
626 fjy0 = _mm_add_ps(fjy0,ty);
627 fjz0 = _mm_add_ps(fjz0,tz);
629 /**************************
630 * CALCULATE INTERACTIONS *
631 **************************/
633 r21 = _mm_mul_ps(rsq21,rinv21);
635 /* Calculate table index by multiplying r with table scale and truncate to integer */
636 rt = _mm_mul_ps(r21,vftabscale);
637 vfitab = _mm_cvttps_epi32(rt);
638 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
639 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
641 /* CUBIC SPLINE TABLE ELECTROSTATICS */
642 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
643 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
644 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
645 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
646 _MM_TRANSPOSE4_PS(Y,F,G,H);
647 Heps = _mm_mul_ps(vfeps,H);
648 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
649 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
650 velec = _mm_mul_ps(qq21,VV);
651 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
652 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
654 /* Update potential sum for this i atom from the interaction with this j atom. */
655 velecsum = _mm_add_ps(velecsum,velec);
657 fscal = felec;
659 /* Calculate temporary vectorial force */
660 tx = _mm_mul_ps(fscal,dx21);
661 ty = _mm_mul_ps(fscal,dy21);
662 tz = _mm_mul_ps(fscal,dz21);
664 /* Update vectorial force */
665 fix2 = _mm_add_ps(fix2,tx);
666 fiy2 = _mm_add_ps(fiy2,ty);
667 fiz2 = _mm_add_ps(fiz2,tz);
669 fjx1 = _mm_add_ps(fjx1,tx);
670 fjy1 = _mm_add_ps(fjy1,ty);
671 fjz1 = _mm_add_ps(fjz1,tz);
673 /**************************
674 * CALCULATE INTERACTIONS *
675 **************************/
677 r22 = _mm_mul_ps(rsq22,rinv22);
679 /* Calculate table index by multiplying r with table scale and truncate to integer */
680 rt = _mm_mul_ps(r22,vftabscale);
681 vfitab = _mm_cvttps_epi32(rt);
682 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
683 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
685 /* CUBIC SPLINE TABLE ELECTROSTATICS */
686 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
687 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
688 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
689 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
690 _MM_TRANSPOSE4_PS(Y,F,G,H);
691 Heps = _mm_mul_ps(vfeps,H);
692 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
693 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
694 velec = _mm_mul_ps(qq22,VV);
695 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
696 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
698 /* Update potential sum for this i atom from the interaction with this j atom. */
699 velecsum = _mm_add_ps(velecsum,velec);
701 fscal = felec;
703 /* Calculate temporary vectorial force */
704 tx = _mm_mul_ps(fscal,dx22);
705 ty = _mm_mul_ps(fscal,dy22);
706 tz = _mm_mul_ps(fscal,dz22);
708 /* Update vectorial force */
709 fix2 = _mm_add_ps(fix2,tx);
710 fiy2 = _mm_add_ps(fiy2,ty);
711 fiz2 = _mm_add_ps(fiz2,tz);
713 fjx2 = _mm_add_ps(fjx2,tx);
714 fjy2 = _mm_add_ps(fjy2,ty);
715 fjz2 = _mm_add_ps(fjz2,tz);
717 fjptrA = f+j_coord_offsetA;
718 fjptrB = f+j_coord_offsetB;
719 fjptrC = f+j_coord_offsetC;
720 fjptrD = f+j_coord_offsetD;
722 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
723 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
725 /* Inner loop uses 417 flops */
728 if(jidx<j_index_end)
731 /* Get j neighbor index, and coordinate index */
732 jnrlistA = jjnr[jidx];
733 jnrlistB = jjnr[jidx+1];
734 jnrlistC = jjnr[jidx+2];
735 jnrlistD = jjnr[jidx+3];
736 /* Sign of each element will be negative for non-real atoms.
737 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
738 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
740 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
741 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
742 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
743 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
744 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
745 j_coord_offsetA = DIM*jnrA;
746 j_coord_offsetB = DIM*jnrB;
747 j_coord_offsetC = DIM*jnrC;
748 j_coord_offsetD = DIM*jnrD;
750 /* load j atom coordinates */
751 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
752 x+j_coord_offsetC,x+j_coord_offsetD,
753 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
755 /* Calculate displacement vector */
756 dx00 = _mm_sub_ps(ix0,jx0);
757 dy00 = _mm_sub_ps(iy0,jy0);
758 dz00 = _mm_sub_ps(iz0,jz0);
759 dx01 = _mm_sub_ps(ix0,jx1);
760 dy01 = _mm_sub_ps(iy0,jy1);
761 dz01 = _mm_sub_ps(iz0,jz1);
762 dx02 = _mm_sub_ps(ix0,jx2);
763 dy02 = _mm_sub_ps(iy0,jy2);
764 dz02 = _mm_sub_ps(iz0,jz2);
765 dx10 = _mm_sub_ps(ix1,jx0);
766 dy10 = _mm_sub_ps(iy1,jy0);
767 dz10 = _mm_sub_ps(iz1,jz0);
768 dx11 = _mm_sub_ps(ix1,jx1);
769 dy11 = _mm_sub_ps(iy1,jy1);
770 dz11 = _mm_sub_ps(iz1,jz1);
771 dx12 = _mm_sub_ps(ix1,jx2);
772 dy12 = _mm_sub_ps(iy1,jy2);
773 dz12 = _mm_sub_ps(iz1,jz2);
774 dx20 = _mm_sub_ps(ix2,jx0);
775 dy20 = _mm_sub_ps(iy2,jy0);
776 dz20 = _mm_sub_ps(iz2,jz0);
777 dx21 = _mm_sub_ps(ix2,jx1);
778 dy21 = _mm_sub_ps(iy2,jy1);
779 dz21 = _mm_sub_ps(iz2,jz1);
780 dx22 = _mm_sub_ps(ix2,jx2);
781 dy22 = _mm_sub_ps(iy2,jy2);
782 dz22 = _mm_sub_ps(iz2,jz2);
784 /* Calculate squared distance and things based on it */
785 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
786 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
787 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
788 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
789 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
790 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
791 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
792 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
793 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
795 rinv00 = gmx_mm_invsqrt_ps(rsq00);
796 rinv01 = gmx_mm_invsqrt_ps(rsq01);
797 rinv02 = gmx_mm_invsqrt_ps(rsq02);
798 rinv10 = gmx_mm_invsqrt_ps(rsq10);
799 rinv11 = gmx_mm_invsqrt_ps(rsq11);
800 rinv12 = gmx_mm_invsqrt_ps(rsq12);
801 rinv20 = gmx_mm_invsqrt_ps(rsq20);
802 rinv21 = gmx_mm_invsqrt_ps(rsq21);
803 rinv22 = gmx_mm_invsqrt_ps(rsq22);
805 fjx0 = _mm_setzero_ps();
806 fjy0 = _mm_setzero_ps();
807 fjz0 = _mm_setzero_ps();
808 fjx1 = _mm_setzero_ps();
809 fjy1 = _mm_setzero_ps();
810 fjz1 = _mm_setzero_ps();
811 fjx2 = _mm_setzero_ps();
812 fjy2 = _mm_setzero_ps();
813 fjz2 = _mm_setzero_ps();
815 /**************************
816 * CALCULATE INTERACTIONS *
817 **************************/
819 r00 = _mm_mul_ps(rsq00,rinv00);
820 r00 = _mm_andnot_ps(dummy_mask,r00);
822 /* Calculate table index by multiplying r with table scale and truncate to integer */
823 rt = _mm_mul_ps(r00,vftabscale);
824 vfitab = _mm_cvttps_epi32(rt);
825 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
826 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
828 /* CUBIC SPLINE TABLE ELECTROSTATICS */
829 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
830 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
831 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
832 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
833 _MM_TRANSPOSE4_PS(Y,F,G,H);
834 Heps = _mm_mul_ps(vfeps,H);
835 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
836 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
837 velec = _mm_mul_ps(qq00,VV);
838 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
839 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
841 /* CUBIC SPLINE TABLE DISPERSION */
842 vfitab = _mm_add_epi32(vfitab,ifour);
843 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
844 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
845 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
846 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
847 _MM_TRANSPOSE4_PS(Y,F,G,H);
848 Heps = _mm_mul_ps(vfeps,H);
849 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
850 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
851 vvdw6 = _mm_mul_ps(c6_00,VV);
852 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
853 fvdw6 = _mm_mul_ps(c6_00,FF);
855 /* CUBIC SPLINE TABLE REPULSION */
856 vfitab = _mm_add_epi32(vfitab,ifour);
857 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
858 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
859 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
860 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
861 _MM_TRANSPOSE4_PS(Y,F,G,H);
862 Heps = _mm_mul_ps(vfeps,H);
863 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
864 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
865 vvdw12 = _mm_mul_ps(c12_00,VV);
866 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
867 fvdw12 = _mm_mul_ps(c12_00,FF);
868 vvdw = _mm_add_ps(vvdw12,vvdw6);
869 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
871 /* Update potential sum for this i atom from the interaction with this j atom. */
872 velec = _mm_andnot_ps(dummy_mask,velec);
873 velecsum = _mm_add_ps(velecsum,velec);
874 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
875 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
877 fscal = _mm_add_ps(felec,fvdw);
879 fscal = _mm_andnot_ps(dummy_mask,fscal);
881 /* Calculate temporary vectorial force */
882 tx = _mm_mul_ps(fscal,dx00);
883 ty = _mm_mul_ps(fscal,dy00);
884 tz = _mm_mul_ps(fscal,dz00);
886 /* Update vectorial force */
887 fix0 = _mm_add_ps(fix0,tx);
888 fiy0 = _mm_add_ps(fiy0,ty);
889 fiz0 = _mm_add_ps(fiz0,tz);
891 fjx0 = _mm_add_ps(fjx0,tx);
892 fjy0 = _mm_add_ps(fjy0,ty);
893 fjz0 = _mm_add_ps(fjz0,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 r01 = _mm_mul_ps(rsq01,rinv01);
900 r01 = _mm_andnot_ps(dummy_mask,r01);
902 /* Calculate table index by multiplying r with table scale and truncate to integer */
903 rt = _mm_mul_ps(r01,vftabscale);
904 vfitab = _mm_cvttps_epi32(rt);
905 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
906 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
908 /* CUBIC SPLINE TABLE ELECTROSTATICS */
909 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
910 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
911 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
912 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
913 _MM_TRANSPOSE4_PS(Y,F,G,H);
914 Heps = _mm_mul_ps(vfeps,H);
915 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
916 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
917 velec = _mm_mul_ps(qq01,VV);
918 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
919 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
921 /* Update potential sum for this i atom from the interaction with this j atom. */
922 velec = _mm_andnot_ps(dummy_mask,velec);
923 velecsum = _mm_add_ps(velecsum,velec);
925 fscal = felec;
927 fscal = _mm_andnot_ps(dummy_mask,fscal);
929 /* Calculate temporary vectorial force */
930 tx = _mm_mul_ps(fscal,dx01);
931 ty = _mm_mul_ps(fscal,dy01);
932 tz = _mm_mul_ps(fscal,dz01);
934 /* Update vectorial force */
935 fix0 = _mm_add_ps(fix0,tx);
936 fiy0 = _mm_add_ps(fiy0,ty);
937 fiz0 = _mm_add_ps(fiz0,tz);
939 fjx1 = _mm_add_ps(fjx1,tx);
940 fjy1 = _mm_add_ps(fjy1,ty);
941 fjz1 = _mm_add_ps(fjz1,tz);
943 /**************************
944 * CALCULATE INTERACTIONS *
945 **************************/
947 r02 = _mm_mul_ps(rsq02,rinv02);
948 r02 = _mm_andnot_ps(dummy_mask,r02);
950 /* Calculate table index by multiplying r with table scale and truncate to integer */
951 rt = _mm_mul_ps(r02,vftabscale);
952 vfitab = _mm_cvttps_epi32(rt);
953 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
954 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
956 /* CUBIC SPLINE TABLE ELECTROSTATICS */
957 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
958 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
959 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
960 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
961 _MM_TRANSPOSE4_PS(Y,F,G,H);
962 Heps = _mm_mul_ps(vfeps,H);
963 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
964 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
965 velec = _mm_mul_ps(qq02,VV);
966 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
967 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm_andnot_ps(dummy_mask,velec);
971 velecsum = _mm_add_ps(velecsum,velec);
973 fscal = felec;
975 fscal = _mm_andnot_ps(dummy_mask,fscal);
977 /* Calculate temporary vectorial force */
978 tx = _mm_mul_ps(fscal,dx02);
979 ty = _mm_mul_ps(fscal,dy02);
980 tz = _mm_mul_ps(fscal,dz02);
982 /* Update vectorial force */
983 fix0 = _mm_add_ps(fix0,tx);
984 fiy0 = _mm_add_ps(fiy0,ty);
985 fiz0 = _mm_add_ps(fiz0,tz);
987 fjx2 = _mm_add_ps(fjx2,tx);
988 fjy2 = _mm_add_ps(fjy2,ty);
989 fjz2 = _mm_add_ps(fjz2,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 r10 = _mm_mul_ps(rsq10,rinv10);
996 r10 = _mm_andnot_ps(dummy_mask,r10);
998 /* Calculate table index by multiplying r with table scale and truncate to integer */
999 rt = _mm_mul_ps(r10,vftabscale);
1000 vfitab = _mm_cvttps_epi32(rt);
1001 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1002 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1004 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1005 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1006 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1007 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1008 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1009 _MM_TRANSPOSE4_PS(Y,F,G,H);
1010 Heps = _mm_mul_ps(vfeps,H);
1011 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1012 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1013 velec = _mm_mul_ps(qq10,VV);
1014 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1015 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1017 /* Update potential sum for this i atom from the interaction with this j atom. */
1018 velec = _mm_andnot_ps(dummy_mask,velec);
1019 velecsum = _mm_add_ps(velecsum,velec);
1021 fscal = felec;
1023 fscal = _mm_andnot_ps(dummy_mask,fscal);
1025 /* Calculate temporary vectorial force */
1026 tx = _mm_mul_ps(fscal,dx10);
1027 ty = _mm_mul_ps(fscal,dy10);
1028 tz = _mm_mul_ps(fscal,dz10);
1030 /* Update vectorial force */
1031 fix1 = _mm_add_ps(fix1,tx);
1032 fiy1 = _mm_add_ps(fiy1,ty);
1033 fiz1 = _mm_add_ps(fiz1,tz);
1035 fjx0 = _mm_add_ps(fjx0,tx);
1036 fjy0 = _mm_add_ps(fjy0,ty);
1037 fjz0 = _mm_add_ps(fjz0,tz);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r11 = _mm_mul_ps(rsq11,rinv11);
1044 r11 = _mm_andnot_ps(dummy_mask,r11);
1046 /* Calculate table index by multiplying r with table scale and truncate to integer */
1047 rt = _mm_mul_ps(r11,vftabscale);
1048 vfitab = _mm_cvttps_epi32(rt);
1049 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1050 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1052 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1054 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1055 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1056 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1057 _MM_TRANSPOSE4_PS(Y,F,G,H);
1058 Heps = _mm_mul_ps(vfeps,H);
1059 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1060 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1061 velec = _mm_mul_ps(qq11,VV);
1062 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1063 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm_andnot_ps(dummy_mask,velec);
1067 velecsum = _mm_add_ps(velecsum,velec);
1069 fscal = felec;
1071 fscal = _mm_andnot_ps(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm_mul_ps(fscal,dx11);
1075 ty = _mm_mul_ps(fscal,dy11);
1076 tz = _mm_mul_ps(fscal,dz11);
1078 /* Update vectorial force */
1079 fix1 = _mm_add_ps(fix1,tx);
1080 fiy1 = _mm_add_ps(fiy1,ty);
1081 fiz1 = _mm_add_ps(fiz1,tz);
1083 fjx1 = _mm_add_ps(fjx1,tx);
1084 fjy1 = _mm_add_ps(fjy1,ty);
1085 fjz1 = _mm_add_ps(fjz1,tz);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 r12 = _mm_mul_ps(rsq12,rinv12);
1092 r12 = _mm_andnot_ps(dummy_mask,r12);
1094 /* Calculate table index by multiplying r with table scale and truncate to integer */
1095 rt = _mm_mul_ps(r12,vftabscale);
1096 vfitab = _mm_cvttps_epi32(rt);
1097 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1098 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1100 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1102 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1103 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1104 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1105 _MM_TRANSPOSE4_PS(Y,F,G,H);
1106 Heps = _mm_mul_ps(vfeps,H);
1107 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1108 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1109 velec = _mm_mul_ps(qq12,VV);
1110 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1111 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1113 /* Update potential sum for this i atom from the interaction with this j atom. */
1114 velec = _mm_andnot_ps(dummy_mask,velec);
1115 velecsum = _mm_add_ps(velecsum,velec);
1117 fscal = felec;
1119 fscal = _mm_andnot_ps(dummy_mask,fscal);
1121 /* Calculate temporary vectorial force */
1122 tx = _mm_mul_ps(fscal,dx12);
1123 ty = _mm_mul_ps(fscal,dy12);
1124 tz = _mm_mul_ps(fscal,dz12);
1126 /* Update vectorial force */
1127 fix1 = _mm_add_ps(fix1,tx);
1128 fiy1 = _mm_add_ps(fiy1,ty);
1129 fiz1 = _mm_add_ps(fiz1,tz);
1131 fjx2 = _mm_add_ps(fjx2,tx);
1132 fjy2 = _mm_add_ps(fjy2,ty);
1133 fjz2 = _mm_add_ps(fjz2,tz);
1135 /**************************
1136 * CALCULATE INTERACTIONS *
1137 **************************/
1139 r20 = _mm_mul_ps(rsq20,rinv20);
1140 r20 = _mm_andnot_ps(dummy_mask,r20);
1142 /* Calculate table index by multiplying r with table scale and truncate to integer */
1143 rt = _mm_mul_ps(r20,vftabscale);
1144 vfitab = _mm_cvttps_epi32(rt);
1145 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1146 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1148 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1150 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1151 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1152 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1153 _MM_TRANSPOSE4_PS(Y,F,G,H);
1154 Heps = _mm_mul_ps(vfeps,H);
1155 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1156 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1157 velec = _mm_mul_ps(qq20,VV);
1158 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1159 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1161 /* Update potential sum for this i atom from the interaction with this j atom. */
1162 velec = _mm_andnot_ps(dummy_mask,velec);
1163 velecsum = _mm_add_ps(velecsum,velec);
1165 fscal = felec;
1167 fscal = _mm_andnot_ps(dummy_mask,fscal);
1169 /* Calculate temporary vectorial force */
1170 tx = _mm_mul_ps(fscal,dx20);
1171 ty = _mm_mul_ps(fscal,dy20);
1172 tz = _mm_mul_ps(fscal,dz20);
1174 /* Update vectorial force */
1175 fix2 = _mm_add_ps(fix2,tx);
1176 fiy2 = _mm_add_ps(fiy2,ty);
1177 fiz2 = _mm_add_ps(fiz2,tz);
1179 fjx0 = _mm_add_ps(fjx0,tx);
1180 fjy0 = _mm_add_ps(fjy0,ty);
1181 fjz0 = _mm_add_ps(fjz0,tz);
1183 /**************************
1184 * CALCULATE INTERACTIONS *
1185 **************************/
1187 r21 = _mm_mul_ps(rsq21,rinv21);
1188 r21 = _mm_andnot_ps(dummy_mask,r21);
1190 /* Calculate table index by multiplying r with table scale and truncate to integer */
1191 rt = _mm_mul_ps(r21,vftabscale);
1192 vfitab = _mm_cvttps_epi32(rt);
1193 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1194 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1198 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1199 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1200 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1201 _MM_TRANSPOSE4_PS(Y,F,G,H);
1202 Heps = _mm_mul_ps(vfeps,H);
1203 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1204 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1205 velec = _mm_mul_ps(qq21,VV);
1206 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1207 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1209 /* Update potential sum for this i atom from the interaction with this j atom. */
1210 velec = _mm_andnot_ps(dummy_mask,velec);
1211 velecsum = _mm_add_ps(velecsum,velec);
1213 fscal = felec;
1215 fscal = _mm_andnot_ps(dummy_mask,fscal);
1217 /* Calculate temporary vectorial force */
1218 tx = _mm_mul_ps(fscal,dx21);
1219 ty = _mm_mul_ps(fscal,dy21);
1220 tz = _mm_mul_ps(fscal,dz21);
1222 /* Update vectorial force */
1223 fix2 = _mm_add_ps(fix2,tx);
1224 fiy2 = _mm_add_ps(fiy2,ty);
1225 fiz2 = _mm_add_ps(fiz2,tz);
1227 fjx1 = _mm_add_ps(fjx1,tx);
1228 fjy1 = _mm_add_ps(fjy1,ty);
1229 fjz1 = _mm_add_ps(fjz1,tz);
1231 /**************************
1232 * CALCULATE INTERACTIONS *
1233 **************************/
1235 r22 = _mm_mul_ps(rsq22,rinv22);
1236 r22 = _mm_andnot_ps(dummy_mask,r22);
1238 /* Calculate table index by multiplying r with table scale and truncate to integer */
1239 rt = _mm_mul_ps(r22,vftabscale);
1240 vfitab = _mm_cvttps_epi32(rt);
1241 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1242 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1244 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1245 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1246 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1247 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1248 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1249 _MM_TRANSPOSE4_PS(Y,F,G,H);
1250 Heps = _mm_mul_ps(vfeps,H);
1251 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1252 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1253 velec = _mm_mul_ps(qq22,VV);
1254 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1255 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1257 /* Update potential sum for this i atom from the interaction with this j atom. */
1258 velec = _mm_andnot_ps(dummy_mask,velec);
1259 velecsum = _mm_add_ps(velecsum,velec);
1261 fscal = felec;
1263 fscal = _mm_andnot_ps(dummy_mask,fscal);
1265 /* Calculate temporary vectorial force */
1266 tx = _mm_mul_ps(fscal,dx22);
1267 ty = _mm_mul_ps(fscal,dy22);
1268 tz = _mm_mul_ps(fscal,dz22);
1270 /* Update vectorial force */
1271 fix2 = _mm_add_ps(fix2,tx);
1272 fiy2 = _mm_add_ps(fiy2,ty);
1273 fiz2 = _mm_add_ps(fiz2,tz);
1275 fjx2 = _mm_add_ps(fjx2,tx);
1276 fjy2 = _mm_add_ps(fjy2,ty);
1277 fjz2 = _mm_add_ps(fjz2,tz);
1279 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1280 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1281 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1282 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1284 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1285 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1287 /* Inner loop uses 426 flops */
1290 /* End of innermost loop */
1292 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1293 f+i_coord_offset,fshift+i_shift_offset);
1295 ggid = gid[iidx];
1296 /* Update potential energies */
1297 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1298 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1300 /* Increment number of inner iterations */
1301 inneriter += j_index_end - j_index_start;
1303 /* Outer loop uses 20 flops */
1306 /* Increment number of outer iterations */
1307 outeriter += nri;
1309 /* Update outer/inner flops */
1311 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1314 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1315 * Electrostatics interaction: CubicSplineTable
1316 * VdW interaction: CubicSplineTable
1317 * Geometry: Water3-Water3
1318 * Calculate force/pot: Force
1320 void
1321 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1322 (t_nblist * gmx_restrict nlist,
1323 rvec * gmx_restrict xx,
1324 rvec * gmx_restrict ff,
1325 t_forcerec * gmx_restrict fr,
1326 t_mdatoms * gmx_restrict mdatoms,
1327 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1328 t_nrnb * gmx_restrict nrnb)
1330 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1331 * just 0 for non-waters.
1332 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1333 * jnr indices corresponding to data put in the four positions in the SIMD register.
1335 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1336 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1337 int jnrA,jnrB,jnrC,jnrD;
1338 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1339 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1340 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1341 real rcutoff_scalar;
1342 real *shiftvec,*fshift,*x,*f;
1343 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1344 real scratch[4*DIM];
1345 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1346 int vdwioffset0;
1347 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1348 int vdwioffset1;
1349 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1350 int vdwioffset2;
1351 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1352 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1353 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1354 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1355 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1356 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1357 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1358 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1359 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1360 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1361 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1362 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1363 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1364 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1365 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1366 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1367 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1368 real *charge;
1369 int nvdwtype;
1370 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1371 int *vdwtype;
1372 real *vdwparam;
1373 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1374 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1375 __m128i vfitab;
1376 __m128i ifour = _mm_set1_epi32(4);
1377 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1378 real *vftab;
1379 __m128 dummy_mask,cutoff_mask;
1380 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1381 __m128 one = _mm_set1_ps(1.0);
1382 __m128 two = _mm_set1_ps(2.0);
1383 x = xx[0];
1384 f = ff[0];
1386 nri = nlist->nri;
1387 iinr = nlist->iinr;
1388 jindex = nlist->jindex;
1389 jjnr = nlist->jjnr;
1390 shiftidx = nlist->shift;
1391 gid = nlist->gid;
1392 shiftvec = fr->shift_vec[0];
1393 fshift = fr->fshift[0];
1394 facel = _mm_set1_ps(fr->epsfac);
1395 charge = mdatoms->chargeA;
1396 nvdwtype = fr->ntype;
1397 vdwparam = fr->nbfp;
1398 vdwtype = mdatoms->typeA;
1400 vftab = kernel_data->table_elec_vdw->data;
1401 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1403 /* Setup water-specific parameters */
1404 inr = nlist->iinr[0];
1405 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1406 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1407 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1408 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1410 jq0 = _mm_set1_ps(charge[inr+0]);
1411 jq1 = _mm_set1_ps(charge[inr+1]);
1412 jq2 = _mm_set1_ps(charge[inr+2]);
1413 vdwjidx0A = 2*vdwtype[inr+0];
1414 qq00 = _mm_mul_ps(iq0,jq0);
1415 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1416 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1417 qq01 = _mm_mul_ps(iq0,jq1);
1418 qq02 = _mm_mul_ps(iq0,jq2);
1419 qq10 = _mm_mul_ps(iq1,jq0);
1420 qq11 = _mm_mul_ps(iq1,jq1);
1421 qq12 = _mm_mul_ps(iq1,jq2);
1422 qq20 = _mm_mul_ps(iq2,jq0);
1423 qq21 = _mm_mul_ps(iq2,jq1);
1424 qq22 = _mm_mul_ps(iq2,jq2);
1426 /* Avoid stupid compiler warnings */
1427 jnrA = jnrB = jnrC = jnrD = 0;
1428 j_coord_offsetA = 0;
1429 j_coord_offsetB = 0;
1430 j_coord_offsetC = 0;
1431 j_coord_offsetD = 0;
1433 outeriter = 0;
1434 inneriter = 0;
1436 for(iidx=0;iidx<4*DIM;iidx++)
1438 scratch[iidx] = 0.0;
1441 /* Start outer loop over neighborlists */
1442 for(iidx=0; iidx<nri; iidx++)
1444 /* Load shift vector for this list */
1445 i_shift_offset = DIM*shiftidx[iidx];
1447 /* Load limits for loop over neighbors */
1448 j_index_start = jindex[iidx];
1449 j_index_end = jindex[iidx+1];
1451 /* Get outer coordinate index */
1452 inr = iinr[iidx];
1453 i_coord_offset = DIM*inr;
1455 /* Load i particle coords and add shift vector */
1456 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1457 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1459 fix0 = _mm_setzero_ps();
1460 fiy0 = _mm_setzero_ps();
1461 fiz0 = _mm_setzero_ps();
1462 fix1 = _mm_setzero_ps();
1463 fiy1 = _mm_setzero_ps();
1464 fiz1 = _mm_setzero_ps();
1465 fix2 = _mm_setzero_ps();
1466 fiy2 = _mm_setzero_ps();
1467 fiz2 = _mm_setzero_ps();
1469 /* Start inner kernel loop */
1470 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1473 /* Get j neighbor index, and coordinate index */
1474 jnrA = jjnr[jidx];
1475 jnrB = jjnr[jidx+1];
1476 jnrC = jjnr[jidx+2];
1477 jnrD = jjnr[jidx+3];
1478 j_coord_offsetA = DIM*jnrA;
1479 j_coord_offsetB = DIM*jnrB;
1480 j_coord_offsetC = DIM*jnrC;
1481 j_coord_offsetD = DIM*jnrD;
1483 /* load j atom coordinates */
1484 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1485 x+j_coord_offsetC,x+j_coord_offsetD,
1486 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1488 /* Calculate displacement vector */
1489 dx00 = _mm_sub_ps(ix0,jx0);
1490 dy00 = _mm_sub_ps(iy0,jy0);
1491 dz00 = _mm_sub_ps(iz0,jz0);
1492 dx01 = _mm_sub_ps(ix0,jx1);
1493 dy01 = _mm_sub_ps(iy0,jy1);
1494 dz01 = _mm_sub_ps(iz0,jz1);
1495 dx02 = _mm_sub_ps(ix0,jx2);
1496 dy02 = _mm_sub_ps(iy0,jy2);
1497 dz02 = _mm_sub_ps(iz0,jz2);
1498 dx10 = _mm_sub_ps(ix1,jx0);
1499 dy10 = _mm_sub_ps(iy1,jy0);
1500 dz10 = _mm_sub_ps(iz1,jz0);
1501 dx11 = _mm_sub_ps(ix1,jx1);
1502 dy11 = _mm_sub_ps(iy1,jy1);
1503 dz11 = _mm_sub_ps(iz1,jz1);
1504 dx12 = _mm_sub_ps(ix1,jx2);
1505 dy12 = _mm_sub_ps(iy1,jy2);
1506 dz12 = _mm_sub_ps(iz1,jz2);
1507 dx20 = _mm_sub_ps(ix2,jx0);
1508 dy20 = _mm_sub_ps(iy2,jy0);
1509 dz20 = _mm_sub_ps(iz2,jz0);
1510 dx21 = _mm_sub_ps(ix2,jx1);
1511 dy21 = _mm_sub_ps(iy2,jy1);
1512 dz21 = _mm_sub_ps(iz2,jz1);
1513 dx22 = _mm_sub_ps(ix2,jx2);
1514 dy22 = _mm_sub_ps(iy2,jy2);
1515 dz22 = _mm_sub_ps(iz2,jz2);
1517 /* Calculate squared distance and things based on it */
1518 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1519 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1520 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1521 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1522 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1523 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1524 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1525 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1526 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1528 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1529 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1530 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1531 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1532 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1533 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1534 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1535 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1536 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1538 fjx0 = _mm_setzero_ps();
1539 fjy0 = _mm_setzero_ps();
1540 fjz0 = _mm_setzero_ps();
1541 fjx1 = _mm_setzero_ps();
1542 fjy1 = _mm_setzero_ps();
1543 fjz1 = _mm_setzero_ps();
1544 fjx2 = _mm_setzero_ps();
1545 fjy2 = _mm_setzero_ps();
1546 fjz2 = _mm_setzero_ps();
1548 /**************************
1549 * CALCULATE INTERACTIONS *
1550 **************************/
1552 r00 = _mm_mul_ps(rsq00,rinv00);
1554 /* Calculate table index by multiplying r with table scale and truncate to integer */
1555 rt = _mm_mul_ps(r00,vftabscale);
1556 vfitab = _mm_cvttps_epi32(rt);
1557 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1558 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1560 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1561 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1562 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1563 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1564 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1565 _MM_TRANSPOSE4_PS(Y,F,G,H);
1566 Heps = _mm_mul_ps(vfeps,H);
1567 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1568 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1569 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1571 /* CUBIC SPLINE TABLE DISPERSION */
1572 vfitab = _mm_add_epi32(vfitab,ifour);
1573 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1574 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1575 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1576 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1577 _MM_TRANSPOSE4_PS(Y,F,G,H);
1578 Heps = _mm_mul_ps(vfeps,H);
1579 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1580 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1581 fvdw6 = _mm_mul_ps(c6_00,FF);
1583 /* CUBIC SPLINE TABLE REPULSION */
1584 vfitab = _mm_add_epi32(vfitab,ifour);
1585 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1586 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1587 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1588 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1589 _MM_TRANSPOSE4_PS(Y,F,G,H);
1590 Heps = _mm_mul_ps(vfeps,H);
1591 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1592 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1593 fvdw12 = _mm_mul_ps(c12_00,FF);
1594 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1596 fscal = _mm_add_ps(felec,fvdw);
1598 /* Calculate temporary vectorial force */
1599 tx = _mm_mul_ps(fscal,dx00);
1600 ty = _mm_mul_ps(fscal,dy00);
1601 tz = _mm_mul_ps(fscal,dz00);
1603 /* Update vectorial force */
1604 fix0 = _mm_add_ps(fix0,tx);
1605 fiy0 = _mm_add_ps(fiy0,ty);
1606 fiz0 = _mm_add_ps(fiz0,tz);
1608 fjx0 = _mm_add_ps(fjx0,tx);
1609 fjy0 = _mm_add_ps(fjy0,ty);
1610 fjz0 = _mm_add_ps(fjz0,tz);
1612 /**************************
1613 * CALCULATE INTERACTIONS *
1614 **************************/
1616 r01 = _mm_mul_ps(rsq01,rinv01);
1618 /* Calculate table index by multiplying r with table scale and truncate to integer */
1619 rt = _mm_mul_ps(r01,vftabscale);
1620 vfitab = _mm_cvttps_epi32(rt);
1621 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1622 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1624 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1625 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1626 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1627 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1628 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1629 _MM_TRANSPOSE4_PS(Y,F,G,H);
1630 Heps = _mm_mul_ps(vfeps,H);
1631 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1632 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1633 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1635 fscal = felec;
1637 /* Calculate temporary vectorial force */
1638 tx = _mm_mul_ps(fscal,dx01);
1639 ty = _mm_mul_ps(fscal,dy01);
1640 tz = _mm_mul_ps(fscal,dz01);
1642 /* Update vectorial force */
1643 fix0 = _mm_add_ps(fix0,tx);
1644 fiy0 = _mm_add_ps(fiy0,ty);
1645 fiz0 = _mm_add_ps(fiz0,tz);
1647 fjx1 = _mm_add_ps(fjx1,tx);
1648 fjy1 = _mm_add_ps(fjy1,ty);
1649 fjz1 = _mm_add_ps(fjz1,tz);
1651 /**************************
1652 * CALCULATE INTERACTIONS *
1653 **************************/
1655 r02 = _mm_mul_ps(rsq02,rinv02);
1657 /* Calculate table index by multiplying r with table scale and truncate to integer */
1658 rt = _mm_mul_ps(r02,vftabscale);
1659 vfitab = _mm_cvttps_epi32(rt);
1660 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1661 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1663 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1664 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1665 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1666 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1667 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1668 _MM_TRANSPOSE4_PS(Y,F,G,H);
1669 Heps = _mm_mul_ps(vfeps,H);
1670 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1671 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1672 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1674 fscal = felec;
1676 /* Calculate temporary vectorial force */
1677 tx = _mm_mul_ps(fscal,dx02);
1678 ty = _mm_mul_ps(fscal,dy02);
1679 tz = _mm_mul_ps(fscal,dz02);
1681 /* Update vectorial force */
1682 fix0 = _mm_add_ps(fix0,tx);
1683 fiy0 = _mm_add_ps(fiy0,ty);
1684 fiz0 = _mm_add_ps(fiz0,tz);
1686 fjx2 = _mm_add_ps(fjx2,tx);
1687 fjy2 = _mm_add_ps(fjy2,ty);
1688 fjz2 = _mm_add_ps(fjz2,tz);
1690 /**************************
1691 * CALCULATE INTERACTIONS *
1692 **************************/
1694 r10 = _mm_mul_ps(rsq10,rinv10);
1696 /* Calculate table index by multiplying r with table scale and truncate to integer */
1697 rt = _mm_mul_ps(r10,vftabscale);
1698 vfitab = _mm_cvttps_epi32(rt);
1699 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1700 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1702 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1703 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1704 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1705 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1706 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1707 _MM_TRANSPOSE4_PS(Y,F,G,H);
1708 Heps = _mm_mul_ps(vfeps,H);
1709 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1710 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1711 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1713 fscal = felec;
1715 /* Calculate temporary vectorial force */
1716 tx = _mm_mul_ps(fscal,dx10);
1717 ty = _mm_mul_ps(fscal,dy10);
1718 tz = _mm_mul_ps(fscal,dz10);
1720 /* Update vectorial force */
1721 fix1 = _mm_add_ps(fix1,tx);
1722 fiy1 = _mm_add_ps(fiy1,ty);
1723 fiz1 = _mm_add_ps(fiz1,tz);
1725 fjx0 = _mm_add_ps(fjx0,tx);
1726 fjy0 = _mm_add_ps(fjy0,ty);
1727 fjz0 = _mm_add_ps(fjz0,tz);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r11 = _mm_mul_ps(rsq11,rinv11);
1735 /* Calculate table index by multiplying r with table scale and truncate to integer */
1736 rt = _mm_mul_ps(r11,vftabscale);
1737 vfitab = _mm_cvttps_epi32(rt);
1738 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1739 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1743 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1744 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1745 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1746 _MM_TRANSPOSE4_PS(Y,F,G,H);
1747 Heps = _mm_mul_ps(vfeps,H);
1748 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1749 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1750 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1752 fscal = felec;
1754 /* Calculate temporary vectorial force */
1755 tx = _mm_mul_ps(fscal,dx11);
1756 ty = _mm_mul_ps(fscal,dy11);
1757 tz = _mm_mul_ps(fscal,dz11);
1759 /* Update vectorial force */
1760 fix1 = _mm_add_ps(fix1,tx);
1761 fiy1 = _mm_add_ps(fiy1,ty);
1762 fiz1 = _mm_add_ps(fiz1,tz);
1764 fjx1 = _mm_add_ps(fjx1,tx);
1765 fjy1 = _mm_add_ps(fjy1,ty);
1766 fjz1 = _mm_add_ps(fjz1,tz);
1768 /**************************
1769 * CALCULATE INTERACTIONS *
1770 **************************/
1772 r12 = _mm_mul_ps(rsq12,rinv12);
1774 /* Calculate table index by multiplying r with table scale and truncate to integer */
1775 rt = _mm_mul_ps(r12,vftabscale);
1776 vfitab = _mm_cvttps_epi32(rt);
1777 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1778 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1781 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1782 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1783 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1784 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1785 _MM_TRANSPOSE4_PS(Y,F,G,H);
1786 Heps = _mm_mul_ps(vfeps,H);
1787 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1788 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1789 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1791 fscal = felec;
1793 /* Calculate temporary vectorial force */
1794 tx = _mm_mul_ps(fscal,dx12);
1795 ty = _mm_mul_ps(fscal,dy12);
1796 tz = _mm_mul_ps(fscal,dz12);
1798 /* Update vectorial force */
1799 fix1 = _mm_add_ps(fix1,tx);
1800 fiy1 = _mm_add_ps(fiy1,ty);
1801 fiz1 = _mm_add_ps(fiz1,tz);
1803 fjx2 = _mm_add_ps(fjx2,tx);
1804 fjy2 = _mm_add_ps(fjy2,ty);
1805 fjz2 = _mm_add_ps(fjz2,tz);
1807 /**************************
1808 * CALCULATE INTERACTIONS *
1809 **************************/
1811 r20 = _mm_mul_ps(rsq20,rinv20);
1813 /* Calculate table index by multiplying r with table scale and truncate to integer */
1814 rt = _mm_mul_ps(r20,vftabscale);
1815 vfitab = _mm_cvttps_epi32(rt);
1816 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1817 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1821 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1822 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1823 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1824 _MM_TRANSPOSE4_PS(Y,F,G,H);
1825 Heps = _mm_mul_ps(vfeps,H);
1826 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1827 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1828 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1830 fscal = felec;
1832 /* Calculate temporary vectorial force */
1833 tx = _mm_mul_ps(fscal,dx20);
1834 ty = _mm_mul_ps(fscal,dy20);
1835 tz = _mm_mul_ps(fscal,dz20);
1837 /* Update vectorial force */
1838 fix2 = _mm_add_ps(fix2,tx);
1839 fiy2 = _mm_add_ps(fiy2,ty);
1840 fiz2 = _mm_add_ps(fiz2,tz);
1842 fjx0 = _mm_add_ps(fjx0,tx);
1843 fjy0 = _mm_add_ps(fjy0,ty);
1844 fjz0 = _mm_add_ps(fjz0,tz);
1846 /**************************
1847 * CALCULATE INTERACTIONS *
1848 **************************/
1850 r21 = _mm_mul_ps(rsq21,rinv21);
1852 /* Calculate table index by multiplying r with table scale and truncate to integer */
1853 rt = _mm_mul_ps(r21,vftabscale);
1854 vfitab = _mm_cvttps_epi32(rt);
1855 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1856 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1858 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1859 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1860 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1861 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1862 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1863 _MM_TRANSPOSE4_PS(Y,F,G,H);
1864 Heps = _mm_mul_ps(vfeps,H);
1865 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1866 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1867 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1869 fscal = felec;
1871 /* Calculate temporary vectorial force */
1872 tx = _mm_mul_ps(fscal,dx21);
1873 ty = _mm_mul_ps(fscal,dy21);
1874 tz = _mm_mul_ps(fscal,dz21);
1876 /* Update vectorial force */
1877 fix2 = _mm_add_ps(fix2,tx);
1878 fiy2 = _mm_add_ps(fiy2,ty);
1879 fiz2 = _mm_add_ps(fiz2,tz);
1881 fjx1 = _mm_add_ps(fjx1,tx);
1882 fjy1 = _mm_add_ps(fjy1,ty);
1883 fjz1 = _mm_add_ps(fjz1,tz);
1885 /**************************
1886 * CALCULATE INTERACTIONS *
1887 **************************/
1889 r22 = _mm_mul_ps(rsq22,rinv22);
1891 /* Calculate table index by multiplying r with table scale and truncate to integer */
1892 rt = _mm_mul_ps(r22,vftabscale);
1893 vfitab = _mm_cvttps_epi32(rt);
1894 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1895 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1897 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1898 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1899 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1900 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1901 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1902 _MM_TRANSPOSE4_PS(Y,F,G,H);
1903 Heps = _mm_mul_ps(vfeps,H);
1904 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1905 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1906 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1908 fscal = felec;
1910 /* Calculate temporary vectorial force */
1911 tx = _mm_mul_ps(fscal,dx22);
1912 ty = _mm_mul_ps(fscal,dy22);
1913 tz = _mm_mul_ps(fscal,dz22);
1915 /* Update vectorial force */
1916 fix2 = _mm_add_ps(fix2,tx);
1917 fiy2 = _mm_add_ps(fiy2,ty);
1918 fiz2 = _mm_add_ps(fiz2,tz);
1920 fjx2 = _mm_add_ps(fjx2,tx);
1921 fjy2 = _mm_add_ps(fjy2,ty);
1922 fjz2 = _mm_add_ps(fjz2,tz);
1924 fjptrA = f+j_coord_offsetA;
1925 fjptrB = f+j_coord_offsetB;
1926 fjptrC = f+j_coord_offsetC;
1927 fjptrD = f+j_coord_offsetD;
1929 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1930 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1932 /* Inner loop uses 373 flops */
1935 if(jidx<j_index_end)
1938 /* Get j neighbor index, and coordinate index */
1939 jnrlistA = jjnr[jidx];
1940 jnrlistB = jjnr[jidx+1];
1941 jnrlistC = jjnr[jidx+2];
1942 jnrlistD = jjnr[jidx+3];
1943 /* Sign of each element will be negative for non-real atoms.
1944 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1945 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1947 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1948 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1949 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1950 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1951 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1952 j_coord_offsetA = DIM*jnrA;
1953 j_coord_offsetB = DIM*jnrB;
1954 j_coord_offsetC = DIM*jnrC;
1955 j_coord_offsetD = DIM*jnrD;
1957 /* load j atom coordinates */
1958 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1959 x+j_coord_offsetC,x+j_coord_offsetD,
1960 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1962 /* Calculate displacement vector */
1963 dx00 = _mm_sub_ps(ix0,jx0);
1964 dy00 = _mm_sub_ps(iy0,jy0);
1965 dz00 = _mm_sub_ps(iz0,jz0);
1966 dx01 = _mm_sub_ps(ix0,jx1);
1967 dy01 = _mm_sub_ps(iy0,jy1);
1968 dz01 = _mm_sub_ps(iz0,jz1);
1969 dx02 = _mm_sub_ps(ix0,jx2);
1970 dy02 = _mm_sub_ps(iy0,jy2);
1971 dz02 = _mm_sub_ps(iz0,jz2);
1972 dx10 = _mm_sub_ps(ix1,jx0);
1973 dy10 = _mm_sub_ps(iy1,jy0);
1974 dz10 = _mm_sub_ps(iz1,jz0);
1975 dx11 = _mm_sub_ps(ix1,jx1);
1976 dy11 = _mm_sub_ps(iy1,jy1);
1977 dz11 = _mm_sub_ps(iz1,jz1);
1978 dx12 = _mm_sub_ps(ix1,jx2);
1979 dy12 = _mm_sub_ps(iy1,jy2);
1980 dz12 = _mm_sub_ps(iz1,jz2);
1981 dx20 = _mm_sub_ps(ix2,jx0);
1982 dy20 = _mm_sub_ps(iy2,jy0);
1983 dz20 = _mm_sub_ps(iz2,jz0);
1984 dx21 = _mm_sub_ps(ix2,jx1);
1985 dy21 = _mm_sub_ps(iy2,jy1);
1986 dz21 = _mm_sub_ps(iz2,jz1);
1987 dx22 = _mm_sub_ps(ix2,jx2);
1988 dy22 = _mm_sub_ps(iy2,jy2);
1989 dz22 = _mm_sub_ps(iz2,jz2);
1991 /* Calculate squared distance and things based on it */
1992 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1993 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1994 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1995 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1996 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1997 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1998 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1999 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2000 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2002 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2003 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2004 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2005 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2006 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2007 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2008 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2009 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2010 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2012 fjx0 = _mm_setzero_ps();
2013 fjy0 = _mm_setzero_ps();
2014 fjz0 = _mm_setzero_ps();
2015 fjx1 = _mm_setzero_ps();
2016 fjy1 = _mm_setzero_ps();
2017 fjz1 = _mm_setzero_ps();
2018 fjx2 = _mm_setzero_ps();
2019 fjy2 = _mm_setzero_ps();
2020 fjz2 = _mm_setzero_ps();
2022 /**************************
2023 * CALCULATE INTERACTIONS *
2024 **************************/
2026 r00 = _mm_mul_ps(rsq00,rinv00);
2027 r00 = _mm_andnot_ps(dummy_mask,r00);
2029 /* Calculate table index by multiplying r with table scale and truncate to integer */
2030 rt = _mm_mul_ps(r00,vftabscale);
2031 vfitab = _mm_cvttps_epi32(rt);
2032 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2033 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2035 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2036 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2037 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2038 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2039 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2040 _MM_TRANSPOSE4_PS(Y,F,G,H);
2041 Heps = _mm_mul_ps(vfeps,H);
2042 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2043 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2044 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2046 /* CUBIC SPLINE TABLE DISPERSION */
2047 vfitab = _mm_add_epi32(vfitab,ifour);
2048 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2049 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2050 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2051 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2052 _MM_TRANSPOSE4_PS(Y,F,G,H);
2053 Heps = _mm_mul_ps(vfeps,H);
2054 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2055 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2056 fvdw6 = _mm_mul_ps(c6_00,FF);
2058 /* CUBIC SPLINE TABLE REPULSION */
2059 vfitab = _mm_add_epi32(vfitab,ifour);
2060 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2061 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2062 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2063 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2064 _MM_TRANSPOSE4_PS(Y,F,G,H);
2065 Heps = _mm_mul_ps(vfeps,H);
2066 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2067 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2068 fvdw12 = _mm_mul_ps(c12_00,FF);
2069 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2071 fscal = _mm_add_ps(felec,fvdw);
2073 fscal = _mm_andnot_ps(dummy_mask,fscal);
2075 /* Calculate temporary vectorial force */
2076 tx = _mm_mul_ps(fscal,dx00);
2077 ty = _mm_mul_ps(fscal,dy00);
2078 tz = _mm_mul_ps(fscal,dz00);
2080 /* Update vectorial force */
2081 fix0 = _mm_add_ps(fix0,tx);
2082 fiy0 = _mm_add_ps(fiy0,ty);
2083 fiz0 = _mm_add_ps(fiz0,tz);
2085 fjx0 = _mm_add_ps(fjx0,tx);
2086 fjy0 = _mm_add_ps(fjy0,ty);
2087 fjz0 = _mm_add_ps(fjz0,tz);
2089 /**************************
2090 * CALCULATE INTERACTIONS *
2091 **************************/
2093 r01 = _mm_mul_ps(rsq01,rinv01);
2094 r01 = _mm_andnot_ps(dummy_mask,r01);
2096 /* Calculate table index by multiplying r with table scale and truncate to integer */
2097 rt = _mm_mul_ps(r01,vftabscale);
2098 vfitab = _mm_cvttps_epi32(rt);
2099 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2100 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2102 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2103 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2104 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2105 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2106 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2107 _MM_TRANSPOSE4_PS(Y,F,G,H);
2108 Heps = _mm_mul_ps(vfeps,H);
2109 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2110 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2111 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2113 fscal = felec;
2115 fscal = _mm_andnot_ps(dummy_mask,fscal);
2117 /* Calculate temporary vectorial force */
2118 tx = _mm_mul_ps(fscal,dx01);
2119 ty = _mm_mul_ps(fscal,dy01);
2120 tz = _mm_mul_ps(fscal,dz01);
2122 /* Update vectorial force */
2123 fix0 = _mm_add_ps(fix0,tx);
2124 fiy0 = _mm_add_ps(fiy0,ty);
2125 fiz0 = _mm_add_ps(fiz0,tz);
2127 fjx1 = _mm_add_ps(fjx1,tx);
2128 fjy1 = _mm_add_ps(fjy1,ty);
2129 fjz1 = _mm_add_ps(fjz1,tz);
2131 /**************************
2132 * CALCULATE INTERACTIONS *
2133 **************************/
2135 r02 = _mm_mul_ps(rsq02,rinv02);
2136 r02 = _mm_andnot_ps(dummy_mask,r02);
2138 /* Calculate table index by multiplying r with table scale and truncate to integer */
2139 rt = _mm_mul_ps(r02,vftabscale);
2140 vfitab = _mm_cvttps_epi32(rt);
2141 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2142 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2144 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2145 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2146 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2147 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2148 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2149 _MM_TRANSPOSE4_PS(Y,F,G,H);
2150 Heps = _mm_mul_ps(vfeps,H);
2151 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2152 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2153 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2155 fscal = felec;
2157 fscal = _mm_andnot_ps(dummy_mask,fscal);
2159 /* Calculate temporary vectorial force */
2160 tx = _mm_mul_ps(fscal,dx02);
2161 ty = _mm_mul_ps(fscal,dy02);
2162 tz = _mm_mul_ps(fscal,dz02);
2164 /* Update vectorial force */
2165 fix0 = _mm_add_ps(fix0,tx);
2166 fiy0 = _mm_add_ps(fiy0,ty);
2167 fiz0 = _mm_add_ps(fiz0,tz);
2169 fjx2 = _mm_add_ps(fjx2,tx);
2170 fjy2 = _mm_add_ps(fjy2,ty);
2171 fjz2 = _mm_add_ps(fjz2,tz);
2173 /**************************
2174 * CALCULATE INTERACTIONS *
2175 **************************/
2177 r10 = _mm_mul_ps(rsq10,rinv10);
2178 r10 = _mm_andnot_ps(dummy_mask,r10);
2180 /* Calculate table index by multiplying r with table scale and truncate to integer */
2181 rt = _mm_mul_ps(r10,vftabscale);
2182 vfitab = _mm_cvttps_epi32(rt);
2183 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2184 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2186 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2187 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2188 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2189 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2190 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2191 _MM_TRANSPOSE4_PS(Y,F,G,H);
2192 Heps = _mm_mul_ps(vfeps,H);
2193 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2194 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2195 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2197 fscal = felec;
2199 fscal = _mm_andnot_ps(dummy_mask,fscal);
2201 /* Calculate temporary vectorial force */
2202 tx = _mm_mul_ps(fscal,dx10);
2203 ty = _mm_mul_ps(fscal,dy10);
2204 tz = _mm_mul_ps(fscal,dz10);
2206 /* Update vectorial force */
2207 fix1 = _mm_add_ps(fix1,tx);
2208 fiy1 = _mm_add_ps(fiy1,ty);
2209 fiz1 = _mm_add_ps(fiz1,tz);
2211 fjx0 = _mm_add_ps(fjx0,tx);
2212 fjy0 = _mm_add_ps(fjy0,ty);
2213 fjz0 = _mm_add_ps(fjz0,tz);
2215 /**************************
2216 * CALCULATE INTERACTIONS *
2217 **************************/
2219 r11 = _mm_mul_ps(rsq11,rinv11);
2220 r11 = _mm_andnot_ps(dummy_mask,r11);
2222 /* Calculate table index by multiplying r with table scale and truncate to integer */
2223 rt = _mm_mul_ps(r11,vftabscale);
2224 vfitab = _mm_cvttps_epi32(rt);
2225 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2226 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2228 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2229 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2230 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2231 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2232 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2233 _MM_TRANSPOSE4_PS(Y,F,G,H);
2234 Heps = _mm_mul_ps(vfeps,H);
2235 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2236 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2237 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2239 fscal = felec;
2241 fscal = _mm_andnot_ps(dummy_mask,fscal);
2243 /* Calculate temporary vectorial force */
2244 tx = _mm_mul_ps(fscal,dx11);
2245 ty = _mm_mul_ps(fscal,dy11);
2246 tz = _mm_mul_ps(fscal,dz11);
2248 /* Update vectorial force */
2249 fix1 = _mm_add_ps(fix1,tx);
2250 fiy1 = _mm_add_ps(fiy1,ty);
2251 fiz1 = _mm_add_ps(fiz1,tz);
2253 fjx1 = _mm_add_ps(fjx1,tx);
2254 fjy1 = _mm_add_ps(fjy1,ty);
2255 fjz1 = _mm_add_ps(fjz1,tz);
2257 /**************************
2258 * CALCULATE INTERACTIONS *
2259 **************************/
2261 r12 = _mm_mul_ps(rsq12,rinv12);
2262 r12 = _mm_andnot_ps(dummy_mask,r12);
2264 /* Calculate table index by multiplying r with table scale and truncate to integer */
2265 rt = _mm_mul_ps(r12,vftabscale);
2266 vfitab = _mm_cvttps_epi32(rt);
2267 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2268 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2270 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2271 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2272 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2273 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2274 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2275 _MM_TRANSPOSE4_PS(Y,F,G,H);
2276 Heps = _mm_mul_ps(vfeps,H);
2277 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2278 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2279 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2281 fscal = felec;
2283 fscal = _mm_andnot_ps(dummy_mask,fscal);
2285 /* Calculate temporary vectorial force */
2286 tx = _mm_mul_ps(fscal,dx12);
2287 ty = _mm_mul_ps(fscal,dy12);
2288 tz = _mm_mul_ps(fscal,dz12);
2290 /* Update vectorial force */
2291 fix1 = _mm_add_ps(fix1,tx);
2292 fiy1 = _mm_add_ps(fiy1,ty);
2293 fiz1 = _mm_add_ps(fiz1,tz);
2295 fjx2 = _mm_add_ps(fjx2,tx);
2296 fjy2 = _mm_add_ps(fjy2,ty);
2297 fjz2 = _mm_add_ps(fjz2,tz);
2299 /**************************
2300 * CALCULATE INTERACTIONS *
2301 **************************/
2303 r20 = _mm_mul_ps(rsq20,rinv20);
2304 r20 = _mm_andnot_ps(dummy_mask,r20);
2306 /* Calculate table index by multiplying r with table scale and truncate to integer */
2307 rt = _mm_mul_ps(r20,vftabscale);
2308 vfitab = _mm_cvttps_epi32(rt);
2309 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2310 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2312 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2313 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2314 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2315 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2316 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2317 _MM_TRANSPOSE4_PS(Y,F,G,H);
2318 Heps = _mm_mul_ps(vfeps,H);
2319 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2320 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2321 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2323 fscal = felec;
2325 fscal = _mm_andnot_ps(dummy_mask,fscal);
2327 /* Calculate temporary vectorial force */
2328 tx = _mm_mul_ps(fscal,dx20);
2329 ty = _mm_mul_ps(fscal,dy20);
2330 tz = _mm_mul_ps(fscal,dz20);
2332 /* Update vectorial force */
2333 fix2 = _mm_add_ps(fix2,tx);
2334 fiy2 = _mm_add_ps(fiy2,ty);
2335 fiz2 = _mm_add_ps(fiz2,tz);
2337 fjx0 = _mm_add_ps(fjx0,tx);
2338 fjy0 = _mm_add_ps(fjy0,ty);
2339 fjz0 = _mm_add_ps(fjz0,tz);
2341 /**************************
2342 * CALCULATE INTERACTIONS *
2343 **************************/
2345 r21 = _mm_mul_ps(rsq21,rinv21);
2346 r21 = _mm_andnot_ps(dummy_mask,r21);
2348 /* Calculate table index by multiplying r with table scale and truncate to integer */
2349 rt = _mm_mul_ps(r21,vftabscale);
2350 vfitab = _mm_cvttps_epi32(rt);
2351 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2352 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2354 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2355 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2356 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2357 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2358 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2359 _MM_TRANSPOSE4_PS(Y,F,G,H);
2360 Heps = _mm_mul_ps(vfeps,H);
2361 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2362 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2363 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2365 fscal = felec;
2367 fscal = _mm_andnot_ps(dummy_mask,fscal);
2369 /* Calculate temporary vectorial force */
2370 tx = _mm_mul_ps(fscal,dx21);
2371 ty = _mm_mul_ps(fscal,dy21);
2372 tz = _mm_mul_ps(fscal,dz21);
2374 /* Update vectorial force */
2375 fix2 = _mm_add_ps(fix2,tx);
2376 fiy2 = _mm_add_ps(fiy2,ty);
2377 fiz2 = _mm_add_ps(fiz2,tz);
2379 fjx1 = _mm_add_ps(fjx1,tx);
2380 fjy1 = _mm_add_ps(fjy1,ty);
2381 fjz1 = _mm_add_ps(fjz1,tz);
2383 /**************************
2384 * CALCULATE INTERACTIONS *
2385 **************************/
2387 r22 = _mm_mul_ps(rsq22,rinv22);
2388 r22 = _mm_andnot_ps(dummy_mask,r22);
2390 /* Calculate table index by multiplying r with table scale and truncate to integer */
2391 rt = _mm_mul_ps(r22,vftabscale);
2392 vfitab = _mm_cvttps_epi32(rt);
2393 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2394 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2396 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2397 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2398 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2399 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2400 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2401 _MM_TRANSPOSE4_PS(Y,F,G,H);
2402 Heps = _mm_mul_ps(vfeps,H);
2403 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2404 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2405 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2407 fscal = felec;
2409 fscal = _mm_andnot_ps(dummy_mask,fscal);
2411 /* Calculate temporary vectorial force */
2412 tx = _mm_mul_ps(fscal,dx22);
2413 ty = _mm_mul_ps(fscal,dy22);
2414 tz = _mm_mul_ps(fscal,dz22);
2416 /* Update vectorial force */
2417 fix2 = _mm_add_ps(fix2,tx);
2418 fiy2 = _mm_add_ps(fiy2,ty);
2419 fiz2 = _mm_add_ps(fiz2,tz);
2421 fjx2 = _mm_add_ps(fjx2,tx);
2422 fjy2 = _mm_add_ps(fjy2,ty);
2423 fjz2 = _mm_add_ps(fjz2,tz);
2425 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2426 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2427 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2428 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2430 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2431 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2433 /* Inner loop uses 382 flops */
2436 /* End of innermost loop */
2438 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2439 f+i_coord_offset,fshift+i_shift_offset);
2441 /* Increment number of inner iterations */
2442 inneriter += j_index_end - j_index_start;
2444 /* Outer loop uses 18 flops */
2447 /* Increment number of outer iterations */
2448 outeriter += nri;
2450 /* Update outer/inner flops */
2452 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);