Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwLJEw_GeomW3W3_sse2_single.c
blob31e780f9179b424cf501aea077b50fe0cdba1daa
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
53 * Electrostatics interaction: Ewald
54 * VdW interaction: LJEwald
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
58 void
59 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_sse2_single
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB,jnrC,jnrD;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real rcutoff_scalar;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 real scratch[4*DIM];
83 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 int vdwioffset0;
85 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 int vdwioffset1;
87 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 int vdwioffset2;
89 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
98 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
99 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
106 real *charge;
107 int nvdwtype;
108 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109 int *vdwtype;
110 real *vdwparam;
111 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
112 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
113 __m128 c6grid_00;
114 __m128 c6grid_01;
115 __m128 c6grid_02;
116 __m128 c6grid_10;
117 __m128 c6grid_11;
118 __m128 c6grid_12;
119 __m128 c6grid_20;
120 __m128 c6grid_21;
121 __m128 c6grid_22;
122 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
123 real *vdwgridparam;
124 __m128 one_half = _mm_set1_ps(0.5);
125 __m128 minus_one = _mm_set1_ps(-1.0);
126 __m128i ewitab;
127 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
128 real *ewtab;
129 __m128 dummy_mask,cutoff_mask;
130 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
131 __m128 one = _mm_set1_ps(1.0);
132 __m128 two = _mm_set1_ps(2.0);
133 x = xx[0];
134 f = ff[0];
136 nri = nlist->nri;
137 iinr = nlist->iinr;
138 jindex = nlist->jindex;
139 jjnr = nlist->jjnr;
140 shiftidx = nlist->shift;
141 gid = nlist->gid;
142 shiftvec = fr->shift_vec[0];
143 fshift = fr->fshift[0];
144 facel = _mm_set1_ps(fr->epsfac);
145 charge = mdatoms->chargeA;
146 nvdwtype = fr->ntype;
147 vdwparam = fr->nbfp;
148 vdwtype = mdatoms->typeA;
149 vdwgridparam = fr->ljpme_c6grid;
150 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
151 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
152 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
154 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
155 ewtab = fr->ic->tabq_coul_FDV0;
156 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
157 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
159 /* Setup water-specific parameters */
160 inr = nlist->iinr[0];
161 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
162 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
163 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
164 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
166 jq0 = _mm_set1_ps(charge[inr+0]);
167 jq1 = _mm_set1_ps(charge[inr+1]);
168 jq2 = _mm_set1_ps(charge[inr+2]);
169 vdwjidx0A = 2*vdwtype[inr+0];
170 qq00 = _mm_mul_ps(iq0,jq0);
171 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
172 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
173 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
174 qq01 = _mm_mul_ps(iq0,jq1);
175 qq02 = _mm_mul_ps(iq0,jq2);
176 qq10 = _mm_mul_ps(iq1,jq0);
177 qq11 = _mm_mul_ps(iq1,jq1);
178 qq12 = _mm_mul_ps(iq1,jq2);
179 qq20 = _mm_mul_ps(iq2,jq0);
180 qq21 = _mm_mul_ps(iq2,jq1);
181 qq22 = _mm_mul_ps(iq2,jq2);
183 /* Avoid stupid compiler warnings */
184 jnrA = jnrB = jnrC = jnrD = 0;
185 j_coord_offsetA = 0;
186 j_coord_offsetB = 0;
187 j_coord_offsetC = 0;
188 j_coord_offsetD = 0;
190 outeriter = 0;
191 inneriter = 0;
193 for(iidx=0;iidx<4*DIM;iidx++)
195 scratch[iidx] = 0.0;
198 /* Start outer loop over neighborlists */
199 for(iidx=0; iidx<nri; iidx++)
201 /* Load shift vector for this list */
202 i_shift_offset = DIM*shiftidx[iidx];
204 /* Load limits for loop over neighbors */
205 j_index_start = jindex[iidx];
206 j_index_end = jindex[iidx+1];
208 /* Get outer coordinate index */
209 inr = iinr[iidx];
210 i_coord_offset = DIM*inr;
212 /* Load i particle coords and add shift vector */
213 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
214 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
216 fix0 = _mm_setzero_ps();
217 fiy0 = _mm_setzero_ps();
218 fiz0 = _mm_setzero_ps();
219 fix1 = _mm_setzero_ps();
220 fiy1 = _mm_setzero_ps();
221 fiz1 = _mm_setzero_ps();
222 fix2 = _mm_setzero_ps();
223 fiy2 = _mm_setzero_ps();
224 fiz2 = _mm_setzero_ps();
226 /* Reset potential sums */
227 velecsum = _mm_setzero_ps();
228 vvdwsum = _mm_setzero_ps();
230 /* Start inner kernel loop */
231 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
234 /* Get j neighbor index, and coordinate index */
235 jnrA = jjnr[jidx];
236 jnrB = jjnr[jidx+1];
237 jnrC = jjnr[jidx+2];
238 jnrD = jjnr[jidx+3];
239 j_coord_offsetA = DIM*jnrA;
240 j_coord_offsetB = DIM*jnrB;
241 j_coord_offsetC = DIM*jnrC;
242 j_coord_offsetD = DIM*jnrD;
244 /* load j atom coordinates */
245 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
246 x+j_coord_offsetC,x+j_coord_offsetD,
247 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249 /* Calculate displacement vector */
250 dx00 = _mm_sub_ps(ix0,jx0);
251 dy00 = _mm_sub_ps(iy0,jy0);
252 dz00 = _mm_sub_ps(iz0,jz0);
253 dx01 = _mm_sub_ps(ix0,jx1);
254 dy01 = _mm_sub_ps(iy0,jy1);
255 dz01 = _mm_sub_ps(iz0,jz1);
256 dx02 = _mm_sub_ps(ix0,jx2);
257 dy02 = _mm_sub_ps(iy0,jy2);
258 dz02 = _mm_sub_ps(iz0,jz2);
259 dx10 = _mm_sub_ps(ix1,jx0);
260 dy10 = _mm_sub_ps(iy1,jy0);
261 dz10 = _mm_sub_ps(iz1,jz0);
262 dx11 = _mm_sub_ps(ix1,jx1);
263 dy11 = _mm_sub_ps(iy1,jy1);
264 dz11 = _mm_sub_ps(iz1,jz1);
265 dx12 = _mm_sub_ps(ix1,jx2);
266 dy12 = _mm_sub_ps(iy1,jy2);
267 dz12 = _mm_sub_ps(iz1,jz2);
268 dx20 = _mm_sub_ps(ix2,jx0);
269 dy20 = _mm_sub_ps(iy2,jy0);
270 dz20 = _mm_sub_ps(iz2,jz0);
271 dx21 = _mm_sub_ps(ix2,jx1);
272 dy21 = _mm_sub_ps(iy2,jy1);
273 dz21 = _mm_sub_ps(iz2,jz1);
274 dx22 = _mm_sub_ps(ix2,jx2);
275 dy22 = _mm_sub_ps(iy2,jy2);
276 dz22 = _mm_sub_ps(iz2,jz2);
278 /* Calculate squared distance and things based on it */
279 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
280 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
281 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
282 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
283 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
284 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
285 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
286 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
287 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289 rinv00 = gmx_mm_invsqrt_ps(rsq00);
290 rinv01 = gmx_mm_invsqrt_ps(rsq01);
291 rinv02 = gmx_mm_invsqrt_ps(rsq02);
292 rinv10 = gmx_mm_invsqrt_ps(rsq10);
293 rinv11 = gmx_mm_invsqrt_ps(rsq11);
294 rinv12 = gmx_mm_invsqrt_ps(rsq12);
295 rinv20 = gmx_mm_invsqrt_ps(rsq20);
296 rinv21 = gmx_mm_invsqrt_ps(rsq21);
297 rinv22 = gmx_mm_invsqrt_ps(rsq22);
299 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
300 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
301 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
302 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
303 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
304 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
305 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
306 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
307 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
309 fjx0 = _mm_setzero_ps();
310 fjy0 = _mm_setzero_ps();
311 fjz0 = _mm_setzero_ps();
312 fjx1 = _mm_setzero_ps();
313 fjy1 = _mm_setzero_ps();
314 fjz1 = _mm_setzero_ps();
315 fjx2 = _mm_setzero_ps();
316 fjy2 = _mm_setzero_ps();
317 fjz2 = _mm_setzero_ps();
319 /**************************
320 * CALCULATE INTERACTIONS *
321 **************************/
323 r00 = _mm_mul_ps(rsq00,rinv00);
325 /* EWALD ELECTROSTATICS */
327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328 ewrt = _mm_mul_ps(r00,ewtabscale);
329 ewitab = _mm_cvttps_epi32(ewrt);
330 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
331 ewitab = _mm_slli_epi32(ewitab,2);
332 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
333 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
334 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
335 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
336 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
337 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
338 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
339 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
340 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
342 /* Analytical LJ-PME */
343 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
345 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
346 exponent = gmx_simd_exp_r(ewcljrsq);
347 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
348 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
349 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
350 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
351 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
352 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
353 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
354 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velecsum = _mm_add_ps(velecsum,velec);
358 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
360 fscal = _mm_add_ps(felec,fvdw);
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_ps(fscal,dx00);
364 ty = _mm_mul_ps(fscal,dy00);
365 tz = _mm_mul_ps(fscal,dz00);
367 /* Update vectorial force */
368 fix0 = _mm_add_ps(fix0,tx);
369 fiy0 = _mm_add_ps(fiy0,ty);
370 fiz0 = _mm_add_ps(fiz0,tz);
372 fjx0 = _mm_add_ps(fjx0,tx);
373 fjy0 = _mm_add_ps(fjy0,ty);
374 fjz0 = _mm_add_ps(fjz0,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r01 = _mm_mul_ps(rsq01,rinv01);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_ps(r01,ewtabscale);
386 ewitab = _mm_cvttps_epi32(ewrt);
387 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
396 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
397 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_ps(velecsum,velec);
402 fscal = felec;
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_ps(fscal,dx01);
406 ty = _mm_mul_ps(fscal,dy01);
407 tz = _mm_mul_ps(fscal,dz01);
409 /* Update vectorial force */
410 fix0 = _mm_add_ps(fix0,tx);
411 fiy0 = _mm_add_ps(fiy0,ty);
412 fiz0 = _mm_add_ps(fiz0,tz);
414 fjx1 = _mm_add_ps(fjx1,tx);
415 fjy1 = _mm_add_ps(fjy1,ty);
416 fjz1 = _mm_add_ps(fjz1,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r02 = _mm_mul_ps(rsq02,rinv02);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm_mul_ps(r02,ewtabscale);
428 ewitab = _mm_cvttps_epi32(ewrt);
429 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
433 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
434 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
435 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
436 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
437 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
438 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
439 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm_add_ps(velecsum,velec);
444 fscal = felec;
446 /* Calculate temporary vectorial force */
447 tx = _mm_mul_ps(fscal,dx02);
448 ty = _mm_mul_ps(fscal,dy02);
449 tz = _mm_mul_ps(fscal,dz02);
451 /* Update vectorial force */
452 fix0 = _mm_add_ps(fix0,tx);
453 fiy0 = _mm_add_ps(fiy0,ty);
454 fiz0 = _mm_add_ps(fiz0,tz);
456 fjx2 = _mm_add_ps(fjx2,tx);
457 fjy2 = _mm_add_ps(fjy2,ty);
458 fjz2 = _mm_add_ps(fjz2,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r10 = _mm_mul_ps(rsq10,rinv10);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm_mul_ps(r10,ewtabscale);
470 ewitab = _mm_cvttps_epi32(ewrt);
471 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
481 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_ps(velecsum,velec);
486 fscal = felec;
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_ps(fscal,dx10);
490 ty = _mm_mul_ps(fscal,dy10);
491 tz = _mm_mul_ps(fscal,dz10);
493 /* Update vectorial force */
494 fix1 = _mm_add_ps(fix1,tx);
495 fiy1 = _mm_add_ps(fiy1,ty);
496 fiz1 = _mm_add_ps(fiz1,tz);
498 fjx0 = _mm_add_ps(fjx0,tx);
499 fjy0 = _mm_add_ps(fjy0,ty);
500 fjz0 = _mm_add_ps(fjz0,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r11 = _mm_mul_ps(rsq11,rinv11);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_ps(r11,ewtabscale);
512 ewitab = _mm_cvttps_epi32(ewrt);
513 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
523 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm_add_ps(velecsum,velec);
528 fscal = felec;
530 /* Calculate temporary vectorial force */
531 tx = _mm_mul_ps(fscal,dx11);
532 ty = _mm_mul_ps(fscal,dy11);
533 tz = _mm_mul_ps(fscal,dz11);
535 /* Update vectorial force */
536 fix1 = _mm_add_ps(fix1,tx);
537 fiy1 = _mm_add_ps(fiy1,ty);
538 fiz1 = _mm_add_ps(fiz1,tz);
540 fjx1 = _mm_add_ps(fjx1,tx);
541 fjy1 = _mm_add_ps(fjy1,ty);
542 fjz1 = _mm_add_ps(fjz1,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r12 = _mm_mul_ps(rsq12,rinv12);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt = _mm_mul_ps(r12,ewtabscale);
554 ewitab = _mm_cvttps_epi32(ewrt);
555 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
556 ewitab = _mm_slli_epi32(ewitab,2);
557 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
558 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
559 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
560 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
561 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
562 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
563 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
564 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
565 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_ps(velecsum,velec);
570 fscal = felec;
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_ps(fscal,dx12);
574 ty = _mm_mul_ps(fscal,dy12);
575 tz = _mm_mul_ps(fscal,dz12);
577 /* Update vectorial force */
578 fix1 = _mm_add_ps(fix1,tx);
579 fiy1 = _mm_add_ps(fiy1,ty);
580 fiz1 = _mm_add_ps(fiz1,tz);
582 fjx2 = _mm_add_ps(fjx2,tx);
583 fjy2 = _mm_add_ps(fjy2,ty);
584 fjz2 = _mm_add_ps(fjz2,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r20 = _mm_mul_ps(rsq20,rinv20);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm_mul_ps(r20,ewtabscale);
596 ewitab = _mm_cvttps_epi32(ewrt);
597 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
603 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
605 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
606 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
607 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm_add_ps(velecsum,velec);
612 fscal = felec;
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_ps(fscal,dx20);
616 ty = _mm_mul_ps(fscal,dy20);
617 tz = _mm_mul_ps(fscal,dz20);
619 /* Update vectorial force */
620 fix2 = _mm_add_ps(fix2,tx);
621 fiy2 = _mm_add_ps(fiy2,ty);
622 fiz2 = _mm_add_ps(fiz2,tz);
624 fjx0 = _mm_add_ps(fjx0,tx);
625 fjy0 = _mm_add_ps(fjy0,ty);
626 fjz0 = _mm_add_ps(fjz0,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r21 = _mm_mul_ps(rsq21,rinv21);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt = _mm_mul_ps(r21,ewtabscale);
638 ewitab = _mm_cvttps_epi32(ewrt);
639 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
643 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
644 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
645 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
646 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
647 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
648 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
649 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm_add_ps(velecsum,velec);
654 fscal = felec;
656 /* Calculate temporary vectorial force */
657 tx = _mm_mul_ps(fscal,dx21);
658 ty = _mm_mul_ps(fscal,dy21);
659 tz = _mm_mul_ps(fscal,dz21);
661 /* Update vectorial force */
662 fix2 = _mm_add_ps(fix2,tx);
663 fiy2 = _mm_add_ps(fiy2,ty);
664 fiz2 = _mm_add_ps(fiz2,tz);
666 fjx1 = _mm_add_ps(fjx1,tx);
667 fjy1 = _mm_add_ps(fjy1,ty);
668 fjz1 = _mm_add_ps(fjz1,tz);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r22 = _mm_mul_ps(rsq22,rinv22);
676 /* EWALD ELECTROSTATICS */
678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679 ewrt = _mm_mul_ps(r22,ewtabscale);
680 ewitab = _mm_cvttps_epi32(ewrt);
681 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
682 ewitab = _mm_slli_epi32(ewitab,2);
683 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
684 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
685 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
686 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
687 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
688 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
689 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
690 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
691 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum = _mm_add_ps(velecsum,velec);
696 fscal = felec;
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_ps(fscal,dx22);
700 ty = _mm_mul_ps(fscal,dy22);
701 tz = _mm_mul_ps(fscal,dz22);
703 /* Update vectorial force */
704 fix2 = _mm_add_ps(fix2,tx);
705 fiy2 = _mm_add_ps(fiy2,ty);
706 fiz2 = _mm_add_ps(fiz2,tz);
708 fjx2 = _mm_add_ps(fjx2,tx);
709 fjy2 = _mm_add_ps(fjy2,ty);
710 fjz2 = _mm_add_ps(fjz2,tz);
712 fjptrA = f+j_coord_offsetA;
713 fjptrB = f+j_coord_offsetB;
714 fjptrC = f+j_coord_offsetC;
715 fjptrD = f+j_coord_offsetD;
717 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
718 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
720 /* Inner loop uses 397 flops */
723 if(jidx<j_index_end)
726 /* Get j neighbor index, and coordinate index */
727 jnrlistA = jjnr[jidx];
728 jnrlistB = jjnr[jidx+1];
729 jnrlistC = jjnr[jidx+2];
730 jnrlistD = jjnr[jidx+3];
731 /* Sign of each element will be negative for non-real atoms.
732 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
733 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
735 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
736 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
737 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
738 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
739 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
740 j_coord_offsetA = DIM*jnrA;
741 j_coord_offsetB = DIM*jnrB;
742 j_coord_offsetC = DIM*jnrC;
743 j_coord_offsetD = DIM*jnrD;
745 /* load j atom coordinates */
746 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
747 x+j_coord_offsetC,x+j_coord_offsetD,
748 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
750 /* Calculate displacement vector */
751 dx00 = _mm_sub_ps(ix0,jx0);
752 dy00 = _mm_sub_ps(iy0,jy0);
753 dz00 = _mm_sub_ps(iz0,jz0);
754 dx01 = _mm_sub_ps(ix0,jx1);
755 dy01 = _mm_sub_ps(iy0,jy1);
756 dz01 = _mm_sub_ps(iz0,jz1);
757 dx02 = _mm_sub_ps(ix0,jx2);
758 dy02 = _mm_sub_ps(iy0,jy2);
759 dz02 = _mm_sub_ps(iz0,jz2);
760 dx10 = _mm_sub_ps(ix1,jx0);
761 dy10 = _mm_sub_ps(iy1,jy0);
762 dz10 = _mm_sub_ps(iz1,jz0);
763 dx11 = _mm_sub_ps(ix1,jx1);
764 dy11 = _mm_sub_ps(iy1,jy1);
765 dz11 = _mm_sub_ps(iz1,jz1);
766 dx12 = _mm_sub_ps(ix1,jx2);
767 dy12 = _mm_sub_ps(iy1,jy2);
768 dz12 = _mm_sub_ps(iz1,jz2);
769 dx20 = _mm_sub_ps(ix2,jx0);
770 dy20 = _mm_sub_ps(iy2,jy0);
771 dz20 = _mm_sub_ps(iz2,jz0);
772 dx21 = _mm_sub_ps(ix2,jx1);
773 dy21 = _mm_sub_ps(iy2,jy1);
774 dz21 = _mm_sub_ps(iz2,jz1);
775 dx22 = _mm_sub_ps(ix2,jx2);
776 dy22 = _mm_sub_ps(iy2,jy2);
777 dz22 = _mm_sub_ps(iz2,jz2);
779 /* Calculate squared distance and things based on it */
780 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
781 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
782 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
783 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
784 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
785 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
786 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
787 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
788 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
790 rinv00 = gmx_mm_invsqrt_ps(rsq00);
791 rinv01 = gmx_mm_invsqrt_ps(rsq01);
792 rinv02 = gmx_mm_invsqrt_ps(rsq02);
793 rinv10 = gmx_mm_invsqrt_ps(rsq10);
794 rinv11 = gmx_mm_invsqrt_ps(rsq11);
795 rinv12 = gmx_mm_invsqrt_ps(rsq12);
796 rinv20 = gmx_mm_invsqrt_ps(rsq20);
797 rinv21 = gmx_mm_invsqrt_ps(rsq21);
798 rinv22 = gmx_mm_invsqrt_ps(rsq22);
800 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
801 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
802 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
803 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
804 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
805 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
806 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
807 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
808 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
810 fjx0 = _mm_setzero_ps();
811 fjy0 = _mm_setzero_ps();
812 fjz0 = _mm_setzero_ps();
813 fjx1 = _mm_setzero_ps();
814 fjy1 = _mm_setzero_ps();
815 fjz1 = _mm_setzero_ps();
816 fjx2 = _mm_setzero_ps();
817 fjy2 = _mm_setzero_ps();
818 fjz2 = _mm_setzero_ps();
820 /**************************
821 * CALCULATE INTERACTIONS *
822 **************************/
824 r00 = _mm_mul_ps(rsq00,rinv00);
825 r00 = _mm_andnot_ps(dummy_mask,r00);
827 /* EWALD ELECTROSTATICS */
829 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
830 ewrt = _mm_mul_ps(r00,ewtabscale);
831 ewitab = _mm_cvttps_epi32(ewrt);
832 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
833 ewitab = _mm_slli_epi32(ewitab,2);
834 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
835 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
836 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
837 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
838 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
839 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
840 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
841 velec = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
842 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
844 /* Analytical LJ-PME */
845 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
846 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
847 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
848 exponent = gmx_simd_exp_r(ewcljrsq);
849 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
850 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
851 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
852 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
853 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
854 vvdw = _mm_sub_ps(_mm_mul_ps(vvdw12,one_twelfth),_mm_mul_ps(vvdw6,one_sixth));
855 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
856 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec = _mm_andnot_ps(dummy_mask,velec);
860 velecsum = _mm_add_ps(velecsum,velec);
861 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
862 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
864 fscal = _mm_add_ps(felec,fvdw);
866 fscal = _mm_andnot_ps(dummy_mask,fscal);
868 /* Calculate temporary vectorial force */
869 tx = _mm_mul_ps(fscal,dx00);
870 ty = _mm_mul_ps(fscal,dy00);
871 tz = _mm_mul_ps(fscal,dz00);
873 /* Update vectorial force */
874 fix0 = _mm_add_ps(fix0,tx);
875 fiy0 = _mm_add_ps(fiy0,ty);
876 fiz0 = _mm_add_ps(fiz0,tz);
878 fjx0 = _mm_add_ps(fjx0,tx);
879 fjy0 = _mm_add_ps(fjy0,ty);
880 fjz0 = _mm_add_ps(fjz0,tz);
882 /**************************
883 * CALCULATE INTERACTIONS *
884 **************************/
886 r01 = _mm_mul_ps(rsq01,rinv01);
887 r01 = _mm_andnot_ps(dummy_mask,r01);
889 /* EWALD ELECTROSTATICS */
891 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
892 ewrt = _mm_mul_ps(r01,ewtabscale);
893 ewitab = _mm_cvttps_epi32(ewrt);
894 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
895 ewitab = _mm_slli_epi32(ewitab,2);
896 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
897 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
898 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
899 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
900 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
901 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
902 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
903 velec = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
904 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
906 /* Update potential sum for this i atom from the interaction with this j atom. */
907 velec = _mm_andnot_ps(dummy_mask,velec);
908 velecsum = _mm_add_ps(velecsum,velec);
910 fscal = felec;
912 fscal = _mm_andnot_ps(dummy_mask,fscal);
914 /* Calculate temporary vectorial force */
915 tx = _mm_mul_ps(fscal,dx01);
916 ty = _mm_mul_ps(fscal,dy01);
917 tz = _mm_mul_ps(fscal,dz01);
919 /* Update vectorial force */
920 fix0 = _mm_add_ps(fix0,tx);
921 fiy0 = _mm_add_ps(fiy0,ty);
922 fiz0 = _mm_add_ps(fiz0,tz);
924 fjx1 = _mm_add_ps(fjx1,tx);
925 fjy1 = _mm_add_ps(fjy1,ty);
926 fjz1 = _mm_add_ps(fjz1,tz);
928 /**************************
929 * CALCULATE INTERACTIONS *
930 **************************/
932 r02 = _mm_mul_ps(rsq02,rinv02);
933 r02 = _mm_andnot_ps(dummy_mask,r02);
935 /* EWALD ELECTROSTATICS */
937 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
938 ewrt = _mm_mul_ps(r02,ewtabscale);
939 ewitab = _mm_cvttps_epi32(ewrt);
940 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
941 ewitab = _mm_slli_epi32(ewitab,2);
942 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
943 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
944 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
945 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
946 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
947 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
948 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
949 velec = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
950 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec = _mm_andnot_ps(dummy_mask,velec);
954 velecsum = _mm_add_ps(velecsum,velec);
956 fscal = felec;
958 fscal = _mm_andnot_ps(dummy_mask,fscal);
960 /* Calculate temporary vectorial force */
961 tx = _mm_mul_ps(fscal,dx02);
962 ty = _mm_mul_ps(fscal,dy02);
963 tz = _mm_mul_ps(fscal,dz02);
965 /* Update vectorial force */
966 fix0 = _mm_add_ps(fix0,tx);
967 fiy0 = _mm_add_ps(fiy0,ty);
968 fiz0 = _mm_add_ps(fiz0,tz);
970 fjx2 = _mm_add_ps(fjx2,tx);
971 fjy2 = _mm_add_ps(fjy2,ty);
972 fjz2 = _mm_add_ps(fjz2,tz);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 r10 = _mm_mul_ps(rsq10,rinv10);
979 r10 = _mm_andnot_ps(dummy_mask,r10);
981 /* EWALD ELECTROSTATICS */
983 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
984 ewrt = _mm_mul_ps(r10,ewtabscale);
985 ewitab = _mm_cvttps_epi32(ewrt);
986 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
987 ewitab = _mm_slli_epi32(ewitab,2);
988 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
989 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
990 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
991 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
992 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
993 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
994 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
995 velec = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
996 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
998 /* Update potential sum for this i atom from the interaction with this j atom. */
999 velec = _mm_andnot_ps(dummy_mask,velec);
1000 velecsum = _mm_add_ps(velecsum,velec);
1002 fscal = felec;
1004 fscal = _mm_andnot_ps(dummy_mask,fscal);
1006 /* Calculate temporary vectorial force */
1007 tx = _mm_mul_ps(fscal,dx10);
1008 ty = _mm_mul_ps(fscal,dy10);
1009 tz = _mm_mul_ps(fscal,dz10);
1011 /* Update vectorial force */
1012 fix1 = _mm_add_ps(fix1,tx);
1013 fiy1 = _mm_add_ps(fiy1,ty);
1014 fiz1 = _mm_add_ps(fiz1,tz);
1016 fjx0 = _mm_add_ps(fjx0,tx);
1017 fjy0 = _mm_add_ps(fjy0,ty);
1018 fjz0 = _mm_add_ps(fjz0,tz);
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 r11 = _mm_mul_ps(rsq11,rinv11);
1025 r11 = _mm_andnot_ps(dummy_mask,r11);
1027 /* EWALD ELECTROSTATICS */
1029 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1030 ewrt = _mm_mul_ps(r11,ewtabscale);
1031 ewitab = _mm_cvttps_epi32(ewrt);
1032 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1033 ewitab = _mm_slli_epi32(ewitab,2);
1034 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1035 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1036 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1037 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1038 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1039 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1040 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1041 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1042 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1044 /* Update potential sum for this i atom from the interaction with this j atom. */
1045 velec = _mm_andnot_ps(dummy_mask,velec);
1046 velecsum = _mm_add_ps(velecsum,velec);
1048 fscal = felec;
1050 fscal = _mm_andnot_ps(dummy_mask,fscal);
1052 /* Calculate temporary vectorial force */
1053 tx = _mm_mul_ps(fscal,dx11);
1054 ty = _mm_mul_ps(fscal,dy11);
1055 tz = _mm_mul_ps(fscal,dz11);
1057 /* Update vectorial force */
1058 fix1 = _mm_add_ps(fix1,tx);
1059 fiy1 = _mm_add_ps(fiy1,ty);
1060 fiz1 = _mm_add_ps(fiz1,tz);
1062 fjx1 = _mm_add_ps(fjx1,tx);
1063 fjy1 = _mm_add_ps(fjy1,ty);
1064 fjz1 = _mm_add_ps(fjz1,tz);
1066 /**************************
1067 * CALCULATE INTERACTIONS *
1068 **************************/
1070 r12 = _mm_mul_ps(rsq12,rinv12);
1071 r12 = _mm_andnot_ps(dummy_mask,r12);
1073 /* EWALD ELECTROSTATICS */
1075 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1076 ewrt = _mm_mul_ps(r12,ewtabscale);
1077 ewitab = _mm_cvttps_epi32(ewrt);
1078 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1079 ewitab = _mm_slli_epi32(ewitab,2);
1080 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1081 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1082 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1083 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1084 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1085 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1086 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1087 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1088 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1090 /* Update potential sum for this i atom from the interaction with this j atom. */
1091 velec = _mm_andnot_ps(dummy_mask,velec);
1092 velecsum = _mm_add_ps(velecsum,velec);
1094 fscal = felec;
1096 fscal = _mm_andnot_ps(dummy_mask,fscal);
1098 /* Calculate temporary vectorial force */
1099 tx = _mm_mul_ps(fscal,dx12);
1100 ty = _mm_mul_ps(fscal,dy12);
1101 tz = _mm_mul_ps(fscal,dz12);
1103 /* Update vectorial force */
1104 fix1 = _mm_add_ps(fix1,tx);
1105 fiy1 = _mm_add_ps(fiy1,ty);
1106 fiz1 = _mm_add_ps(fiz1,tz);
1108 fjx2 = _mm_add_ps(fjx2,tx);
1109 fjy2 = _mm_add_ps(fjy2,ty);
1110 fjz2 = _mm_add_ps(fjz2,tz);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 r20 = _mm_mul_ps(rsq20,rinv20);
1117 r20 = _mm_andnot_ps(dummy_mask,r20);
1119 /* EWALD ELECTROSTATICS */
1121 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1122 ewrt = _mm_mul_ps(r20,ewtabscale);
1123 ewitab = _mm_cvttps_epi32(ewrt);
1124 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1125 ewitab = _mm_slli_epi32(ewitab,2);
1126 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1127 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1128 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1129 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1130 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1131 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1132 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1133 velec = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1134 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec = _mm_andnot_ps(dummy_mask,velec);
1138 velecsum = _mm_add_ps(velecsum,velec);
1140 fscal = felec;
1142 fscal = _mm_andnot_ps(dummy_mask,fscal);
1144 /* Calculate temporary vectorial force */
1145 tx = _mm_mul_ps(fscal,dx20);
1146 ty = _mm_mul_ps(fscal,dy20);
1147 tz = _mm_mul_ps(fscal,dz20);
1149 /* Update vectorial force */
1150 fix2 = _mm_add_ps(fix2,tx);
1151 fiy2 = _mm_add_ps(fiy2,ty);
1152 fiz2 = _mm_add_ps(fiz2,tz);
1154 fjx0 = _mm_add_ps(fjx0,tx);
1155 fjy0 = _mm_add_ps(fjy0,ty);
1156 fjz0 = _mm_add_ps(fjz0,tz);
1158 /**************************
1159 * CALCULATE INTERACTIONS *
1160 **************************/
1162 r21 = _mm_mul_ps(rsq21,rinv21);
1163 r21 = _mm_andnot_ps(dummy_mask,r21);
1165 /* EWALD ELECTROSTATICS */
1167 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1168 ewrt = _mm_mul_ps(r21,ewtabscale);
1169 ewitab = _mm_cvttps_epi32(ewrt);
1170 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1171 ewitab = _mm_slli_epi32(ewitab,2);
1172 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1173 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1174 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1175 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1176 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1177 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1178 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1179 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1180 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1182 /* Update potential sum for this i atom from the interaction with this j atom. */
1183 velec = _mm_andnot_ps(dummy_mask,velec);
1184 velecsum = _mm_add_ps(velecsum,velec);
1186 fscal = felec;
1188 fscal = _mm_andnot_ps(dummy_mask,fscal);
1190 /* Calculate temporary vectorial force */
1191 tx = _mm_mul_ps(fscal,dx21);
1192 ty = _mm_mul_ps(fscal,dy21);
1193 tz = _mm_mul_ps(fscal,dz21);
1195 /* Update vectorial force */
1196 fix2 = _mm_add_ps(fix2,tx);
1197 fiy2 = _mm_add_ps(fiy2,ty);
1198 fiz2 = _mm_add_ps(fiz2,tz);
1200 fjx1 = _mm_add_ps(fjx1,tx);
1201 fjy1 = _mm_add_ps(fjy1,ty);
1202 fjz1 = _mm_add_ps(fjz1,tz);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 r22 = _mm_mul_ps(rsq22,rinv22);
1209 r22 = _mm_andnot_ps(dummy_mask,r22);
1211 /* EWALD ELECTROSTATICS */
1213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1214 ewrt = _mm_mul_ps(r22,ewtabscale);
1215 ewitab = _mm_cvttps_epi32(ewrt);
1216 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1217 ewitab = _mm_slli_epi32(ewitab,2);
1218 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1219 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1220 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1221 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1222 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1223 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1224 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1225 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1226 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1228 /* Update potential sum for this i atom from the interaction with this j atom. */
1229 velec = _mm_andnot_ps(dummy_mask,velec);
1230 velecsum = _mm_add_ps(velecsum,velec);
1232 fscal = felec;
1234 fscal = _mm_andnot_ps(dummy_mask,fscal);
1236 /* Calculate temporary vectorial force */
1237 tx = _mm_mul_ps(fscal,dx22);
1238 ty = _mm_mul_ps(fscal,dy22);
1239 tz = _mm_mul_ps(fscal,dz22);
1241 /* Update vectorial force */
1242 fix2 = _mm_add_ps(fix2,tx);
1243 fiy2 = _mm_add_ps(fiy2,ty);
1244 fiz2 = _mm_add_ps(fiz2,tz);
1246 fjx2 = _mm_add_ps(fjx2,tx);
1247 fjy2 = _mm_add_ps(fjy2,ty);
1248 fjz2 = _mm_add_ps(fjz2,tz);
1250 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1251 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1252 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1253 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1255 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1256 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1258 /* Inner loop uses 406 flops */
1261 /* End of innermost loop */
1263 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1264 f+i_coord_offset,fshift+i_shift_offset);
1266 ggid = gid[iidx];
1267 /* Update potential energies */
1268 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1269 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1271 /* Increment number of inner iterations */
1272 inneriter += j_index_end - j_index_start;
1274 /* Outer loop uses 20 flops */
1277 /* Increment number of outer iterations */
1278 outeriter += nri;
1280 /* Update outer/inner flops */
1282 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*406);
1285 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
1286 * Electrostatics interaction: Ewald
1287 * VdW interaction: LJEwald
1288 * Geometry: Water3-Water3
1289 * Calculate force/pot: Force
1291 void
1292 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_sse2_single
1293 (t_nblist * gmx_restrict nlist,
1294 rvec * gmx_restrict xx,
1295 rvec * gmx_restrict ff,
1296 t_forcerec * gmx_restrict fr,
1297 t_mdatoms * gmx_restrict mdatoms,
1298 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1299 t_nrnb * gmx_restrict nrnb)
1301 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1302 * just 0 for non-waters.
1303 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1304 * jnr indices corresponding to data put in the four positions in the SIMD register.
1306 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1307 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1308 int jnrA,jnrB,jnrC,jnrD;
1309 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1310 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1311 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1312 real rcutoff_scalar;
1313 real *shiftvec,*fshift,*x,*f;
1314 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1315 real scratch[4*DIM];
1316 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1317 int vdwioffset0;
1318 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1319 int vdwioffset1;
1320 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1321 int vdwioffset2;
1322 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1323 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1324 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1325 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1326 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1327 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1328 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1329 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1330 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1331 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1332 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1333 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1334 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1335 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1336 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1337 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1338 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1339 real *charge;
1340 int nvdwtype;
1341 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1342 int *vdwtype;
1343 real *vdwparam;
1344 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1345 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1346 __m128 c6grid_00;
1347 __m128 c6grid_01;
1348 __m128 c6grid_02;
1349 __m128 c6grid_10;
1350 __m128 c6grid_11;
1351 __m128 c6grid_12;
1352 __m128 c6grid_20;
1353 __m128 c6grid_21;
1354 __m128 c6grid_22;
1355 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1356 real *vdwgridparam;
1357 __m128 one_half = _mm_set1_ps(0.5);
1358 __m128 minus_one = _mm_set1_ps(-1.0);
1359 __m128i ewitab;
1360 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1361 real *ewtab;
1362 __m128 dummy_mask,cutoff_mask;
1363 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1364 __m128 one = _mm_set1_ps(1.0);
1365 __m128 two = _mm_set1_ps(2.0);
1366 x = xx[0];
1367 f = ff[0];
1369 nri = nlist->nri;
1370 iinr = nlist->iinr;
1371 jindex = nlist->jindex;
1372 jjnr = nlist->jjnr;
1373 shiftidx = nlist->shift;
1374 gid = nlist->gid;
1375 shiftvec = fr->shift_vec[0];
1376 fshift = fr->fshift[0];
1377 facel = _mm_set1_ps(fr->epsfac);
1378 charge = mdatoms->chargeA;
1379 nvdwtype = fr->ntype;
1380 vdwparam = fr->nbfp;
1381 vdwtype = mdatoms->typeA;
1382 vdwgridparam = fr->ljpme_c6grid;
1383 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1384 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1385 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1387 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1388 ewtab = fr->ic->tabq_coul_F;
1389 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1390 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1392 /* Setup water-specific parameters */
1393 inr = nlist->iinr[0];
1394 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1395 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1396 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1397 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1399 jq0 = _mm_set1_ps(charge[inr+0]);
1400 jq1 = _mm_set1_ps(charge[inr+1]);
1401 jq2 = _mm_set1_ps(charge[inr+2]);
1402 vdwjidx0A = 2*vdwtype[inr+0];
1403 qq00 = _mm_mul_ps(iq0,jq0);
1404 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1405 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1406 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1407 qq01 = _mm_mul_ps(iq0,jq1);
1408 qq02 = _mm_mul_ps(iq0,jq2);
1409 qq10 = _mm_mul_ps(iq1,jq0);
1410 qq11 = _mm_mul_ps(iq1,jq1);
1411 qq12 = _mm_mul_ps(iq1,jq2);
1412 qq20 = _mm_mul_ps(iq2,jq0);
1413 qq21 = _mm_mul_ps(iq2,jq1);
1414 qq22 = _mm_mul_ps(iq2,jq2);
1416 /* Avoid stupid compiler warnings */
1417 jnrA = jnrB = jnrC = jnrD = 0;
1418 j_coord_offsetA = 0;
1419 j_coord_offsetB = 0;
1420 j_coord_offsetC = 0;
1421 j_coord_offsetD = 0;
1423 outeriter = 0;
1424 inneriter = 0;
1426 for(iidx=0;iidx<4*DIM;iidx++)
1428 scratch[iidx] = 0.0;
1431 /* Start outer loop over neighborlists */
1432 for(iidx=0; iidx<nri; iidx++)
1434 /* Load shift vector for this list */
1435 i_shift_offset = DIM*shiftidx[iidx];
1437 /* Load limits for loop over neighbors */
1438 j_index_start = jindex[iidx];
1439 j_index_end = jindex[iidx+1];
1441 /* Get outer coordinate index */
1442 inr = iinr[iidx];
1443 i_coord_offset = DIM*inr;
1445 /* Load i particle coords and add shift vector */
1446 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1447 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1449 fix0 = _mm_setzero_ps();
1450 fiy0 = _mm_setzero_ps();
1451 fiz0 = _mm_setzero_ps();
1452 fix1 = _mm_setzero_ps();
1453 fiy1 = _mm_setzero_ps();
1454 fiz1 = _mm_setzero_ps();
1455 fix2 = _mm_setzero_ps();
1456 fiy2 = _mm_setzero_ps();
1457 fiz2 = _mm_setzero_ps();
1459 /* Start inner kernel loop */
1460 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1463 /* Get j neighbor index, and coordinate index */
1464 jnrA = jjnr[jidx];
1465 jnrB = jjnr[jidx+1];
1466 jnrC = jjnr[jidx+2];
1467 jnrD = jjnr[jidx+3];
1468 j_coord_offsetA = DIM*jnrA;
1469 j_coord_offsetB = DIM*jnrB;
1470 j_coord_offsetC = DIM*jnrC;
1471 j_coord_offsetD = DIM*jnrD;
1473 /* load j atom coordinates */
1474 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1475 x+j_coord_offsetC,x+j_coord_offsetD,
1476 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1478 /* Calculate displacement vector */
1479 dx00 = _mm_sub_ps(ix0,jx0);
1480 dy00 = _mm_sub_ps(iy0,jy0);
1481 dz00 = _mm_sub_ps(iz0,jz0);
1482 dx01 = _mm_sub_ps(ix0,jx1);
1483 dy01 = _mm_sub_ps(iy0,jy1);
1484 dz01 = _mm_sub_ps(iz0,jz1);
1485 dx02 = _mm_sub_ps(ix0,jx2);
1486 dy02 = _mm_sub_ps(iy0,jy2);
1487 dz02 = _mm_sub_ps(iz0,jz2);
1488 dx10 = _mm_sub_ps(ix1,jx0);
1489 dy10 = _mm_sub_ps(iy1,jy0);
1490 dz10 = _mm_sub_ps(iz1,jz0);
1491 dx11 = _mm_sub_ps(ix1,jx1);
1492 dy11 = _mm_sub_ps(iy1,jy1);
1493 dz11 = _mm_sub_ps(iz1,jz1);
1494 dx12 = _mm_sub_ps(ix1,jx2);
1495 dy12 = _mm_sub_ps(iy1,jy2);
1496 dz12 = _mm_sub_ps(iz1,jz2);
1497 dx20 = _mm_sub_ps(ix2,jx0);
1498 dy20 = _mm_sub_ps(iy2,jy0);
1499 dz20 = _mm_sub_ps(iz2,jz0);
1500 dx21 = _mm_sub_ps(ix2,jx1);
1501 dy21 = _mm_sub_ps(iy2,jy1);
1502 dz21 = _mm_sub_ps(iz2,jz1);
1503 dx22 = _mm_sub_ps(ix2,jx2);
1504 dy22 = _mm_sub_ps(iy2,jy2);
1505 dz22 = _mm_sub_ps(iz2,jz2);
1507 /* Calculate squared distance and things based on it */
1508 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1509 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1510 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1511 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1512 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1513 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1514 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1515 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1516 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1518 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1519 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1520 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1521 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1522 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1523 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1524 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1525 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1526 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1528 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1529 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1530 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1531 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1532 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1533 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1534 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1535 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1536 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1538 fjx0 = _mm_setzero_ps();
1539 fjy0 = _mm_setzero_ps();
1540 fjz0 = _mm_setzero_ps();
1541 fjx1 = _mm_setzero_ps();
1542 fjy1 = _mm_setzero_ps();
1543 fjz1 = _mm_setzero_ps();
1544 fjx2 = _mm_setzero_ps();
1545 fjy2 = _mm_setzero_ps();
1546 fjz2 = _mm_setzero_ps();
1548 /**************************
1549 * CALCULATE INTERACTIONS *
1550 **************************/
1552 r00 = _mm_mul_ps(rsq00,rinv00);
1554 /* EWALD ELECTROSTATICS */
1556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1557 ewrt = _mm_mul_ps(r00,ewtabscale);
1558 ewitab = _mm_cvttps_epi32(ewrt);
1559 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1560 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1561 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1562 &ewtabF,&ewtabFn);
1563 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1564 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1566 /* Analytical LJ-PME */
1567 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1568 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1569 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1570 exponent = gmx_simd_exp_r(ewcljrsq);
1571 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1572 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1573 /* f6A = 6 * C6grid * (1 - poly) */
1574 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1575 /* f6B = C6grid * exponent * beta^6 */
1576 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1577 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1578 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1580 fscal = _mm_add_ps(felec,fvdw);
1582 /* Calculate temporary vectorial force */
1583 tx = _mm_mul_ps(fscal,dx00);
1584 ty = _mm_mul_ps(fscal,dy00);
1585 tz = _mm_mul_ps(fscal,dz00);
1587 /* Update vectorial force */
1588 fix0 = _mm_add_ps(fix0,tx);
1589 fiy0 = _mm_add_ps(fiy0,ty);
1590 fiz0 = _mm_add_ps(fiz0,tz);
1592 fjx0 = _mm_add_ps(fjx0,tx);
1593 fjy0 = _mm_add_ps(fjy0,ty);
1594 fjz0 = _mm_add_ps(fjz0,tz);
1596 /**************************
1597 * CALCULATE INTERACTIONS *
1598 **************************/
1600 r01 = _mm_mul_ps(rsq01,rinv01);
1602 /* EWALD ELECTROSTATICS */
1604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1605 ewrt = _mm_mul_ps(r01,ewtabscale);
1606 ewitab = _mm_cvttps_epi32(ewrt);
1607 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1608 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1609 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1610 &ewtabF,&ewtabFn);
1611 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1612 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1614 fscal = felec;
1616 /* Calculate temporary vectorial force */
1617 tx = _mm_mul_ps(fscal,dx01);
1618 ty = _mm_mul_ps(fscal,dy01);
1619 tz = _mm_mul_ps(fscal,dz01);
1621 /* Update vectorial force */
1622 fix0 = _mm_add_ps(fix0,tx);
1623 fiy0 = _mm_add_ps(fiy0,ty);
1624 fiz0 = _mm_add_ps(fiz0,tz);
1626 fjx1 = _mm_add_ps(fjx1,tx);
1627 fjy1 = _mm_add_ps(fjy1,ty);
1628 fjz1 = _mm_add_ps(fjz1,tz);
1630 /**************************
1631 * CALCULATE INTERACTIONS *
1632 **************************/
1634 r02 = _mm_mul_ps(rsq02,rinv02);
1636 /* EWALD ELECTROSTATICS */
1638 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1639 ewrt = _mm_mul_ps(r02,ewtabscale);
1640 ewitab = _mm_cvttps_epi32(ewrt);
1641 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1642 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1643 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1644 &ewtabF,&ewtabFn);
1645 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1646 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1648 fscal = felec;
1650 /* Calculate temporary vectorial force */
1651 tx = _mm_mul_ps(fscal,dx02);
1652 ty = _mm_mul_ps(fscal,dy02);
1653 tz = _mm_mul_ps(fscal,dz02);
1655 /* Update vectorial force */
1656 fix0 = _mm_add_ps(fix0,tx);
1657 fiy0 = _mm_add_ps(fiy0,ty);
1658 fiz0 = _mm_add_ps(fiz0,tz);
1660 fjx2 = _mm_add_ps(fjx2,tx);
1661 fjy2 = _mm_add_ps(fjy2,ty);
1662 fjz2 = _mm_add_ps(fjz2,tz);
1664 /**************************
1665 * CALCULATE INTERACTIONS *
1666 **************************/
1668 r10 = _mm_mul_ps(rsq10,rinv10);
1670 /* EWALD ELECTROSTATICS */
1672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1673 ewrt = _mm_mul_ps(r10,ewtabscale);
1674 ewitab = _mm_cvttps_epi32(ewrt);
1675 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1676 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1677 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1678 &ewtabF,&ewtabFn);
1679 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1680 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1682 fscal = felec;
1684 /* Calculate temporary vectorial force */
1685 tx = _mm_mul_ps(fscal,dx10);
1686 ty = _mm_mul_ps(fscal,dy10);
1687 tz = _mm_mul_ps(fscal,dz10);
1689 /* Update vectorial force */
1690 fix1 = _mm_add_ps(fix1,tx);
1691 fiy1 = _mm_add_ps(fiy1,ty);
1692 fiz1 = _mm_add_ps(fiz1,tz);
1694 fjx0 = _mm_add_ps(fjx0,tx);
1695 fjy0 = _mm_add_ps(fjy0,ty);
1696 fjz0 = _mm_add_ps(fjz0,tz);
1698 /**************************
1699 * CALCULATE INTERACTIONS *
1700 **************************/
1702 r11 = _mm_mul_ps(rsq11,rinv11);
1704 /* EWALD ELECTROSTATICS */
1706 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1707 ewrt = _mm_mul_ps(r11,ewtabscale);
1708 ewitab = _mm_cvttps_epi32(ewrt);
1709 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1710 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1711 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1712 &ewtabF,&ewtabFn);
1713 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1714 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1716 fscal = felec;
1718 /* Calculate temporary vectorial force */
1719 tx = _mm_mul_ps(fscal,dx11);
1720 ty = _mm_mul_ps(fscal,dy11);
1721 tz = _mm_mul_ps(fscal,dz11);
1723 /* Update vectorial force */
1724 fix1 = _mm_add_ps(fix1,tx);
1725 fiy1 = _mm_add_ps(fiy1,ty);
1726 fiz1 = _mm_add_ps(fiz1,tz);
1728 fjx1 = _mm_add_ps(fjx1,tx);
1729 fjy1 = _mm_add_ps(fjy1,ty);
1730 fjz1 = _mm_add_ps(fjz1,tz);
1732 /**************************
1733 * CALCULATE INTERACTIONS *
1734 **************************/
1736 r12 = _mm_mul_ps(rsq12,rinv12);
1738 /* EWALD ELECTROSTATICS */
1740 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1741 ewrt = _mm_mul_ps(r12,ewtabscale);
1742 ewitab = _mm_cvttps_epi32(ewrt);
1743 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1744 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1745 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1746 &ewtabF,&ewtabFn);
1747 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1748 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1750 fscal = felec;
1752 /* Calculate temporary vectorial force */
1753 tx = _mm_mul_ps(fscal,dx12);
1754 ty = _mm_mul_ps(fscal,dy12);
1755 tz = _mm_mul_ps(fscal,dz12);
1757 /* Update vectorial force */
1758 fix1 = _mm_add_ps(fix1,tx);
1759 fiy1 = _mm_add_ps(fiy1,ty);
1760 fiz1 = _mm_add_ps(fiz1,tz);
1762 fjx2 = _mm_add_ps(fjx2,tx);
1763 fjy2 = _mm_add_ps(fjy2,ty);
1764 fjz2 = _mm_add_ps(fjz2,tz);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 r20 = _mm_mul_ps(rsq20,rinv20);
1772 /* EWALD ELECTROSTATICS */
1774 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1775 ewrt = _mm_mul_ps(r20,ewtabscale);
1776 ewitab = _mm_cvttps_epi32(ewrt);
1777 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1778 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1779 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1780 &ewtabF,&ewtabFn);
1781 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1782 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1784 fscal = felec;
1786 /* Calculate temporary vectorial force */
1787 tx = _mm_mul_ps(fscal,dx20);
1788 ty = _mm_mul_ps(fscal,dy20);
1789 tz = _mm_mul_ps(fscal,dz20);
1791 /* Update vectorial force */
1792 fix2 = _mm_add_ps(fix2,tx);
1793 fiy2 = _mm_add_ps(fiy2,ty);
1794 fiz2 = _mm_add_ps(fiz2,tz);
1796 fjx0 = _mm_add_ps(fjx0,tx);
1797 fjy0 = _mm_add_ps(fjy0,ty);
1798 fjz0 = _mm_add_ps(fjz0,tz);
1800 /**************************
1801 * CALCULATE INTERACTIONS *
1802 **************************/
1804 r21 = _mm_mul_ps(rsq21,rinv21);
1806 /* EWALD ELECTROSTATICS */
1808 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1809 ewrt = _mm_mul_ps(r21,ewtabscale);
1810 ewitab = _mm_cvttps_epi32(ewrt);
1811 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1812 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1813 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1814 &ewtabF,&ewtabFn);
1815 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1816 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1818 fscal = felec;
1820 /* Calculate temporary vectorial force */
1821 tx = _mm_mul_ps(fscal,dx21);
1822 ty = _mm_mul_ps(fscal,dy21);
1823 tz = _mm_mul_ps(fscal,dz21);
1825 /* Update vectorial force */
1826 fix2 = _mm_add_ps(fix2,tx);
1827 fiy2 = _mm_add_ps(fiy2,ty);
1828 fiz2 = _mm_add_ps(fiz2,tz);
1830 fjx1 = _mm_add_ps(fjx1,tx);
1831 fjy1 = _mm_add_ps(fjy1,ty);
1832 fjz1 = _mm_add_ps(fjz1,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r22 = _mm_mul_ps(rsq22,rinv22);
1840 /* EWALD ELECTROSTATICS */
1842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1843 ewrt = _mm_mul_ps(r22,ewtabscale);
1844 ewitab = _mm_cvttps_epi32(ewrt);
1845 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1846 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1847 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1848 &ewtabF,&ewtabFn);
1849 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1850 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1852 fscal = felec;
1854 /* Calculate temporary vectorial force */
1855 tx = _mm_mul_ps(fscal,dx22);
1856 ty = _mm_mul_ps(fscal,dy22);
1857 tz = _mm_mul_ps(fscal,dz22);
1859 /* Update vectorial force */
1860 fix2 = _mm_add_ps(fix2,tx);
1861 fiy2 = _mm_add_ps(fiy2,ty);
1862 fiz2 = _mm_add_ps(fiz2,tz);
1864 fjx2 = _mm_add_ps(fjx2,tx);
1865 fjy2 = _mm_add_ps(fjy2,ty);
1866 fjz2 = _mm_add_ps(fjz2,tz);
1868 fjptrA = f+j_coord_offsetA;
1869 fjptrB = f+j_coord_offsetB;
1870 fjptrC = f+j_coord_offsetC;
1871 fjptrD = f+j_coord_offsetD;
1873 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1874 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1876 /* Inner loop uses 347 flops */
1879 if(jidx<j_index_end)
1882 /* Get j neighbor index, and coordinate index */
1883 jnrlistA = jjnr[jidx];
1884 jnrlistB = jjnr[jidx+1];
1885 jnrlistC = jjnr[jidx+2];
1886 jnrlistD = jjnr[jidx+3];
1887 /* Sign of each element will be negative for non-real atoms.
1888 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1889 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1891 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1892 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1893 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1894 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1895 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1896 j_coord_offsetA = DIM*jnrA;
1897 j_coord_offsetB = DIM*jnrB;
1898 j_coord_offsetC = DIM*jnrC;
1899 j_coord_offsetD = DIM*jnrD;
1901 /* load j atom coordinates */
1902 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1903 x+j_coord_offsetC,x+j_coord_offsetD,
1904 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1906 /* Calculate displacement vector */
1907 dx00 = _mm_sub_ps(ix0,jx0);
1908 dy00 = _mm_sub_ps(iy0,jy0);
1909 dz00 = _mm_sub_ps(iz0,jz0);
1910 dx01 = _mm_sub_ps(ix0,jx1);
1911 dy01 = _mm_sub_ps(iy0,jy1);
1912 dz01 = _mm_sub_ps(iz0,jz1);
1913 dx02 = _mm_sub_ps(ix0,jx2);
1914 dy02 = _mm_sub_ps(iy0,jy2);
1915 dz02 = _mm_sub_ps(iz0,jz2);
1916 dx10 = _mm_sub_ps(ix1,jx0);
1917 dy10 = _mm_sub_ps(iy1,jy0);
1918 dz10 = _mm_sub_ps(iz1,jz0);
1919 dx11 = _mm_sub_ps(ix1,jx1);
1920 dy11 = _mm_sub_ps(iy1,jy1);
1921 dz11 = _mm_sub_ps(iz1,jz1);
1922 dx12 = _mm_sub_ps(ix1,jx2);
1923 dy12 = _mm_sub_ps(iy1,jy2);
1924 dz12 = _mm_sub_ps(iz1,jz2);
1925 dx20 = _mm_sub_ps(ix2,jx0);
1926 dy20 = _mm_sub_ps(iy2,jy0);
1927 dz20 = _mm_sub_ps(iz2,jz0);
1928 dx21 = _mm_sub_ps(ix2,jx1);
1929 dy21 = _mm_sub_ps(iy2,jy1);
1930 dz21 = _mm_sub_ps(iz2,jz1);
1931 dx22 = _mm_sub_ps(ix2,jx2);
1932 dy22 = _mm_sub_ps(iy2,jy2);
1933 dz22 = _mm_sub_ps(iz2,jz2);
1935 /* Calculate squared distance and things based on it */
1936 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1937 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1938 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1939 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1940 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1941 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1942 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1943 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1944 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1946 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1947 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1948 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1949 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1950 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1951 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1952 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1953 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1954 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1956 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1957 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1958 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1959 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1960 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1961 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1962 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1963 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1964 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1966 fjx0 = _mm_setzero_ps();
1967 fjy0 = _mm_setzero_ps();
1968 fjz0 = _mm_setzero_ps();
1969 fjx1 = _mm_setzero_ps();
1970 fjy1 = _mm_setzero_ps();
1971 fjz1 = _mm_setzero_ps();
1972 fjx2 = _mm_setzero_ps();
1973 fjy2 = _mm_setzero_ps();
1974 fjz2 = _mm_setzero_ps();
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 r00 = _mm_mul_ps(rsq00,rinv00);
1981 r00 = _mm_andnot_ps(dummy_mask,r00);
1983 /* EWALD ELECTROSTATICS */
1985 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986 ewrt = _mm_mul_ps(r00,ewtabscale);
1987 ewitab = _mm_cvttps_epi32(ewrt);
1988 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1989 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1990 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1991 &ewtabF,&ewtabFn);
1992 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1993 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1995 /* Analytical LJ-PME */
1996 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1997 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1998 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1999 exponent = gmx_simd_exp_r(ewcljrsq);
2000 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2001 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2002 /* f6A = 6 * C6grid * (1 - poly) */
2003 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2004 /* f6B = C6grid * exponent * beta^6 */
2005 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2006 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2007 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2009 fscal = _mm_add_ps(felec,fvdw);
2011 fscal = _mm_andnot_ps(dummy_mask,fscal);
2013 /* Calculate temporary vectorial force */
2014 tx = _mm_mul_ps(fscal,dx00);
2015 ty = _mm_mul_ps(fscal,dy00);
2016 tz = _mm_mul_ps(fscal,dz00);
2018 /* Update vectorial force */
2019 fix0 = _mm_add_ps(fix0,tx);
2020 fiy0 = _mm_add_ps(fiy0,ty);
2021 fiz0 = _mm_add_ps(fiz0,tz);
2023 fjx0 = _mm_add_ps(fjx0,tx);
2024 fjy0 = _mm_add_ps(fjy0,ty);
2025 fjz0 = _mm_add_ps(fjz0,tz);
2027 /**************************
2028 * CALCULATE INTERACTIONS *
2029 **************************/
2031 r01 = _mm_mul_ps(rsq01,rinv01);
2032 r01 = _mm_andnot_ps(dummy_mask,r01);
2034 /* EWALD ELECTROSTATICS */
2036 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2037 ewrt = _mm_mul_ps(r01,ewtabscale);
2038 ewitab = _mm_cvttps_epi32(ewrt);
2039 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2040 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2041 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2042 &ewtabF,&ewtabFn);
2043 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2044 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2046 fscal = felec;
2048 fscal = _mm_andnot_ps(dummy_mask,fscal);
2050 /* Calculate temporary vectorial force */
2051 tx = _mm_mul_ps(fscal,dx01);
2052 ty = _mm_mul_ps(fscal,dy01);
2053 tz = _mm_mul_ps(fscal,dz01);
2055 /* Update vectorial force */
2056 fix0 = _mm_add_ps(fix0,tx);
2057 fiy0 = _mm_add_ps(fiy0,ty);
2058 fiz0 = _mm_add_ps(fiz0,tz);
2060 fjx1 = _mm_add_ps(fjx1,tx);
2061 fjy1 = _mm_add_ps(fjy1,ty);
2062 fjz1 = _mm_add_ps(fjz1,tz);
2064 /**************************
2065 * CALCULATE INTERACTIONS *
2066 **************************/
2068 r02 = _mm_mul_ps(rsq02,rinv02);
2069 r02 = _mm_andnot_ps(dummy_mask,r02);
2071 /* EWALD ELECTROSTATICS */
2073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2074 ewrt = _mm_mul_ps(r02,ewtabscale);
2075 ewitab = _mm_cvttps_epi32(ewrt);
2076 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2077 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2078 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2079 &ewtabF,&ewtabFn);
2080 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2081 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2083 fscal = felec;
2085 fscal = _mm_andnot_ps(dummy_mask,fscal);
2087 /* Calculate temporary vectorial force */
2088 tx = _mm_mul_ps(fscal,dx02);
2089 ty = _mm_mul_ps(fscal,dy02);
2090 tz = _mm_mul_ps(fscal,dz02);
2092 /* Update vectorial force */
2093 fix0 = _mm_add_ps(fix0,tx);
2094 fiy0 = _mm_add_ps(fiy0,ty);
2095 fiz0 = _mm_add_ps(fiz0,tz);
2097 fjx2 = _mm_add_ps(fjx2,tx);
2098 fjy2 = _mm_add_ps(fjy2,ty);
2099 fjz2 = _mm_add_ps(fjz2,tz);
2101 /**************************
2102 * CALCULATE INTERACTIONS *
2103 **************************/
2105 r10 = _mm_mul_ps(rsq10,rinv10);
2106 r10 = _mm_andnot_ps(dummy_mask,r10);
2108 /* EWALD ELECTROSTATICS */
2110 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111 ewrt = _mm_mul_ps(r10,ewtabscale);
2112 ewitab = _mm_cvttps_epi32(ewrt);
2113 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2114 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2115 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2116 &ewtabF,&ewtabFn);
2117 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2118 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2120 fscal = felec;
2122 fscal = _mm_andnot_ps(dummy_mask,fscal);
2124 /* Calculate temporary vectorial force */
2125 tx = _mm_mul_ps(fscal,dx10);
2126 ty = _mm_mul_ps(fscal,dy10);
2127 tz = _mm_mul_ps(fscal,dz10);
2129 /* Update vectorial force */
2130 fix1 = _mm_add_ps(fix1,tx);
2131 fiy1 = _mm_add_ps(fiy1,ty);
2132 fiz1 = _mm_add_ps(fiz1,tz);
2134 fjx0 = _mm_add_ps(fjx0,tx);
2135 fjy0 = _mm_add_ps(fjy0,ty);
2136 fjz0 = _mm_add_ps(fjz0,tz);
2138 /**************************
2139 * CALCULATE INTERACTIONS *
2140 **************************/
2142 r11 = _mm_mul_ps(rsq11,rinv11);
2143 r11 = _mm_andnot_ps(dummy_mask,r11);
2145 /* EWALD ELECTROSTATICS */
2147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148 ewrt = _mm_mul_ps(r11,ewtabscale);
2149 ewitab = _mm_cvttps_epi32(ewrt);
2150 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2151 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2152 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2153 &ewtabF,&ewtabFn);
2154 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2155 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2157 fscal = felec;
2159 fscal = _mm_andnot_ps(dummy_mask,fscal);
2161 /* Calculate temporary vectorial force */
2162 tx = _mm_mul_ps(fscal,dx11);
2163 ty = _mm_mul_ps(fscal,dy11);
2164 tz = _mm_mul_ps(fscal,dz11);
2166 /* Update vectorial force */
2167 fix1 = _mm_add_ps(fix1,tx);
2168 fiy1 = _mm_add_ps(fiy1,ty);
2169 fiz1 = _mm_add_ps(fiz1,tz);
2171 fjx1 = _mm_add_ps(fjx1,tx);
2172 fjy1 = _mm_add_ps(fjy1,ty);
2173 fjz1 = _mm_add_ps(fjz1,tz);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r12 = _mm_mul_ps(rsq12,rinv12);
2180 r12 = _mm_andnot_ps(dummy_mask,r12);
2182 /* EWALD ELECTROSTATICS */
2184 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2185 ewrt = _mm_mul_ps(r12,ewtabscale);
2186 ewitab = _mm_cvttps_epi32(ewrt);
2187 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2188 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2189 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2190 &ewtabF,&ewtabFn);
2191 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2192 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2194 fscal = felec;
2196 fscal = _mm_andnot_ps(dummy_mask,fscal);
2198 /* Calculate temporary vectorial force */
2199 tx = _mm_mul_ps(fscal,dx12);
2200 ty = _mm_mul_ps(fscal,dy12);
2201 tz = _mm_mul_ps(fscal,dz12);
2203 /* Update vectorial force */
2204 fix1 = _mm_add_ps(fix1,tx);
2205 fiy1 = _mm_add_ps(fiy1,ty);
2206 fiz1 = _mm_add_ps(fiz1,tz);
2208 fjx2 = _mm_add_ps(fjx2,tx);
2209 fjy2 = _mm_add_ps(fjy2,ty);
2210 fjz2 = _mm_add_ps(fjz2,tz);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 r20 = _mm_mul_ps(rsq20,rinv20);
2217 r20 = _mm_andnot_ps(dummy_mask,r20);
2219 /* EWALD ELECTROSTATICS */
2221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222 ewrt = _mm_mul_ps(r20,ewtabscale);
2223 ewitab = _mm_cvttps_epi32(ewrt);
2224 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2225 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2226 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2227 &ewtabF,&ewtabFn);
2228 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2229 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2231 fscal = felec;
2233 fscal = _mm_andnot_ps(dummy_mask,fscal);
2235 /* Calculate temporary vectorial force */
2236 tx = _mm_mul_ps(fscal,dx20);
2237 ty = _mm_mul_ps(fscal,dy20);
2238 tz = _mm_mul_ps(fscal,dz20);
2240 /* Update vectorial force */
2241 fix2 = _mm_add_ps(fix2,tx);
2242 fiy2 = _mm_add_ps(fiy2,ty);
2243 fiz2 = _mm_add_ps(fiz2,tz);
2245 fjx0 = _mm_add_ps(fjx0,tx);
2246 fjy0 = _mm_add_ps(fjy0,ty);
2247 fjz0 = _mm_add_ps(fjz0,tz);
2249 /**************************
2250 * CALCULATE INTERACTIONS *
2251 **************************/
2253 r21 = _mm_mul_ps(rsq21,rinv21);
2254 r21 = _mm_andnot_ps(dummy_mask,r21);
2256 /* EWALD ELECTROSTATICS */
2258 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2259 ewrt = _mm_mul_ps(r21,ewtabscale);
2260 ewitab = _mm_cvttps_epi32(ewrt);
2261 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2262 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2263 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2264 &ewtabF,&ewtabFn);
2265 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2266 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2268 fscal = felec;
2270 fscal = _mm_andnot_ps(dummy_mask,fscal);
2272 /* Calculate temporary vectorial force */
2273 tx = _mm_mul_ps(fscal,dx21);
2274 ty = _mm_mul_ps(fscal,dy21);
2275 tz = _mm_mul_ps(fscal,dz21);
2277 /* Update vectorial force */
2278 fix2 = _mm_add_ps(fix2,tx);
2279 fiy2 = _mm_add_ps(fiy2,ty);
2280 fiz2 = _mm_add_ps(fiz2,tz);
2282 fjx1 = _mm_add_ps(fjx1,tx);
2283 fjy1 = _mm_add_ps(fjy1,ty);
2284 fjz1 = _mm_add_ps(fjz1,tz);
2286 /**************************
2287 * CALCULATE INTERACTIONS *
2288 **************************/
2290 r22 = _mm_mul_ps(rsq22,rinv22);
2291 r22 = _mm_andnot_ps(dummy_mask,r22);
2293 /* EWALD ELECTROSTATICS */
2295 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296 ewrt = _mm_mul_ps(r22,ewtabscale);
2297 ewitab = _mm_cvttps_epi32(ewrt);
2298 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2299 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2300 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2301 &ewtabF,&ewtabFn);
2302 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2303 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2305 fscal = felec;
2307 fscal = _mm_andnot_ps(dummy_mask,fscal);
2309 /* Calculate temporary vectorial force */
2310 tx = _mm_mul_ps(fscal,dx22);
2311 ty = _mm_mul_ps(fscal,dy22);
2312 tz = _mm_mul_ps(fscal,dz22);
2314 /* Update vectorial force */
2315 fix2 = _mm_add_ps(fix2,tx);
2316 fiy2 = _mm_add_ps(fiy2,ty);
2317 fiz2 = _mm_add_ps(fiz2,tz);
2319 fjx2 = _mm_add_ps(fjx2,tx);
2320 fjy2 = _mm_add_ps(fjy2,ty);
2321 fjz2 = _mm_add_ps(fjz2,tz);
2323 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2324 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2325 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2326 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2328 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2329 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2331 /* Inner loop uses 356 flops */
2334 /* End of innermost loop */
2336 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2337 f+i_coord_offset,fshift+i_shift_offset);
2339 /* Increment number of inner iterations */
2340 inneriter += j_index_end - j_index_start;
2342 /* Outer loop uses 18 flops */
2345 /* Increment number of outer iterations */
2346 outeriter += nri;
2348 /* Update outer/inner flops */
2350 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*356);