Remove all unnecessary HAVE_CONFIG_H
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecEw_VdwCSTab_GeomW3W3_avx_128_fma_double.c
blobf51ebf534baa6e28d24e4414c1051a5f340d3093
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
38 #include "config.h"
40 #include <math.h>
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
57 void
58 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB;
75 int j_coord_offsetA,j_coord_offsetB;
76 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real rcutoff_scalar;
78 real *shiftvec,*fshift,*x,*f;
79 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 int vdwioffset0;
81 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 int vdwioffset1;
83 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 int vdwioffset2;
85 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 int vdwjidx0A,vdwjidx0B;
87 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
88 int vdwjidx1A,vdwjidx1B;
89 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
90 int vdwjidx2A,vdwjidx2B;
91 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
92 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
94 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
95 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
102 real *charge;
103 int nvdwtype;
104 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105 int *vdwtype;
106 real *vdwparam;
107 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
109 __m128i vfitab;
110 __m128i ifour = _mm_set1_epi32(4);
111 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
112 real *vftab;
113 __m128i ewitab;
114 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
115 real *ewtab;
116 __m128d dummy_mask,cutoff_mask;
117 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
118 __m128d one = _mm_set1_pd(1.0);
119 __m128d two = _mm_set1_pd(2.0);
120 x = xx[0];
121 f = ff[0];
123 nri = nlist->nri;
124 iinr = nlist->iinr;
125 jindex = nlist->jindex;
126 jjnr = nlist->jjnr;
127 shiftidx = nlist->shift;
128 gid = nlist->gid;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm_set1_pd(fr->epsfac);
132 charge = mdatoms->chargeA;
133 nvdwtype = fr->ntype;
134 vdwparam = fr->nbfp;
135 vdwtype = mdatoms->typeA;
137 vftab = kernel_data->table_vdw->data;
138 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
140 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
148 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
149 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
150 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
152 jq0 = _mm_set1_pd(charge[inr+0]);
153 jq1 = _mm_set1_pd(charge[inr+1]);
154 jq2 = _mm_set1_pd(charge[inr+2]);
155 vdwjidx0A = 2*vdwtype[inr+0];
156 qq00 = _mm_mul_pd(iq0,jq0);
157 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
158 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
159 qq01 = _mm_mul_pd(iq0,jq1);
160 qq02 = _mm_mul_pd(iq0,jq2);
161 qq10 = _mm_mul_pd(iq1,jq0);
162 qq11 = _mm_mul_pd(iq1,jq1);
163 qq12 = _mm_mul_pd(iq1,jq2);
164 qq20 = _mm_mul_pd(iq2,jq0);
165 qq21 = _mm_mul_pd(iq2,jq1);
166 qq22 = _mm_mul_pd(iq2,jq2);
168 /* Avoid stupid compiler warnings */
169 jnrA = jnrB = 0;
170 j_coord_offsetA = 0;
171 j_coord_offsetB = 0;
173 outeriter = 0;
174 inneriter = 0;
176 /* Start outer loop over neighborlists */
177 for(iidx=0; iidx<nri; iidx++)
179 /* Load shift vector for this list */
180 i_shift_offset = DIM*shiftidx[iidx];
182 /* Load limits for loop over neighbors */
183 j_index_start = jindex[iidx];
184 j_index_end = jindex[iidx+1];
186 /* Get outer coordinate index */
187 inr = iinr[iidx];
188 i_coord_offset = DIM*inr;
190 /* Load i particle coords and add shift vector */
191 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
192 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
194 fix0 = _mm_setzero_pd();
195 fiy0 = _mm_setzero_pd();
196 fiz0 = _mm_setzero_pd();
197 fix1 = _mm_setzero_pd();
198 fiy1 = _mm_setzero_pd();
199 fiz1 = _mm_setzero_pd();
200 fix2 = _mm_setzero_pd();
201 fiy2 = _mm_setzero_pd();
202 fiz2 = _mm_setzero_pd();
204 /* Reset potential sums */
205 velecsum = _mm_setzero_pd();
206 vvdwsum = _mm_setzero_pd();
208 /* Start inner kernel loop */
209 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
212 /* Get j neighbor index, and coordinate index */
213 jnrA = jjnr[jidx];
214 jnrB = jjnr[jidx+1];
215 j_coord_offsetA = DIM*jnrA;
216 j_coord_offsetB = DIM*jnrB;
218 /* load j atom coordinates */
219 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
220 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
222 /* Calculate displacement vector */
223 dx00 = _mm_sub_pd(ix0,jx0);
224 dy00 = _mm_sub_pd(iy0,jy0);
225 dz00 = _mm_sub_pd(iz0,jz0);
226 dx01 = _mm_sub_pd(ix0,jx1);
227 dy01 = _mm_sub_pd(iy0,jy1);
228 dz01 = _mm_sub_pd(iz0,jz1);
229 dx02 = _mm_sub_pd(ix0,jx2);
230 dy02 = _mm_sub_pd(iy0,jy2);
231 dz02 = _mm_sub_pd(iz0,jz2);
232 dx10 = _mm_sub_pd(ix1,jx0);
233 dy10 = _mm_sub_pd(iy1,jy0);
234 dz10 = _mm_sub_pd(iz1,jz0);
235 dx11 = _mm_sub_pd(ix1,jx1);
236 dy11 = _mm_sub_pd(iy1,jy1);
237 dz11 = _mm_sub_pd(iz1,jz1);
238 dx12 = _mm_sub_pd(ix1,jx2);
239 dy12 = _mm_sub_pd(iy1,jy2);
240 dz12 = _mm_sub_pd(iz1,jz2);
241 dx20 = _mm_sub_pd(ix2,jx0);
242 dy20 = _mm_sub_pd(iy2,jy0);
243 dz20 = _mm_sub_pd(iz2,jz0);
244 dx21 = _mm_sub_pd(ix2,jx1);
245 dy21 = _mm_sub_pd(iy2,jy1);
246 dz21 = _mm_sub_pd(iz2,jz1);
247 dx22 = _mm_sub_pd(ix2,jx2);
248 dy22 = _mm_sub_pd(iy2,jy2);
249 dz22 = _mm_sub_pd(iz2,jz2);
251 /* Calculate squared distance and things based on it */
252 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
253 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
254 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
255 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
256 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
257 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
258 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
259 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
260 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
262 rinv00 = gmx_mm_invsqrt_pd(rsq00);
263 rinv01 = gmx_mm_invsqrt_pd(rsq01);
264 rinv02 = gmx_mm_invsqrt_pd(rsq02);
265 rinv10 = gmx_mm_invsqrt_pd(rsq10);
266 rinv11 = gmx_mm_invsqrt_pd(rsq11);
267 rinv12 = gmx_mm_invsqrt_pd(rsq12);
268 rinv20 = gmx_mm_invsqrt_pd(rsq20);
269 rinv21 = gmx_mm_invsqrt_pd(rsq21);
270 rinv22 = gmx_mm_invsqrt_pd(rsq22);
272 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
273 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
274 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
275 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
276 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
277 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
278 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
279 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
280 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
282 fjx0 = _mm_setzero_pd();
283 fjy0 = _mm_setzero_pd();
284 fjz0 = _mm_setzero_pd();
285 fjx1 = _mm_setzero_pd();
286 fjy1 = _mm_setzero_pd();
287 fjz1 = _mm_setzero_pd();
288 fjx2 = _mm_setzero_pd();
289 fjy2 = _mm_setzero_pd();
290 fjz2 = _mm_setzero_pd();
292 /**************************
293 * CALCULATE INTERACTIONS *
294 **************************/
296 r00 = _mm_mul_pd(rsq00,rinv00);
298 /* Calculate table index by multiplying r with table scale and truncate to integer */
299 rt = _mm_mul_pd(r00,vftabscale);
300 vfitab = _mm_cvttpd_epi32(rt);
301 #ifdef __XOP__
302 vfeps = _mm_frcz_pd(rt);
303 #else
304 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
305 #endif
306 twovfeps = _mm_add_pd(vfeps,vfeps);
307 vfitab = _mm_slli_epi32(vfitab,3);
309 /* EWALD ELECTROSTATICS */
311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
312 ewrt = _mm_mul_pd(r00,ewtabscale);
313 ewitab = _mm_cvttpd_epi32(ewrt);
314 #ifdef __XOP__
315 eweps = _mm_frcz_pd(ewrt);
316 #else
317 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
318 #endif
319 twoeweps = _mm_add_pd(eweps,eweps);
320 ewitab = _mm_slli_epi32(ewitab,2);
321 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
322 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
323 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
324 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
325 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
326 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
327 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
328 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
329 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
330 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
332 /* CUBIC SPLINE TABLE DISPERSION */
333 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
334 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
335 GMX_MM_TRANSPOSE2_PD(Y,F);
336 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
337 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
338 GMX_MM_TRANSPOSE2_PD(G,H);
339 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
340 VV = _mm_macc_pd(vfeps,Fp,Y);
341 vvdw6 = _mm_mul_pd(c6_00,VV);
342 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
343 fvdw6 = _mm_mul_pd(c6_00,FF);
345 /* CUBIC SPLINE TABLE REPULSION */
346 vfitab = _mm_add_epi32(vfitab,ifour);
347 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
348 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
349 GMX_MM_TRANSPOSE2_PD(Y,F);
350 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
351 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
352 GMX_MM_TRANSPOSE2_PD(G,H);
353 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
354 VV = _mm_macc_pd(vfeps,Fp,Y);
355 vvdw12 = _mm_mul_pd(c12_00,VV);
356 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
357 fvdw12 = _mm_mul_pd(c12_00,FF);
358 vvdw = _mm_add_pd(vvdw12,vvdw6);
359 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velecsum = _mm_add_pd(velecsum,velec);
363 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
365 fscal = _mm_add_pd(felec,fvdw);
367 /* Update vectorial force */
368 fix0 = _mm_macc_pd(dx00,fscal,fix0);
369 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
370 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
372 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
373 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
374 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r01 = _mm_mul_pd(rsq01,rinv01);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_pd(r01,ewtabscale);
386 ewitab = _mm_cvttpd_epi32(ewrt);
387 #ifdef __XOP__
388 eweps = _mm_frcz_pd(ewrt);
389 #else
390 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
391 #endif
392 twoeweps = _mm_add_pd(eweps,eweps);
393 ewitab = _mm_slli_epi32(ewitab,2);
394 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
395 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
396 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
397 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
398 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
399 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
400 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
401 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
402 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
403 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm_add_pd(velecsum,velec);
408 fscal = felec;
410 /* Update vectorial force */
411 fix0 = _mm_macc_pd(dx01,fscal,fix0);
412 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
413 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
415 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
416 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
417 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 r02 = _mm_mul_pd(rsq02,rinv02);
425 /* EWALD ELECTROSTATICS */
427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
428 ewrt = _mm_mul_pd(r02,ewtabscale);
429 ewitab = _mm_cvttpd_epi32(ewrt);
430 #ifdef __XOP__
431 eweps = _mm_frcz_pd(ewrt);
432 #else
433 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
434 #endif
435 twoeweps = _mm_add_pd(eweps,eweps);
436 ewitab = _mm_slli_epi32(ewitab,2);
437 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
438 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
439 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
440 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
441 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
442 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
443 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
444 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
445 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
446 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
448 /* Update potential sum for this i atom from the interaction with this j atom. */
449 velecsum = _mm_add_pd(velecsum,velec);
451 fscal = felec;
453 /* Update vectorial force */
454 fix0 = _mm_macc_pd(dx02,fscal,fix0);
455 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
456 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
458 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
459 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
460 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 r10 = _mm_mul_pd(rsq10,rinv10);
468 /* EWALD ELECTROSTATICS */
470 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
471 ewrt = _mm_mul_pd(r10,ewtabscale);
472 ewitab = _mm_cvttpd_epi32(ewrt);
473 #ifdef __XOP__
474 eweps = _mm_frcz_pd(ewrt);
475 #else
476 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
477 #endif
478 twoeweps = _mm_add_pd(eweps,eweps);
479 ewitab = _mm_slli_epi32(ewitab,2);
480 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
481 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
482 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
483 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
484 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
485 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
486 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
487 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
488 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
489 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
491 /* Update potential sum for this i atom from the interaction with this j atom. */
492 velecsum = _mm_add_pd(velecsum,velec);
494 fscal = felec;
496 /* Update vectorial force */
497 fix1 = _mm_macc_pd(dx10,fscal,fix1);
498 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
499 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
501 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
502 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
503 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r11 = _mm_mul_pd(rsq11,rinv11);
511 /* EWALD ELECTROSTATICS */
513 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514 ewrt = _mm_mul_pd(r11,ewtabscale);
515 ewitab = _mm_cvttpd_epi32(ewrt);
516 #ifdef __XOP__
517 eweps = _mm_frcz_pd(ewrt);
518 #else
519 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
520 #endif
521 twoeweps = _mm_add_pd(eweps,eweps);
522 ewitab = _mm_slli_epi32(ewitab,2);
523 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
524 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
525 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
526 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
527 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
528 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
529 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
530 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
531 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
532 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum = _mm_add_pd(velecsum,velec);
537 fscal = felec;
539 /* Update vectorial force */
540 fix1 = _mm_macc_pd(dx11,fscal,fix1);
541 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
542 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
544 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
545 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
546 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 r12 = _mm_mul_pd(rsq12,rinv12);
554 /* EWALD ELECTROSTATICS */
556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
557 ewrt = _mm_mul_pd(r12,ewtabscale);
558 ewitab = _mm_cvttpd_epi32(ewrt);
559 #ifdef __XOP__
560 eweps = _mm_frcz_pd(ewrt);
561 #else
562 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
563 #endif
564 twoeweps = _mm_add_pd(eweps,eweps);
565 ewitab = _mm_slli_epi32(ewitab,2);
566 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
567 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
568 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
569 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
570 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
571 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
572 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
573 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
574 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
575 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
577 /* Update potential sum for this i atom from the interaction with this j atom. */
578 velecsum = _mm_add_pd(velecsum,velec);
580 fscal = felec;
582 /* Update vectorial force */
583 fix1 = _mm_macc_pd(dx12,fscal,fix1);
584 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
585 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
587 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
588 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
589 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
591 /**************************
592 * CALCULATE INTERACTIONS *
593 **************************/
595 r20 = _mm_mul_pd(rsq20,rinv20);
597 /* EWALD ELECTROSTATICS */
599 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
600 ewrt = _mm_mul_pd(r20,ewtabscale);
601 ewitab = _mm_cvttpd_epi32(ewrt);
602 #ifdef __XOP__
603 eweps = _mm_frcz_pd(ewrt);
604 #else
605 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
606 #endif
607 twoeweps = _mm_add_pd(eweps,eweps);
608 ewitab = _mm_slli_epi32(ewitab,2);
609 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
610 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
611 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
612 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
613 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
614 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
615 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
616 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
617 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
618 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
620 /* Update potential sum for this i atom from the interaction with this j atom. */
621 velecsum = _mm_add_pd(velecsum,velec);
623 fscal = felec;
625 /* Update vectorial force */
626 fix2 = _mm_macc_pd(dx20,fscal,fix2);
627 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
628 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
630 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
631 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
632 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
634 /**************************
635 * CALCULATE INTERACTIONS *
636 **************************/
638 r21 = _mm_mul_pd(rsq21,rinv21);
640 /* EWALD ELECTROSTATICS */
642 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
643 ewrt = _mm_mul_pd(r21,ewtabscale);
644 ewitab = _mm_cvttpd_epi32(ewrt);
645 #ifdef __XOP__
646 eweps = _mm_frcz_pd(ewrt);
647 #else
648 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
649 #endif
650 twoeweps = _mm_add_pd(eweps,eweps);
651 ewitab = _mm_slli_epi32(ewitab,2);
652 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
653 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
654 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
655 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
656 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
657 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
658 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
659 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
660 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
661 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum = _mm_add_pd(velecsum,velec);
666 fscal = felec;
668 /* Update vectorial force */
669 fix2 = _mm_macc_pd(dx21,fscal,fix2);
670 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
671 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
673 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
674 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
675 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
677 /**************************
678 * CALCULATE INTERACTIONS *
679 **************************/
681 r22 = _mm_mul_pd(rsq22,rinv22);
683 /* EWALD ELECTROSTATICS */
685 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
686 ewrt = _mm_mul_pd(r22,ewtabscale);
687 ewitab = _mm_cvttpd_epi32(ewrt);
688 #ifdef __XOP__
689 eweps = _mm_frcz_pd(ewrt);
690 #else
691 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
692 #endif
693 twoeweps = _mm_add_pd(eweps,eweps);
694 ewitab = _mm_slli_epi32(ewitab,2);
695 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
696 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
697 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
698 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
699 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
700 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
701 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
702 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
703 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
704 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
706 /* Update potential sum for this i atom from the interaction with this j atom. */
707 velecsum = _mm_add_pd(velecsum,velec);
709 fscal = felec;
711 /* Update vectorial force */
712 fix2 = _mm_macc_pd(dx22,fscal,fix2);
713 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
714 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
716 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
717 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
718 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
720 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
722 /* Inner loop uses 430 flops */
725 if(jidx<j_index_end)
728 jnrA = jjnr[jidx];
729 j_coord_offsetA = DIM*jnrA;
731 /* load j atom coordinates */
732 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
733 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
735 /* Calculate displacement vector */
736 dx00 = _mm_sub_pd(ix0,jx0);
737 dy00 = _mm_sub_pd(iy0,jy0);
738 dz00 = _mm_sub_pd(iz0,jz0);
739 dx01 = _mm_sub_pd(ix0,jx1);
740 dy01 = _mm_sub_pd(iy0,jy1);
741 dz01 = _mm_sub_pd(iz0,jz1);
742 dx02 = _mm_sub_pd(ix0,jx2);
743 dy02 = _mm_sub_pd(iy0,jy2);
744 dz02 = _mm_sub_pd(iz0,jz2);
745 dx10 = _mm_sub_pd(ix1,jx0);
746 dy10 = _mm_sub_pd(iy1,jy0);
747 dz10 = _mm_sub_pd(iz1,jz0);
748 dx11 = _mm_sub_pd(ix1,jx1);
749 dy11 = _mm_sub_pd(iy1,jy1);
750 dz11 = _mm_sub_pd(iz1,jz1);
751 dx12 = _mm_sub_pd(ix1,jx2);
752 dy12 = _mm_sub_pd(iy1,jy2);
753 dz12 = _mm_sub_pd(iz1,jz2);
754 dx20 = _mm_sub_pd(ix2,jx0);
755 dy20 = _mm_sub_pd(iy2,jy0);
756 dz20 = _mm_sub_pd(iz2,jz0);
757 dx21 = _mm_sub_pd(ix2,jx1);
758 dy21 = _mm_sub_pd(iy2,jy1);
759 dz21 = _mm_sub_pd(iz2,jz1);
760 dx22 = _mm_sub_pd(ix2,jx2);
761 dy22 = _mm_sub_pd(iy2,jy2);
762 dz22 = _mm_sub_pd(iz2,jz2);
764 /* Calculate squared distance and things based on it */
765 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
766 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
767 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
768 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
769 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
770 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
771 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
772 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
773 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
775 rinv00 = gmx_mm_invsqrt_pd(rsq00);
776 rinv01 = gmx_mm_invsqrt_pd(rsq01);
777 rinv02 = gmx_mm_invsqrt_pd(rsq02);
778 rinv10 = gmx_mm_invsqrt_pd(rsq10);
779 rinv11 = gmx_mm_invsqrt_pd(rsq11);
780 rinv12 = gmx_mm_invsqrt_pd(rsq12);
781 rinv20 = gmx_mm_invsqrt_pd(rsq20);
782 rinv21 = gmx_mm_invsqrt_pd(rsq21);
783 rinv22 = gmx_mm_invsqrt_pd(rsq22);
785 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
786 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
787 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
788 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
789 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
790 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
791 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
792 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
793 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
795 fjx0 = _mm_setzero_pd();
796 fjy0 = _mm_setzero_pd();
797 fjz0 = _mm_setzero_pd();
798 fjx1 = _mm_setzero_pd();
799 fjy1 = _mm_setzero_pd();
800 fjz1 = _mm_setzero_pd();
801 fjx2 = _mm_setzero_pd();
802 fjy2 = _mm_setzero_pd();
803 fjz2 = _mm_setzero_pd();
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 r00 = _mm_mul_pd(rsq00,rinv00);
811 /* Calculate table index by multiplying r with table scale and truncate to integer */
812 rt = _mm_mul_pd(r00,vftabscale);
813 vfitab = _mm_cvttpd_epi32(rt);
814 #ifdef __XOP__
815 vfeps = _mm_frcz_pd(rt);
816 #else
817 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
818 #endif
819 twovfeps = _mm_add_pd(vfeps,vfeps);
820 vfitab = _mm_slli_epi32(vfitab,3);
822 /* EWALD ELECTROSTATICS */
824 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
825 ewrt = _mm_mul_pd(r00,ewtabscale);
826 ewitab = _mm_cvttpd_epi32(ewrt);
827 #ifdef __XOP__
828 eweps = _mm_frcz_pd(ewrt);
829 #else
830 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
831 #endif
832 twoeweps = _mm_add_pd(eweps,eweps);
833 ewitab = _mm_slli_epi32(ewitab,2);
834 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
835 ewtabD = _mm_setzero_pd();
836 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
837 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
838 ewtabFn = _mm_setzero_pd();
839 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
840 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
841 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
842 velec = _mm_mul_pd(qq00,_mm_sub_pd(rinv00,velec));
843 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
845 /* CUBIC SPLINE TABLE DISPERSION */
846 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
847 F = _mm_setzero_pd();
848 GMX_MM_TRANSPOSE2_PD(Y,F);
849 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
850 H = _mm_setzero_pd();
851 GMX_MM_TRANSPOSE2_PD(G,H);
852 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
853 VV = _mm_macc_pd(vfeps,Fp,Y);
854 vvdw6 = _mm_mul_pd(c6_00,VV);
855 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
856 fvdw6 = _mm_mul_pd(c6_00,FF);
858 /* CUBIC SPLINE TABLE REPULSION */
859 vfitab = _mm_add_epi32(vfitab,ifour);
860 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
861 F = _mm_setzero_pd();
862 GMX_MM_TRANSPOSE2_PD(Y,F);
863 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
864 H = _mm_setzero_pd();
865 GMX_MM_TRANSPOSE2_PD(G,H);
866 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
867 VV = _mm_macc_pd(vfeps,Fp,Y);
868 vvdw12 = _mm_mul_pd(c12_00,VV);
869 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
870 fvdw12 = _mm_mul_pd(c12_00,FF);
871 vvdw = _mm_add_pd(vvdw12,vvdw6);
872 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
876 velecsum = _mm_add_pd(velecsum,velec);
877 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
878 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
880 fscal = _mm_add_pd(felec,fvdw);
882 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
884 /* Update vectorial force */
885 fix0 = _mm_macc_pd(dx00,fscal,fix0);
886 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
887 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
889 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
890 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
891 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
893 /**************************
894 * CALCULATE INTERACTIONS *
895 **************************/
897 r01 = _mm_mul_pd(rsq01,rinv01);
899 /* EWALD ELECTROSTATICS */
901 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
902 ewrt = _mm_mul_pd(r01,ewtabscale);
903 ewitab = _mm_cvttpd_epi32(ewrt);
904 #ifdef __XOP__
905 eweps = _mm_frcz_pd(ewrt);
906 #else
907 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
908 #endif
909 twoeweps = _mm_add_pd(eweps,eweps);
910 ewitab = _mm_slli_epi32(ewitab,2);
911 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
912 ewtabD = _mm_setzero_pd();
913 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
914 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
915 ewtabFn = _mm_setzero_pd();
916 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
917 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
918 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
919 velec = _mm_mul_pd(qq01,_mm_sub_pd(rinv01,velec));
920 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
922 /* Update potential sum for this i atom from the interaction with this j atom. */
923 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
924 velecsum = _mm_add_pd(velecsum,velec);
926 fscal = felec;
928 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
930 /* Update vectorial force */
931 fix0 = _mm_macc_pd(dx01,fscal,fix0);
932 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
933 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
935 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
936 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
937 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 r02 = _mm_mul_pd(rsq02,rinv02);
945 /* EWALD ELECTROSTATICS */
947 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
948 ewrt = _mm_mul_pd(r02,ewtabscale);
949 ewitab = _mm_cvttpd_epi32(ewrt);
950 #ifdef __XOP__
951 eweps = _mm_frcz_pd(ewrt);
952 #else
953 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
954 #endif
955 twoeweps = _mm_add_pd(eweps,eweps);
956 ewitab = _mm_slli_epi32(ewitab,2);
957 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
958 ewtabD = _mm_setzero_pd();
959 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
960 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
961 ewtabFn = _mm_setzero_pd();
962 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
963 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
964 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
965 velec = _mm_mul_pd(qq02,_mm_sub_pd(rinv02,velec));
966 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
968 /* Update potential sum for this i atom from the interaction with this j atom. */
969 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
970 velecsum = _mm_add_pd(velecsum,velec);
972 fscal = felec;
974 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
976 /* Update vectorial force */
977 fix0 = _mm_macc_pd(dx02,fscal,fix0);
978 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
979 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
981 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
982 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
983 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
985 /**************************
986 * CALCULATE INTERACTIONS *
987 **************************/
989 r10 = _mm_mul_pd(rsq10,rinv10);
991 /* EWALD ELECTROSTATICS */
993 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
994 ewrt = _mm_mul_pd(r10,ewtabscale);
995 ewitab = _mm_cvttpd_epi32(ewrt);
996 #ifdef __XOP__
997 eweps = _mm_frcz_pd(ewrt);
998 #else
999 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1000 #endif
1001 twoeweps = _mm_add_pd(eweps,eweps);
1002 ewitab = _mm_slli_epi32(ewitab,2);
1003 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1004 ewtabD = _mm_setzero_pd();
1005 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1006 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1007 ewtabFn = _mm_setzero_pd();
1008 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1009 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1010 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1011 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
1012 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1014 /* Update potential sum for this i atom from the interaction with this j atom. */
1015 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1016 velecsum = _mm_add_pd(velecsum,velec);
1018 fscal = felec;
1020 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1022 /* Update vectorial force */
1023 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1024 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1025 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1027 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1028 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1029 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 r11 = _mm_mul_pd(rsq11,rinv11);
1037 /* EWALD ELECTROSTATICS */
1039 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1040 ewrt = _mm_mul_pd(r11,ewtabscale);
1041 ewitab = _mm_cvttpd_epi32(ewrt);
1042 #ifdef __XOP__
1043 eweps = _mm_frcz_pd(ewrt);
1044 #else
1045 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1046 #endif
1047 twoeweps = _mm_add_pd(eweps,eweps);
1048 ewitab = _mm_slli_epi32(ewitab,2);
1049 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1050 ewtabD = _mm_setzero_pd();
1051 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1052 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1053 ewtabFn = _mm_setzero_pd();
1054 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1055 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1056 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1057 velec = _mm_mul_pd(qq11,_mm_sub_pd(rinv11,velec));
1058 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1060 /* Update potential sum for this i atom from the interaction with this j atom. */
1061 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1062 velecsum = _mm_add_pd(velecsum,velec);
1064 fscal = felec;
1066 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1068 /* Update vectorial force */
1069 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1070 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1071 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1073 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1074 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1075 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1077 /**************************
1078 * CALCULATE INTERACTIONS *
1079 **************************/
1081 r12 = _mm_mul_pd(rsq12,rinv12);
1083 /* EWALD ELECTROSTATICS */
1085 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1086 ewrt = _mm_mul_pd(r12,ewtabscale);
1087 ewitab = _mm_cvttpd_epi32(ewrt);
1088 #ifdef __XOP__
1089 eweps = _mm_frcz_pd(ewrt);
1090 #else
1091 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1092 #endif
1093 twoeweps = _mm_add_pd(eweps,eweps);
1094 ewitab = _mm_slli_epi32(ewitab,2);
1095 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1096 ewtabD = _mm_setzero_pd();
1097 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1098 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1099 ewtabFn = _mm_setzero_pd();
1100 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1101 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1102 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1103 velec = _mm_mul_pd(qq12,_mm_sub_pd(rinv12,velec));
1104 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1108 velecsum = _mm_add_pd(velecsum,velec);
1110 fscal = felec;
1112 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1114 /* Update vectorial force */
1115 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1116 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1117 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1119 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1120 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1121 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1123 /**************************
1124 * CALCULATE INTERACTIONS *
1125 **************************/
1127 r20 = _mm_mul_pd(rsq20,rinv20);
1129 /* EWALD ELECTROSTATICS */
1131 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1132 ewrt = _mm_mul_pd(r20,ewtabscale);
1133 ewitab = _mm_cvttpd_epi32(ewrt);
1134 #ifdef __XOP__
1135 eweps = _mm_frcz_pd(ewrt);
1136 #else
1137 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1138 #endif
1139 twoeweps = _mm_add_pd(eweps,eweps);
1140 ewitab = _mm_slli_epi32(ewitab,2);
1141 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1142 ewtabD = _mm_setzero_pd();
1143 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1144 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1145 ewtabFn = _mm_setzero_pd();
1146 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1147 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1148 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1149 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
1150 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1152 /* Update potential sum for this i atom from the interaction with this j atom. */
1153 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1154 velecsum = _mm_add_pd(velecsum,velec);
1156 fscal = felec;
1158 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1160 /* Update vectorial force */
1161 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1162 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1163 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1165 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1166 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1167 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 r21 = _mm_mul_pd(rsq21,rinv21);
1175 /* EWALD ELECTROSTATICS */
1177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1178 ewrt = _mm_mul_pd(r21,ewtabscale);
1179 ewitab = _mm_cvttpd_epi32(ewrt);
1180 #ifdef __XOP__
1181 eweps = _mm_frcz_pd(ewrt);
1182 #else
1183 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1184 #endif
1185 twoeweps = _mm_add_pd(eweps,eweps);
1186 ewitab = _mm_slli_epi32(ewitab,2);
1187 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1188 ewtabD = _mm_setzero_pd();
1189 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1190 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1191 ewtabFn = _mm_setzero_pd();
1192 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1193 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1194 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1195 velec = _mm_mul_pd(qq21,_mm_sub_pd(rinv21,velec));
1196 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1198 /* Update potential sum for this i atom from the interaction with this j atom. */
1199 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1200 velecsum = _mm_add_pd(velecsum,velec);
1202 fscal = felec;
1204 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1206 /* Update vectorial force */
1207 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1208 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1209 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1211 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1212 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1213 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 r22 = _mm_mul_pd(rsq22,rinv22);
1221 /* EWALD ELECTROSTATICS */
1223 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1224 ewrt = _mm_mul_pd(r22,ewtabscale);
1225 ewitab = _mm_cvttpd_epi32(ewrt);
1226 #ifdef __XOP__
1227 eweps = _mm_frcz_pd(ewrt);
1228 #else
1229 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1230 #endif
1231 twoeweps = _mm_add_pd(eweps,eweps);
1232 ewitab = _mm_slli_epi32(ewitab,2);
1233 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1234 ewtabD = _mm_setzero_pd();
1235 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1236 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1237 ewtabFn = _mm_setzero_pd();
1238 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1239 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1240 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1241 velec = _mm_mul_pd(qq22,_mm_sub_pd(rinv22,velec));
1242 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1244 /* Update potential sum for this i atom from the interaction with this j atom. */
1245 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1246 velecsum = _mm_add_pd(velecsum,velec);
1248 fscal = felec;
1250 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1252 /* Update vectorial force */
1253 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1254 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1255 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1257 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1258 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1259 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1261 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1263 /* Inner loop uses 430 flops */
1266 /* End of innermost loop */
1268 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1269 f+i_coord_offset,fshift+i_shift_offset);
1271 ggid = gid[iidx];
1272 /* Update potential energies */
1273 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1274 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1276 /* Increment number of inner iterations */
1277 inneriter += j_index_end - j_index_start;
1279 /* Outer loop uses 20 flops */
1282 /* Increment number of outer iterations */
1283 outeriter += nri;
1285 /* Update outer/inner flops */
1287 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*430);
1290 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1291 * Electrostatics interaction: Ewald
1292 * VdW interaction: CubicSplineTable
1293 * Geometry: Water3-Water3
1294 * Calculate force/pot: Force
1296 void
1297 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1298 (t_nblist * gmx_restrict nlist,
1299 rvec * gmx_restrict xx,
1300 rvec * gmx_restrict ff,
1301 t_forcerec * gmx_restrict fr,
1302 t_mdatoms * gmx_restrict mdatoms,
1303 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1304 t_nrnb * gmx_restrict nrnb)
1306 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1307 * just 0 for non-waters.
1308 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1309 * jnr indices corresponding to data put in the four positions in the SIMD register.
1311 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1312 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1313 int jnrA,jnrB;
1314 int j_coord_offsetA,j_coord_offsetB;
1315 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1316 real rcutoff_scalar;
1317 real *shiftvec,*fshift,*x,*f;
1318 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1319 int vdwioffset0;
1320 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1321 int vdwioffset1;
1322 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1323 int vdwioffset2;
1324 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1325 int vdwjidx0A,vdwjidx0B;
1326 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1327 int vdwjidx1A,vdwjidx1B;
1328 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1329 int vdwjidx2A,vdwjidx2B;
1330 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1331 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1332 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1333 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1334 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1335 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1336 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1337 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1338 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1339 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1340 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1341 real *charge;
1342 int nvdwtype;
1343 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1344 int *vdwtype;
1345 real *vdwparam;
1346 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1347 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1348 __m128i vfitab;
1349 __m128i ifour = _mm_set1_epi32(4);
1350 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1351 real *vftab;
1352 __m128i ewitab;
1353 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1354 real *ewtab;
1355 __m128d dummy_mask,cutoff_mask;
1356 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1357 __m128d one = _mm_set1_pd(1.0);
1358 __m128d two = _mm_set1_pd(2.0);
1359 x = xx[0];
1360 f = ff[0];
1362 nri = nlist->nri;
1363 iinr = nlist->iinr;
1364 jindex = nlist->jindex;
1365 jjnr = nlist->jjnr;
1366 shiftidx = nlist->shift;
1367 gid = nlist->gid;
1368 shiftvec = fr->shift_vec[0];
1369 fshift = fr->fshift[0];
1370 facel = _mm_set1_pd(fr->epsfac);
1371 charge = mdatoms->chargeA;
1372 nvdwtype = fr->ntype;
1373 vdwparam = fr->nbfp;
1374 vdwtype = mdatoms->typeA;
1376 vftab = kernel_data->table_vdw->data;
1377 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
1379 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1380 ewtab = fr->ic->tabq_coul_F;
1381 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1382 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1384 /* Setup water-specific parameters */
1385 inr = nlist->iinr[0];
1386 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1387 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1388 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1389 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1391 jq0 = _mm_set1_pd(charge[inr+0]);
1392 jq1 = _mm_set1_pd(charge[inr+1]);
1393 jq2 = _mm_set1_pd(charge[inr+2]);
1394 vdwjidx0A = 2*vdwtype[inr+0];
1395 qq00 = _mm_mul_pd(iq0,jq0);
1396 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1397 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1398 qq01 = _mm_mul_pd(iq0,jq1);
1399 qq02 = _mm_mul_pd(iq0,jq2);
1400 qq10 = _mm_mul_pd(iq1,jq0);
1401 qq11 = _mm_mul_pd(iq1,jq1);
1402 qq12 = _mm_mul_pd(iq1,jq2);
1403 qq20 = _mm_mul_pd(iq2,jq0);
1404 qq21 = _mm_mul_pd(iq2,jq1);
1405 qq22 = _mm_mul_pd(iq2,jq2);
1407 /* Avoid stupid compiler warnings */
1408 jnrA = jnrB = 0;
1409 j_coord_offsetA = 0;
1410 j_coord_offsetB = 0;
1412 outeriter = 0;
1413 inneriter = 0;
1415 /* Start outer loop over neighborlists */
1416 for(iidx=0; iidx<nri; iidx++)
1418 /* Load shift vector for this list */
1419 i_shift_offset = DIM*shiftidx[iidx];
1421 /* Load limits for loop over neighbors */
1422 j_index_start = jindex[iidx];
1423 j_index_end = jindex[iidx+1];
1425 /* Get outer coordinate index */
1426 inr = iinr[iidx];
1427 i_coord_offset = DIM*inr;
1429 /* Load i particle coords and add shift vector */
1430 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1431 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1433 fix0 = _mm_setzero_pd();
1434 fiy0 = _mm_setzero_pd();
1435 fiz0 = _mm_setzero_pd();
1436 fix1 = _mm_setzero_pd();
1437 fiy1 = _mm_setzero_pd();
1438 fiz1 = _mm_setzero_pd();
1439 fix2 = _mm_setzero_pd();
1440 fiy2 = _mm_setzero_pd();
1441 fiz2 = _mm_setzero_pd();
1443 /* Start inner kernel loop */
1444 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1447 /* Get j neighbor index, and coordinate index */
1448 jnrA = jjnr[jidx];
1449 jnrB = jjnr[jidx+1];
1450 j_coord_offsetA = DIM*jnrA;
1451 j_coord_offsetB = DIM*jnrB;
1453 /* load j atom coordinates */
1454 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1455 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1457 /* Calculate displacement vector */
1458 dx00 = _mm_sub_pd(ix0,jx0);
1459 dy00 = _mm_sub_pd(iy0,jy0);
1460 dz00 = _mm_sub_pd(iz0,jz0);
1461 dx01 = _mm_sub_pd(ix0,jx1);
1462 dy01 = _mm_sub_pd(iy0,jy1);
1463 dz01 = _mm_sub_pd(iz0,jz1);
1464 dx02 = _mm_sub_pd(ix0,jx2);
1465 dy02 = _mm_sub_pd(iy0,jy2);
1466 dz02 = _mm_sub_pd(iz0,jz2);
1467 dx10 = _mm_sub_pd(ix1,jx0);
1468 dy10 = _mm_sub_pd(iy1,jy0);
1469 dz10 = _mm_sub_pd(iz1,jz0);
1470 dx11 = _mm_sub_pd(ix1,jx1);
1471 dy11 = _mm_sub_pd(iy1,jy1);
1472 dz11 = _mm_sub_pd(iz1,jz1);
1473 dx12 = _mm_sub_pd(ix1,jx2);
1474 dy12 = _mm_sub_pd(iy1,jy2);
1475 dz12 = _mm_sub_pd(iz1,jz2);
1476 dx20 = _mm_sub_pd(ix2,jx0);
1477 dy20 = _mm_sub_pd(iy2,jy0);
1478 dz20 = _mm_sub_pd(iz2,jz0);
1479 dx21 = _mm_sub_pd(ix2,jx1);
1480 dy21 = _mm_sub_pd(iy2,jy1);
1481 dz21 = _mm_sub_pd(iz2,jz1);
1482 dx22 = _mm_sub_pd(ix2,jx2);
1483 dy22 = _mm_sub_pd(iy2,jy2);
1484 dz22 = _mm_sub_pd(iz2,jz2);
1486 /* Calculate squared distance and things based on it */
1487 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1488 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1489 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1490 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1491 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1492 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1493 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1494 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1495 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1497 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1498 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1499 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1500 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1501 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1502 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1503 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1504 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1505 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1507 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1508 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1509 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1510 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1511 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1512 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1513 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1514 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1515 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1517 fjx0 = _mm_setzero_pd();
1518 fjy0 = _mm_setzero_pd();
1519 fjz0 = _mm_setzero_pd();
1520 fjx1 = _mm_setzero_pd();
1521 fjy1 = _mm_setzero_pd();
1522 fjz1 = _mm_setzero_pd();
1523 fjx2 = _mm_setzero_pd();
1524 fjy2 = _mm_setzero_pd();
1525 fjz2 = _mm_setzero_pd();
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 r00 = _mm_mul_pd(rsq00,rinv00);
1533 /* Calculate table index by multiplying r with table scale and truncate to integer */
1534 rt = _mm_mul_pd(r00,vftabscale);
1535 vfitab = _mm_cvttpd_epi32(rt);
1536 #ifdef __XOP__
1537 vfeps = _mm_frcz_pd(rt);
1538 #else
1539 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1540 #endif
1541 twovfeps = _mm_add_pd(vfeps,vfeps);
1542 vfitab = _mm_slli_epi32(vfitab,3);
1544 /* EWALD ELECTROSTATICS */
1546 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1547 ewrt = _mm_mul_pd(r00,ewtabscale);
1548 ewitab = _mm_cvttpd_epi32(ewrt);
1549 #ifdef __XOP__
1550 eweps = _mm_frcz_pd(ewrt);
1551 #else
1552 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1553 #endif
1554 twoeweps = _mm_add_pd(eweps,eweps);
1555 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1556 &ewtabF,&ewtabFn);
1557 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1558 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1560 /* CUBIC SPLINE TABLE DISPERSION */
1561 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1562 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1563 GMX_MM_TRANSPOSE2_PD(Y,F);
1564 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1565 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1566 GMX_MM_TRANSPOSE2_PD(G,H);
1567 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1568 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1569 fvdw6 = _mm_mul_pd(c6_00,FF);
1571 /* CUBIC SPLINE TABLE REPULSION */
1572 vfitab = _mm_add_epi32(vfitab,ifour);
1573 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1574 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1575 GMX_MM_TRANSPOSE2_PD(Y,F);
1576 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1577 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1578 GMX_MM_TRANSPOSE2_PD(G,H);
1579 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1580 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1581 fvdw12 = _mm_mul_pd(c12_00,FF);
1582 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1584 fscal = _mm_add_pd(felec,fvdw);
1586 /* Update vectorial force */
1587 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1588 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1589 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1591 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1592 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1593 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1595 /**************************
1596 * CALCULATE INTERACTIONS *
1597 **************************/
1599 r01 = _mm_mul_pd(rsq01,rinv01);
1601 /* EWALD ELECTROSTATICS */
1603 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1604 ewrt = _mm_mul_pd(r01,ewtabscale);
1605 ewitab = _mm_cvttpd_epi32(ewrt);
1606 #ifdef __XOP__
1607 eweps = _mm_frcz_pd(ewrt);
1608 #else
1609 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1610 #endif
1611 twoeweps = _mm_add_pd(eweps,eweps);
1612 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1613 &ewtabF,&ewtabFn);
1614 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1615 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1617 fscal = felec;
1619 /* Update vectorial force */
1620 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1621 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1622 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1624 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1625 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1626 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1628 /**************************
1629 * CALCULATE INTERACTIONS *
1630 **************************/
1632 r02 = _mm_mul_pd(rsq02,rinv02);
1634 /* EWALD ELECTROSTATICS */
1636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1637 ewrt = _mm_mul_pd(r02,ewtabscale);
1638 ewitab = _mm_cvttpd_epi32(ewrt);
1639 #ifdef __XOP__
1640 eweps = _mm_frcz_pd(ewrt);
1641 #else
1642 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1643 #endif
1644 twoeweps = _mm_add_pd(eweps,eweps);
1645 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1646 &ewtabF,&ewtabFn);
1647 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1648 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1650 fscal = felec;
1652 /* Update vectorial force */
1653 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1654 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1655 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1657 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1658 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1659 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 r10 = _mm_mul_pd(rsq10,rinv10);
1667 /* EWALD ELECTROSTATICS */
1669 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1670 ewrt = _mm_mul_pd(r10,ewtabscale);
1671 ewitab = _mm_cvttpd_epi32(ewrt);
1672 #ifdef __XOP__
1673 eweps = _mm_frcz_pd(ewrt);
1674 #else
1675 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1676 #endif
1677 twoeweps = _mm_add_pd(eweps,eweps);
1678 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1679 &ewtabF,&ewtabFn);
1680 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1681 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1683 fscal = felec;
1685 /* Update vectorial force */
1686 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1687 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1688 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1690 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1691 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1692 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 r11 = _mm_mul_pd(rsq11,rinv11);
1700 /* EWALD ELECTROSTATICS */
1702 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1703 ewrt = _mm_mul_pd(r11,ewtabscale);
1704 ewitab = _mm_cvttpd_epi32(ewrt);
1705 #ifdef __XOP__
1706 eweps = _mm_frcz_pd(ewrt);
1707 #else
1708 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1709 #endif
1710 twoeweps = _mm_add_pd(eweps,eweps);
1711 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1712 &ewtabF,&ewtabFn);
1713 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1714 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1716 fscal = felec;
1718 /* Update vectorial force */
1719 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1720 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1721 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1723 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1724 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1725 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 r12 = _mm_mul_pd(rsq12,rinv12);
1733 /* EWALD ELECTROSTATICS */
1735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736 ewrt = _mm_mul_pd(r12,ewtabscale);
1737 ewitab = _mm_cvttpd_epi32(ewrt);
1738 #ifdef __XOP__
1739 eweps = _mm_frcz_pd(ewrt);
1740 #else
1741 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1742 #endif
1743 twoeweps = _mm_add_pd(eweps,eweps);
1744 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1745 &ewtabF,&ewtabFn);
1746 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1747 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1749 fscal = felec;
1751 /* Update vectorial force */
1752 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1753 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1754 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1756 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1757 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1758 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1760 /**************************
1761 * CALCULATE INTERACTIONS *
1762 **************************/
1764 r20 = _mm_mul_pd(rsq20,rinv20);
1766 /* EWALD ELECTROSTATICS */
1768 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1769 ewrt = _mm_mul_pd(r20,ewtabscale);
1770 ewitab = _mm_cvttpd_epi32(ewrt);
1771 #ifdef __XOP__
1772 eweps = _mm_frcz_pd(ewrt);
1773 #else
1774 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1775 #endif
1776 twoeweps = _mm_add_pd(eweps,eweps);
1777 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1778 &ewtabF,&ewtabFn);
1779 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1780 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1782 fscal = felec;
1784 /* Update vectorial force */
1785 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1786 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1787 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1789 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1790 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1791 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1793 /**************************
1794 * CALCULATE INTERACTIONS *
1795 **************************/
1797 r21 = _mm_mul_pd(rsq21,rinv21);
1799 /* EWALD ELECTROSTATICS */
1801 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1802 ewrt = _mm_mul_pd(r21,ewtabscale);
1803 ewitab = _mm_cvttpd_epi32(ewrt);
1804 #ifdef __XOP__
1805 eweps = _mm_frcz_pd(ewrt);
1806 #else
1807 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1808 #endif
1809 twoeweps = _mm_add_pd(eweps,eweps);
1810 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1811 &ewtabF,&ewtabFn);
1812 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1813 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1815 fscal = felec;
1817 /* Update vectorial force */
1818 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1819 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1820 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1822 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1823 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1824 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1826 /**************************
1827 * CALCULATE INTERACTIONS *
1828 **************************/
1830 r22 = _mm_mul_pd(rsq22,rinv22);
1832 /* EWALD ELECTROSTATICS */
1834 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1835 ewrt = _mm_mul_pd(r22,ewtabscale);
1836 ewitab = _mm_cvttpd_epi32(ewrt);
1837 #ifdef __XOP__
1838 eweps = _mm_frcz_pd(ewrt);
1839 #else
1840 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1841 #endif
1842 twoeweps = _mm_add_pd(eweps,eweps);
1843 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1844 &ewtabF,&ewtabFn);
1845 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1846 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1848 fscal = felec;
1850 /* Update vectorial force */
1851 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1852 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1853 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1855 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1856 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1857 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1859 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1861 /* Inner loop uses 377 flops */
1864 if(jidx<j_index_end)
1867 jnrA = jjnr[jidx];
1868 j_coord_offsetA = DIM*jnrA;
1870 /* load j atom coordinates */
1871 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1872 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1874 /* Calculate displacement vector */
1875 dx00 = _mm_sub_pd(ix0,jx0);
1876 dy00 = _mm_sub_pd(iy0,jy0);
1877 dz00 = _mm_sub_pd(iz0,jz0);
1878 dx01 = _mm_sub_pd(ix0,jx1);
1879 dy01 = _mm_sub_pd(iy0,jy1);
1880 dz01 = _mm_sub_pd(iz0,jz1);
1881 dx02 = _mm_sub_pd(ix0,jx2);
1882 dy02 = _mm_sub_pd(iy0,jy2);
1883 dz02 = _mm_sub_pd(iz0,jz2);
1884 dx10 = _mm_sub_pd(ix1,jx0);
1885 dy10 = _mm_sub_pd(iy1,jy0);
1886 dz10 = _mm_sub_pd(iz1,jz0);
1887 dx11 = _mm_sub_pd(ix1,jx1);
1888 dy11 = _mm_sub_pd(iy1,jy1);
1889 dz11 = _mm_sub_pd(iz1,jz1);
1890 dx12 = _mm_sub_pd(ix1,jx2);
1891 dy12 = _mm_sub_pd(iy1,jy2);
1892 dz12 = _mm_sub_pd(iz1,jz2);
1893 dx20 = _mm_sub_pd(ix2,jx0);
1894 dy20 = _mm_sub_pd(iy2,jy0);
1895 dz20 = _mm_sub_pd(iz2,jz0);
1896 dx21 = _mm_sub_pd(ix2,jx1);
1897 dy21 = _mm_sub_pd(iy2,jy1);
1898 dz21 = _mm_sub_pd(iz2,jz1);
1899 dx22 = _mm_sub_pd(ix2,jx2);
1900 dy22 = _mm_sub_pd(iy2,jy2);
1901 dz22 = _mm_sub_pd(iz2,jz2);
1903 /* Calculate squared distance and things based on it */
1904 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1905 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1906 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1907 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1908 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1909 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1910 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1911 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1912 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1914 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1915 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1916 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1917 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1918 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1919 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1920 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1921 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1922 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1924 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1925 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1926 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1927 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1928 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1929 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1930 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1931 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1932 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1934 fjx0 = _mm_setzero_pd();
1935 fjy0 = _mm_setzero_pd();
1936 fjz0 = _mm_setzero_pd();
1937 fjx1 = _mm_setzero_pd();
1938 fjy1 = _mm_setzero_pd();
1939 fjz1 = _mm_setzero_pd();
1940 fjx2 = _mm_setzero_pd();
1941 fjy2 = _mm_setzero_pd();
1942 fjz2 = _mm_setzero_pd();
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 r00 = _mm_mul_pd(rsq00,rinv00);
1950 /* Calculate table index by multiplying r with table scale and truncate to integer */
1951 rt = _mm_mul_pd(r00,vftabscale);
1952 vfitab = _mm_cvttpd_epi32(rt);
1953 #ifdef __XOP__
1954 vfeps = _mm_frcz_pd(rt);
1955 #else
1956 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1957 #endif
1958 twovfeps = _mm_add_pd(vfeps,vfeps);
1959 vfitab = _mm_slli_epi32(vfitab,3);
1961 /* EWALD ELECTROSTATICS */
1963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1964 ewrt = _mm_mul_pd(r00,ewtabscale);
1965 ewitab = _mm_cvttpd_epi32(ewrt);
1966 #ifdef __XOP__
1967 eweps = _mm_frcz_pd(ewrt);
1968 #else
1969 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1970 #endif
1971 twoeweps = _mm_add_pd(eweps,eweps);
1972 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1973 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1974 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1976 /* CUBIC SPLINE TABLE DISPERSION */
1977 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1978 F = _mm_setzero_pd();
1979 GMX_MM_TRANSPOSE2_PD(Y,F);
1980 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1981 H = _mm_setzero_pd();
1982 GMX_MM_TRANSPOSE2_PD(G,H);
1983 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1984 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1985 fvdw6 = _mm_mul_pd(c6_00,FF);
1987 /* CUBIC SPLINE TABLE REPULSION */
1988 vfitab = _mm_add_epi32(vfitab,ifour);
1989 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1990 F = _mm_setzero_pd();
1991 GMX_MM_TRANSPOSE2_PD(Y,F);
1992 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1993 H = _mm_setzero_pd();
1994 GMX_MM_TRANSPOSE2_PD(G,H);
1995 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1996 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1997 fvdw12 = _mm_mul_pd(c12_00,FF);
1998 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2000 fscal = _mm_add_pd(felec,fvdw);
2002 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2004 /* Update vectorial force */
2005 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2006 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2007 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2009 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2010 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2011 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2013 /**************************
2014 * CALCULATE INTERACTIONS *
2015 **************************/
2017 r01 = _mm_mul_pd(rsq01,rinv01);
2019 /* EWALD ELECTROSTATICS */
2021 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2022 ewrt = _mm_mul_pd(r01,ewtabscale);
2023 ewitab = _mm_cvttpd_epi32(ewrt);
2024 #ifdef __XOP__
2025 eweps = _mm_frcz_pd(ewrt);
2026 #else
2027 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2028 #endif
2029 twoeweps = _mm_add_pd(eweps,eweps);
2030 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2031 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2032 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2034 fscal = felec;
2036 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2038 /* Update vectorial force */
2039 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2040 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2041 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2043 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2044 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2045 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2047 /**************************
2048 * CALCULATE INTERACTIONS *
2049 **************************/
2051 r02 = _mm_mul_pd(rsq02,rinv02);
2053 /* EWALD ELECTROSTATICS */
2055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2056 ewrt = _mm_mul_pd(r02,ewtabscale);
2057 ewitab = _mm_cvttpd_epi32(ewrt);
2058 #ifdef __XOP__
2059 eweps = _mm_frcz_pd(ewrt);
2060 #else
2061 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2062 #endif
2063 twoeweps = _mm_add_pd(eweps,eweps);
2064 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2065 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2066 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2068 fscal = felec;
2070 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2072 /* Update vectorial force */
2073 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2074 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2075 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2077 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2078 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2079 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2081 /**************************
2082 * CALCULATE INTERACTIONS *
2083 **************************/
2085 r10 = _mm_mul_pd(rsq10,rinv10);
2087 /* EWALD ELECTROSTATICS */
2089 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2090 ewrt = _mm_mul_pd(r10,ewtabscale);
2091 ewitab = _mm_cvttpd_epi32(ewrt);
2092 #ifdef __XOP__
2093 eweps = _mm_frcz_pd(ewrt);
2094 #else
2095 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2096 #endif
2097 twoeweps = _mm_add_pd(eweps,eweps);
2098 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2099 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2100 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2102 fscal = felec;
2104 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2106 /* Update vectorial force */
2107 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2108 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2109 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2111 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2112 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2113 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2115 /**************************
2116 * CALCULATE INTERACTIONS *
2117 **************************/
2119 r11 = _mm_mul_pd(rsq11,rinv11);
2121 /* EWALD ELECTROSTATICS */
2123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2124 ewrt = _mm_mul_pd(r11,ewtabscale);
2125 ewitab = _mm_cvttpd_epi32(ewrt);
2126 #ifdef __XOP__
2127 eweps = _mm_frcz_pd(ewrt);
2128 #else
2129 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2130 #endif
2131 twoeweps = _mm_add_pd(eweps,eweps);
2132 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2133 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2134 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2136 fscal = felec;
2138 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2140 /* Update vectorial force */
2141 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2142 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2143 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2145 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2146 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2147 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2149 /**************************
2150 * CALCULATE INTERACTIONS *
2151 **************************/
2153 r12 = _mm_mul_pd(rsq12,rinv12);
2155 /* EWALD ELECTROSTATICS */
2157 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2158 ewrt = _mm_mul_pd(r12,ewtabscale);
2159 ewitab = _mm_cvttpd_epi32(ewrt);
2160 #ifdef __XOP__
2161 eweps = _mm_frcz_pd(ewrt);
2162 #else
2163 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2164 #endif
2165 twoeweps = _mm_add_pd(eweps,eweps);
2166 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2167 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2168 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2170 fscal = felec;
2172 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2174 /* Update vectorial force */
2175 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2176 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2177 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2179 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2180 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2181 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2183 /**************************
2184 * CALCULATE INTERACTIONS *
2185 **************************/
2187 r20 = _mm_mul_pd(rsq20,rinv20);
2189 /* EWALD ELECTROSTATICS */
2191 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2192 ewrt = _mm_mul_pd(r20,ewtabscale);
2193 ewitab = _mm_cvttpd_epi32(ewrt);
2194 #ifdef __XOP__
2195 eweps = _mm_frcz_pd(ewrt);
2196 #else
2197 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2198 #endif
2199 twoeweps = _mm_add_pd(eweps,eweps);
2200 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2201 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2202 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2204 fscal = felec;
2206 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2208 /* Update vectorial force */
2209 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2210 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2211 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2213 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2214 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2215 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2217 /**************************
2218 * CALCULATE INTERACTIONS *
2219 **************************/
2221 r21 = _mm_mul_pd(rsq21,rinv21);
2223 /* EWALD ELECTROSTATICS */
2225 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2226 ewrt = _mm_mul_pd(r21,ewtabscale);
2227 ewitab = _mm_cvttpd_epi32(ewrt);
2228 #ifdef __XOP__
2229 eweps = _mm_frcz_pd(ewrt);
2230 #else
2231 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2232 #endif
2233 twoeweps = _mm_add_pd(eweps,eweps);
2234 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2235 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2236 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2238 fscal = felec;
2240 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2242 /* Update vectorial force */
2243 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2244 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2245 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2247 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2248 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2249 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 r22 = _mm_mul_pd(rsq22,rinv22);
2257 /* EWALD ELECTROSTATICS */
2259 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2260 ewrt = _mm_mul_pd(r22,ewtabscale);
2261 ewitab = _mm_cvttpd_epi32(ewrt);
2262 #ifdef __XOP__
2263 eweps = _mm_frcz_pd(ewrt);
2264 #else
2265 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2266 #endif
2267 twoeweps = _mm_add_pd(eweps,eweps);
2268 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2269 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2270 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2272 fscal = felec;
2274 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2276 /* Update vectorial force */
2277 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2278 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2279 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2281 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2282 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2283 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2285 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2287 /* Inner loop uses 377 flops */
2290 /* End of innermost loop */
2292 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2293 f+i_coord_offset,fshift+i_shift_offset);
2295 /* Increment number of inner iterations */
2296 inneriter += j_index_end - j_index_start;
2298 /* Outer loop uses 18 flops */
2301 /* Increment number of outer iterations */
2302 outeriter += nri;
2304 /* Update outer/inner flops */
2306 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*377);