Removed simple.h from nb_kernel_sse2_XX
[gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_single.c
blobf9e4be7c98d91965f03f0caba97ee01866b48cb4
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS sse2_single kernel generator.
 */
38 #include "gmxpre.h"
40 #include "config.h"
42 #include <math.h>
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
 * Electrostatics interaction: Coulomb
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water4-Particle
 * Calculate force/pot:        PotentialAndForce
 */
58 void
59 nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_sse2_single
60 (t_nblist * gmx_restrict nlist,
61 rvec * gmx_restrict xx,
62 rvec * gmx_restrict ff,
63 t_forcerec * gmx_restrict fr,
64 t_mdatoms * gmx_restrict mdatoms,
65 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
66 t_nrnb * gmx_restrict nrnb)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset,i_coord_offset,outeriter,inneriter;
74 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
75 int jnrA,jnrB,jnrC,jnrD;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real rcutoff_scalar;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 real scratch[4*DIM];
83 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 int vdwioffset0;
85 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 int vdwioffset1;
87 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 int vdwioffset2;
89 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwioffset3;
91 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
93 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
96 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
97 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
98 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
99 real *charge;
100 int nvdwtype;
101 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 int *vdwtype;
103 real *vdwparam;
104 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
105 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
106 __m128i vfitab;
107 __m128i ifour = _mm_set1_epi32(4);
108 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
109 real *vftab;
110 __m128 dummy_mask,cutoff_mask;
111 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
112 __m128 one = _mm_set1_ps(1.0);
113 __m128 two = _mm_set1_ps(2.0);
114 x = xx[0];
115 f = ff[0];
117 nri = nlist->nri;
118 iinr = nlist->iinr;
119 jindex = nlist->jindex;
120 jjnr = nlist->jjnr;
121 shiftidx = nlist->shift;
122 gid = nlist->gid;
123 shiftvec = fr->shift_vec[0];
124 fshift = fr->fshift[0];
125 facel = _mm_set1_ps(fr->epsfac);
126 charge = mdatoms->chargeA;
127 nvdwtype = fr->ntype;
128 vdwparam = fr->nbfp;
129 vdwtype = mdatoms->typeA;
131 vftab = kernel_data->table_vdw->data;
132 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
134 /* Setup water-specific parameters */
135 inr = nlist->iinr[0];
136 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
139 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
141 /* Avoid stupid compiler warnings */
142 jnrA = jnrB = jnrC = jnrD = 0;
143 j_coord_offsetA = 0;
144 j_coord_offsetB = 0;
145 j_coord_offsetC = 0;
146 j_coord_offsetD = 0;
148 outeriter = 0;
149 inneriter = 0;
151 for(iidx=0;iidx<4*DIM;iidx++)
153 scratch[iidx] = 0.0;
156 /* Start outer loop over neighborlists */
157 for(iidx=0; iidx<nri; iidx++)
159 /* Load shift vector for this list */
160 i_shift_offset = DIM*shiftidx[iidx];
162 /* Load limits for loop over neighbors */
163 j_index_start = jindex[iidx];
164 j_index_end = jindex[iidx+1];
166 /* Get outer coordinate index */
167 inr = iinr[iidx];
168 i_coord_offset = DIM*inr;
170 /* Load i particle coords and add shift vector */
171 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
172 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
174 fix0 = _mm_setzero_ps();
175 fiy0 = _mm_setzero_ps();
176 fiz0 = _mm_setzero_ps();
177 fix1 = _mm_setzero_ps();
178 fiy1 = _mm_setzero_ps();
179 fiz1 = _mm_setzero_ps();
180 fix2 = _mm_setzero_ps();
181 fiy2 = _mm_setzero_ps();
182 fiz2 = _mm_setzero_ps();
183 fix3 = _mm_setzero_ps();
184 fiy3 = _mm_setzero_ps();
185 fiz3 = _mm_setzero_ps();
187 /* Reset potential sums */
188 velecsum = _mm_setzero_ps();
189 vvdwsum = _mm_setzero_ps();
191 /* Start inner kernel loop */
192 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
195 /* Get j neighbor index, and coordinate index */
196 jnrA = jjnr[jidx];
197 jnrB = jjnr[jidx+1];
198 jnrC = jjnr[jidx+2];
199 jnrD = jjnr[jidx+3];
200 j_coord_offsetA = DIM*jnrA;
201 j_coord_offsetB = DIM*jnrB;
202 j_coord_offsetC = DIM*jnrC;
203 j_coord_offsetD = DIM*jnrD;
205 /* load j atom coordinates */
206 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
207 x+j_coord_offsetC,x+j_coord_offsetD,
208 &jx0,&jy0,&jz0);
210 /* Calculate displacement vector */
211 dx00 = _mm_sub_ps(ix0,jx0);
212 dy00 = _mm_sub_ps(iy0,jy0);
213 dz00 = _mm_sub_ps(iz0,jz0);
214 dx10 = _mm_sub_ps(ix1,jx0);
215 dy10 = _mm_sub_ps(iy1,jy0);
216 dz10 = _mm_sub_ps(iz1,jz0);
217 dx20 = _mm_sub_ps(ix2,jx0);
218 dy20 = _mm_sub_ps(iy2,jy0);
219 dz20 = _mm_sub_ps(iz2,jz0);
220 dx30 = _mm_sub_ps(ix3,jx0);
221 dy30 = _mm_sub_ps(iy3,jy0);
222 dz30 = _mm_sub_ps(iz3,jz0);
224 /* Calculate squared distance and things based on it */
225 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
226 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
227 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
228 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
230 rinv00 = gmx_mm_invsqrt_ps(rsq00);
231 rinv10 = gmx_mm_invsqrt_ps(rsq10);
232 rinv20 = gmx_mm_invsqrt_ps(rsq20);
233 rinv30 = gmx_mm_invsqrt_ps(rsq30);
235 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
236 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
237 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
239 /* Load parameters for j particles */
240 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
241 charge+jnrC+0,charge+jnrD+0);
242 vdwjidx0A = 2*vdwtype[jnrA+0];
243 vdwjidx0B = 2*vdwtype[jnrB+0];
244 vdwjidx0C = 2*vdwtype[jnrC+0];
245 vdwjidx0D = 2*vdwtype[jnrD+0];
247 fjx0 = _mm_setzero_ps();
248 fjy0 = _mm_setzero_ps();
249 fjz0 = _mm_setzero_ps();
251 /**************************
252 * CALCULATE INTERACTIONS *
253 **************************/
255 r00 = _mm_mul_ps(rsq00,rinv00);
257 /* Compute parameters for interactions between i and j atoms */
258 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
259 vdwparam+vdwioffset0+vdwjidx0B,
260 vdwparam+vdwioffset0+vdwjidx0C,
261 vdwparam+vdwioffset0+vdwjidx0D,
262 &c6_00,&c12_00);
264 /* Calculate table index by multiplying r with table scale and truncate to integer */
265 rt = _mm_mul_ps(r00,vftabscale);
266 vfitab = _mm_cvttps_epi32(rt);
267 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
268 vfitab = _mm_slli_epi32(vfitab,3);
270 /* CUBIC SPLINE TABLE DISPERSION */
271 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
272 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
273 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
274 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
275 _MM_TRANSPOSE4_PS(Y,F,G,H);
276 Heps = _mm_mul_ps(vfeps,H);
277 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
278 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
279 vvdw6 = _mm_mul_ps(c6_00,VV);
280 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
281 fvdw6 = _mm_mul_ps(c6_00,FF);
283 /* CUBIC SPLINE TABLE REPULSION */
284 vfitab = _mm_add_epi32(vfitab,ifour);
285 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
286 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
287 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
288 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
289 _MM_TRANSPOSE4_PS(Y,F,G,H);
290 Heps = _mm_mul_ps(vfeps,H);
291 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
292 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
293 vvdw12 = _mm_mul_ps(c12_00,VV);
294 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
295 fvdw12 = _mm_mul_ps(c12_00,FF);
296 vvdw = _mm_add_ps(vvdw12,vvdw6);
297 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
299 /* Update potential sum for this i atom from the interaction with this j atom. */
300 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
302 fscal = fvdw;
304 /* Calculate temporary vectorial force */
305 tx = _mm_mul_ps(fscal,dx00);
306 ty = _mm_mul_ps(fscal,dy00);
307 tz = _mm_mul_ps(fscal,dz00);
309 /* Update vectorial force */
310 fix0 = _mm_add_ps(fix0,tx);
311 fiy0 = _mm_add_ps(fiy0,ty);
312 fiz0 = _mm_add_ps(fiz0,tz);
314 fjx0 = _mm_add_ps(fjx0,tx);
315 fjy0 = _mm_add_ps(fjy0,ty);
316 fjz0 = _mm_add_ps(fjz0,tz);
318 /**************************
319 * CALCULATE INTERACTIONS *
320 **************************/
322 /* Compute parameters for interactions between i and j atoms */
323 qq10 = _mm_mul_ps(iq1,jq0);
325 /* COULOMB ELECTROSTATICS */
326 velec = _mm_mul_ps(qq10,rinv10);
327 felec = _mm_mul_ps(velec,rinvsq10);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 velecsum = _mm_add_ps(velecsum,velec);
332 fscal = felec;
334 /* Calculate temporary vectorial force */
335 tx = _mm_mul_ps(fscal,dx10);
336 ty = _mm_mul_ps(fscal,dy10);
337 tz = _mm_mul_ps(fscal,dz10);
339 /* Update vectorial force */
340 fix1 = _mm_add_ps(fix1,tx);
341 fiy1 = _mm_add_ps(fiy1,ty);
342 fiz1 = _mm_add_ps(fiz1,tz);
344 fjx0 = _mm_add_ps(fjx0,tx);
345 fjy0 = _mm_add_ps(fjy0,ty);
346 fjz0 = _mm_add_ps(fjz0,tz);
348 /**************************
349 * CALCULATE INTERACTIONS *
350 **************************/
352 /* Compute parameters for interactions between i and j atoms */
353 qq20 = _mm_mul_ps(iq2,jq0);
355 /* COULOMB ELECTROSTATICS */
356 velec = _mm_mul_ps(qq20,rinv20);
357 felec = _mm_mul_ps(velec,rinvsq20);
359 /* Update potential sum for this i atom from the interaction with this j atom. */
360 velecsum = _mm_add_ps(velecsum,velec);
362 fscal = felec;
364 /* Calculate temporary vectorial force */
365 tx = _mm_mul_ps(fscal,dx20);
366 ty = _mm_mul_ps(fscal,dy20);
367 tz = _mm_mul_ps(fscal,dz20);
369 /* Update vectorial force */
370 fix2 = _mm_add_ps(fix2,tx);
371 fiy2 = _mm_add_ps(fiy2,ty);
372 fiz2 = _mm_add_ps(fiz2,tz);
374 fjx0 = _mm_add_ps(fjx0,tx);
375 fjy0 = _mm_add_ps(fjy0,ty);
376 fjz0 = _mm_add_ps(fjz0,tz);
378 /**************************
379 * CALCULATE INTERACTIONS *
380 **************************/
382 /* Compute parameters for interactions between i and j atoms */
383 qq30 = _mm_mul_ps(iq3,jq0);
385 /* COULOMB ELECTROSTATICS */
386 velec = _mm_mul_ps(qq30,rinv30);
387 felec = _mm_mul_ps(velec,rinvsq30);
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum = _mm_add_ps(velecsum,velec);
392 fscal = felec;
394 /* Calculate temporary vectorial force */
395 tx = _mm_mul_ps(fscal,dx30);
396 ty = _mm_mul_ps(fscal,dy30);
397 tz = _mm_mul_ps(fscal,dz30);
399 /* Update vectorial force */
400 fix3 = _mm_add_ps(fix3,tx);
401 fiy3 = _mm_add_ps(fiy3,ty);
402 fiz3 = _mm_add_ps(fiz3,tz);
404 fjx0 = _mm_add_ps(fjx0,tx);
405 fjy0 = _mm_add_ps(fjy0,ty);
406 fjz0 = _mm_add_ps(fjz0,tz);
408 fjptrA = f+j_coord_offsetA;
409 fjptrB = f+j_coord_offsetB;
410 fjptrC = f+j_coord_offsetC;
411 fjptrD = f+j_coord_offsetD;
413 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
415 /* Inner loop uses 140 flops */
418 if(jidx<j_index_end)
421 /* Get j neighbor index, and coordinate index */
422 jnrlistA = jjnr[jidx];
423 jnrlistB = jjnr[jidx+1];
424 jnrlistC = jjnr[jidx+2];
425 jnrlistD = jjnr[jidx+3];
426 /* Sign of each element will be negative for non-real atoms.
427 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
428 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
430 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
431 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
432 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
433 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
434 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
435 j_coord_offsetA = DIM*jnrA;
436 j_coord_offsetB = DIM*jnrB;
437 j_coord_offsetC = DIM*jnrC;
438 j_coord_offsetD = DIM*jnrD;
440 /* load j atom coordinates */
441 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
442 x+j_coord_offsetC,x+j_coord_offsetD,
443 &jx0,&jy0,&jz0);
445 /* Calculate displacement vector */
446 dx00 = _mm_sub_ps(ix0,jx0);
447 dy00 = _mm_sub_ps(iy0,jy0);
448 dz00 = _mm_sub_ps(iz0,jz0);
449 dx10 = _mm_sub_ps(ix1,jx0);
450 dy10 = _mm_sub_ps(iy1,jy0);
451 dz10 = _mm_sub_ps(iz1,jz0);
452 dx20 = _mm_sub_ps(ix2,jx0);
453 dy20 = _mm_sub_ps(iy2,jy0);
454 dz20 = _mm_sub_ps(iz2,jz0);
455 dx30 = _mm_sub_ps(ix3,jx0);
456 dy30 = _mm_sub_ps(iy3,jy0);
457 dz30 = _mm_sub_ps(iz3,jz0);
459 /* Calculate squared distance and things based on it */
460 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
461 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
462 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
463 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
465 rinv00 = gmx_mm_invsqrt_ps(rsq00);
466 rinv10 = gmx_mm_invsqrt_ps(rsq10);
467 rinv20 = gmx_mm_invsqrt_ps(rsq20);
468 rinv30 = gmx_mm_invsqrt_ps(rsq30);
470 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
471 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
472 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
474 /* Load parameters for j particles */
475 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
476 charge+jnrC+0,charge+jnrD+0);
477 vdwjidx0A = 2*vdwtype[jnrA+0];
478 vdwjidx0B = 2*vdwtype[jnrB+0];
479 vdwjidx0C = 2*vdwtype[jnrC+0];
480 vdwjidx0D = 2*vdwtype[jnrD+0];
482 fjx0 = _mm_setzero_ps();
483 fjy0 = _mm_setzero_ps();
484 fjz0 = _mm_setzero_ps();
486 /**************************
487 * CALCULATE INTERACTIONS *
488 **************************/
490 r00 = _mm_mul_ps(rsq00,rinv00);
491 r00 = _mm_andnot_ps(dummy_mask,r00);
493 /* Compute parameters for interactions between i and j atoms */
494 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
495 vdwparam+vdwioffset0+vdwjidx0B,
496 vdwparam+vdwioffset0+vdwjidx0C,
497 vdwparam+vdwioffset0+vdwjidx0D,
498 &c6_00,&c12_00);
500 /* Calculate table index by multiplying r with table scale and truncate to integer */
501 rt = _mm_mul_ps(r00,vftabscale);
502 vfitab = _mm_cvttps_epi32(rt);
503 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
504 vfitab = _mm_slli_epi32(vfitab,3);
506 /* CUBIC SPLINE TABLE DISPERSION */
507 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
508 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
509 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
510 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
511 _MM_TRANSPOSE4_PS(Y,F,G,H);
512 Heps = _mm_mul_ps(vfeps,H);
513 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
514 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
515 vvdw6 = _mm_mul_ps(c6_00,VV);
516 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
517 fvdw6 = _mm_mul_ps(c6_00,FF);
519 /* CUBIC SPLINE TABLE REPULSION */
520 vfitab = _mm_add_epi32(vfitab,ifour);
521 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
522 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
523 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
524 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
525 _MM_TRANSPOSE4_PS(Y,F,G,H);
526 Heps = _mm_mul_ps(vfeps,H);
527 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
528 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
529 vvdw12 = _mm_mul_ps(c12_00,VV);
530 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
531 fvdw12 = _mm_mul_ps(c12_00,FF);
532 vvdw = _mm_add_ps(vvdw12,vvdw6);
533 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
535 /* Update potential sum for this i atom from the interaction with this j atom. */
536 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
537 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
539 fscal = fvdw;
541 fscal = _mm_andnot_ps(dummy_mask,fscal);
543 /* Calculate temporary vectorial force */
544 tx = _mm_mul_ps(fscal,dx00);
545 ty = _mm_mul_ps(fscal,dy00);
546 tz = _mm_mul_ps(fscal,dz00);
548 /* Update vectorial force */
549 fix0 = _mm_add_ps(fix0,tx);
550 fiy0 = _mm_add_ps(fiy0,ty);
551 fiz0 = _mm_add_ps(fiz0,tz);
553 fjx0 = _mm_add_ps(fjx0,tx);
554 fjy0 = _mm_add_ps(fjy0,ty);
555 fjz0 = _mm_add_ps(fjz0,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 /* Compute parameters for interactions between i and j atoms */
562 qq10 = _mm_mul_ps(iq1,jq0);
564 /* COULOMB ELECTROSTATICS */
565 velec = _mm_mul_ps(qq10,rinv10);
566 felec = _mm_mul_ps(velec,rinvsq10);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_andnot_ps(dummy_mask,velec);
570 velecsum = _mm_add_ps(velecsum,velec);
572 fscal = felec;
574 fscal = _mm_andnot_ps(dummy_mask,fscal);
576 /* Calculate temporary vectorial force */
577 tx = _mm_mul_ps(fscal,dx10);
578 ty = _mm_mul_ps(fscal,dy10);
579 tz = _mm_mul_ps(fscal,dz10);
581 /* Update vectorial force */
582 fix1 = _mm_add_ps(fix1,tx);
583 fiy1 = _mm_add_ps(fiy1,ty);
584 fiz1 = _mm_add_ps(fiz1,tz);
586 fjx0 = _mm_add_ps(fjx0,tx);
587 fjy0 = _mm_add_ps(fjy0,ty);
588 fjz0 = _mm_add_ps(fjz0,tz);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 /* Compute parameters for interactions between i and j atoms */
595 qq20 = _mm_mul_ps(iq2,jq0);
597 /* COULOMB ELECTROSTATICS */
598 velec = _mm_mul_ps(qq20,rinv20);
599 felec = _mm_mul_ps(velec,rinvsq20);
601 /* Update potential sum for this i atom from the interaction with this j atom. */
602 velec = _mm_andnot_ps(dummy_mask,velec);
603 velecsum = _mm_add_ps(velecsum,velec);
605 fscal = felec;
607 fscal = _mm_andnot_ps(dummy_mask,fscal);
609 /* Calculate temporary vectorial force */
610 tx = _mm_mul_ps(fscal,dx20);
611 ty = _mm_mul_ps(fscal,dy20);
612 tz = _mm_mul_ps(fscal,dz20);
614 /* Update vectorial force */
615 fix2 = _mm_add_ps(fix2,tx);
616 fiy2 = _mm_add_ps(fiy2,ty);
617 fiz2 = _mm_add_ps(fiz2,tz);
619 fjx0 = _mm_add_ps(fjx0,tx);
620 fjy0 = _mm_add_ps(fjy0,ty);
621 fjz0 = _mm_add_ps(fjz0,tz);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 /* Compute parameters for interactions between i and j atoms */
628 qq30 = _mm_mul_ps(iq3,jq0);
630 /* COULOMB ELECTROSTATICS */
631 velec = _mm_mul_ps(qq30,rinv30);
632 felec = _mm_mul_ps(velec,rinvsq30);
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velec = _mm_andnot_ps(dummy_mask,velec);
636 velecsum = _mm_add_ps(velecsum,velec);
638 fscal = felec;
640 fscal = _mm_andnot_ps(dummy_mask,fscal);
642 /* Calculate temporary vectorial force */
643 tx = _mm_mul_ps(fscal,dx30);
644 ty = _mm_mul_ps(fscal,dy30);
645 tz = _mm_mul_ps(fscal,dz30);
647 /* Update vectorial force */
648 fix3 = _mm_add_ps(fix3,tx);
649 fiy3 = _mm_add_ps(fiy3,ty);
650 fiz3 = _mm_add_ps(fiz3,tz);
652 fjx0 = _mm_add_ps(fjx0,tx);
653 fjy0 = _mm_add_ps(fjy0,ty);
654 fjz0 = _mm_add_ps(fjz0,tz);
656 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
657 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
658 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
659 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
661 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
663 /* Inner loop uses 141 flops */
666 /* End of innermost loop */
668 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
669 f+i_coord_offset,fshift+i_shift_offset);
671 ggid = gid[iidx];
672 /* Update potential energies */
673 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
674 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
676 /* Increment number of inner iterations */
677 inneriter += j_index_end - j_index_start;
679 /* Outer loop uses 26 flops */
682 /* Increment number of outer iterations */
683 outeriter += nri;
685 /* Update outer/inner flops */
687 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*141);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
 * Electrostatics interaction: Coulomb
 * VdW interaction:            CubicSplineTable
 * Geometry:                   Water4-Particle
 * Calculate force/pot:        Force
 */
696 void
697 nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_sse2_single
698 (t_nblist * gmx_restrict nlist,
699 rvec * gmx_restrict xx,
700 rvec * gmx_restrict ff,
701 t_forcerec * gmx_restrict fr,
702 t_mdatoms * gmx_restrict mdatoms,
703 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
704 t_nrnb * gmx_restrict nrnb)
706 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
707 * just 0 for non-waters.
708 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
709 * jnr indices corresponding to data put in the four positions in the SIMD register.
711 int i_shift_offset,i_coord_offset,outeriter,inneriter;
712 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
713 int jnrA,jnrB,jnrC,jnrD;
714 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
715 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
716 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
717 real rcutoff_scalar;
718 real *shiftvec,*fshift,*x,*f;
719 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
720 real scratch[4*DIM];
721 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
722 int vdwioffset0;
723 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
724 int vdwioffset1;
725 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
726 int vdwioffset2;
727 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
728 int vdwioffset3;
729 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
730 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
731 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
732 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
733 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
734 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
735 __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
736 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
737 real *charge;
738 int nvdwtype;
739 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
740 int *vdwtype;
741 real *vdwparam;
742 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
743 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
744 __m128i vfitab;
745 __m128i ifour = _mm_set1_epi32(4);
746 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
747 real *vftab;
748 __m128 dummy_mask,cutoff_mask;
749 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
750 __m128 one = _mm_set1_ps(1.0);
751 __m128 two = _mm_set1_ps(2.0);
752 x = xx[0];
753 f = ff[0];
755 nri = nlist->nri;
756 iinr = nlist->iinr;
757 jindex = nlist->jindex;
758 jjnr = nlist->jjnr;
759 shiftidx = nlist->shift;
760 gid = nlist->gid;
761 shiftvec = fr->shift_vec[0];
762 fshift = fr->fshift[0];
763 facel = _mm_set1_ps(fr->epsfac);
764 charge = mdatoms->chargeA;
765 nvdwtype = fr->ntype;
766 vdwparam = fr->nbfp;
767 vdwtype = mdatoms->typeA;
769 vftab = kernel_data->table_vdw->data;
770 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
772 /* Setup water-specific parameters */
773 inr = nlist->iinr[0];
774 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
775 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
776 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
777 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
779 /* Avoid stupid compiler warnings */
780 jnrA = jnrB = jnrC = jnrD = 0;
781 j_coord_offsetA = 0;
782 j_coord_offsetB = 0;
783 j_coord_offsetC = 0;
784 j_coord_offsetD = 0;
786 outeriter = 0;
787 inneriter = 0;
789 for(iidx=0;iidx<4*DIM;iidx++)
791 scratch[iidx] = 0.0;
794 /* Start outer loop over neighborlists */
795 for(iidx=0; iidx<nri; iidx++)
797 /* Load shift vector for this list */
798 i_shift_offset = DIM*shiftidx[iidx];
800 /* Load limits for loop over neighbors */
801 j_index_start = jindex[iidx];
802 j_index_end = jindex[iidx+1];
804 /* Get outer coordinate index */
805 inr = iinr[iidx];
806 i_coord_offset = DIM*inr;
808 /* Load i particle coords and add shift vector */
809 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
810 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
812 fix0 = _mm_setzero_ps();
813 fiy0 = _mm_setzero_ps();
814 fiz0 = _mm_setzero_ps();
815 fix1 = _mm_setzero_ps();
816 fiy1 = _mm_setzero_ps();
817 fiz1 = _mm_setzero_ps();
818 fix2 = _mm_setzero_ps();
819 fiy2 = _mm_setzero_ps();
820 fiz2 = _mm_setzero_ps();
821 fix3 = _mm_setzero_ps();
822 fiy3 = _mm_setzero_ps();
823 fiz3 = _mm_setzero_ps();
825 /* Start inner kernel loop */
826 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
829 /* Get j neighbor index, and coordinate index */
830 jnrA = jjnr[jidx];
831 jnrB = jjnr[jidx+1];
832 jnrC = jjnr[jidx+2];
833 jnrD = jjnr[jidx+3];
834 j_coord_offsetA = DIM*jnrA;
835 j_coord_offsetB = DIM*jnrB;
836 j_coord_offsetC = DIM*jnrC;
837 j_coord_offsetD = DIM*jnrD;
839 /* load j atom coordinates */
840 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
841 x+j_coord_offsetC,x+j_coord_offsetD,
842 &jx0,&jy0,&jz0);
844 /* Calculate displacement vector */
845 dx00 = _mm_sub_ps(ix0,jx0);
846 dy00 = _mm_sub_ps(iy0,jy0);
847 dz00 = _mm_sub_ps(iz0,jz0);
848 dx10 = _mm_sub_ps(ix1,jx0);
849 dy10 = _mm_sub_ps(iy1,jy0);
850 dz10 = _mm_sub_ps(iz1,jz0);
851 dx20 = _mm_sub_ps(ix2,jx0);
852 dy20 = _mm_sub_ps(iy2,jy0);
853 dz20 = _mm_sub_ps(iz2,jz0);
854 dx30 = _mm_sub_ps(ix3,jx0);
855 dy30 = _mm_sub_ps(iy3,jy0);
856 dz30 = _mm_sub_ps(iz3,jz0);
858 /* Calculate squared distance and things based on it */
859 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
860 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
861 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
862 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
864 rinv00 = gmx_mm_invsqrt_ps(rsq00);
865 rinv10 = gmx_mm_invsqrt_ps(rsq10);
866 rinv20 = gmx_mm_invsqrt_ps(rsq20);
867 rinv30 = gmx_mm_invsqrt_ps(rsq30);
869 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
870 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
871 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
873 /* Load parameters for j particles */
874 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
875 charge+jnrC+0,charge+jnrD+0);
876 vdwjidx0A = 2*vdwtype[jnrA+0];
877 vdwjidx0B = 2*vdwtype[jnrB+0];
878 vdwjidx0C = 2*vdwtype[jnrC+0];
879 vdwjidx0D = 2*vdwtype[jnrD+0];
881 fjx0 = _mm_setzero_ps();
882 fjy0 = _mm_setzero_ps();
883 fjz0 = _mm_setzero_ps();
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
889 r00 = _mm_mul_ps(rsq00,rinv00);
891 /* Compute parameters for interactions between i and j atoms */
892 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
893 vdwparam+vdwioffset0+vdwjidx0B,
894 vdwparam+vdwioffset0+vdwjidx0C,
895 vdwparam+vdwioffset0+vdwjidx0D,
896 &c6_00,&c12_00);
898 /* Calculate table index by multiplying r with table scale and truncate to integer */
899 rt = _mm_mul_ps(r00,vftabscale);
900 vfitab = _mm_cvttps_epi32(rt);
901 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
902 vfitab = _mm_slli_epi32(vfitab,3);
904 /* CUBIC SPLINE TABLE DISPERSION */
905 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
906 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
907 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
908 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
909 _MM_TRANSPOSE4_PS(Y,F,G,H);
910 Heps = _mm_mul_ps(vfeps,H);
911 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
912 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
913 fvdw6 = _mm_mul_ps(c6_00,FF);
915 /* CUBIC SPLINE TABLE REPULSION */
916 vfitab = _mm_add_epi32(vfitab,ifour);
917 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
918 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
919 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
920 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
921 _MM_TRANSPOSE4_PS(Y,F,G,H);
922 Heps = _mm_mul_ps(vfeps,H);
923 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
924 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
925 fvdw12 = _mm_mul_ps(c12_00,FF);
926 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
928 fscal = fvdw;
930 /* Calculate temporary vectorial force */
931 tx = _mm_mul_ps(fscal,dx00);
932 ty = _mm_mul_ps(fscal,dy00);
933 tz = _mm_mul_ps(fscal,dz00);
935 /* Update vectorial force */
936 fix0 = _mm_add_ps(fix0,tx);
937 fiy0 = _mm_add_ps(fiy0,ty);
938 fiz0 = _mm_add_ps(fiz0,tz);
940 fjx0 = _mm_add_ps(fjx0,tx);
941 fjy0 = _mm_add_ps(fjy0,ty);
942 fjz0 = _mm_add_ps(fjz0,tz);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 /* Compute parameters for interactions between i and j atoms */
949 qq10 = _mm_mul_ps(iq1,jq0);
951 /* COULOMB ELECTROSTATICS */
952 velec = _mm_mul_ps(qq10,rinv10);
953 felec = _mm_mul_ps(velec,rinvsq10);
955 fscal = felec;
957 /* Calculate temporary vectorial force */
958 tx = _mm_mul_ps(fscal,dx10);
959 ty = _mm_mul_ps(fscal,dy10);
960 tz = _mm_mul_ps(fscal,dz10);
962 /* Update vectorial force */
963 fix1 = _mm_add_ps(fix1,tx);
964 fiy1 = _mm_add_ps(fiy1,ty);
965 fiz1 = _mm_add_ps(fiz1,tz);
967 fjx0 = _mm_add_ps(fjx0,tx);
968 fjy0 = _mm_add_ps(fjy0,ty);
969 fjz0 = _mm_add_ps(fjz0,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 /* Compute parameters for interactions between i and j atoms */
976 qq20 = _mm_mul_ps(iq2,jq0);
978 /* COULOMB ELECTROSTATICS */
979 velec = _mm_mul_ps(qq20,rinv20);
980 felec = _mm_mul_ps(velec,rinvsq20);
982 fscal = felec;
984 /* Calculate temporary vectorial force */
985 tx = _mm_mul_ps(fscal,dx20);
986 ty = _mm_mul_ps(fscal,dy20);
987 tz = _mm_mul_ps(fscal,dz20);
989 /* Update vectorial force */
990 fix2 = _mm_add_ps(fix2,tx);
991 fiy2 = _mm_add_ps(fiy2,ty);
992 fiz2 = _mm_add_ps(fiz2,tz);
994 fjx0 = _mm_add_ps(fjx0,tx);
995 fjy0 = _mm_add_ps(fjy0,ty);
996 fjz0 = _mm_add_ps(fjz0,tz);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 /* Compute parameters for interactions between i and j atoms */
1003 qq30 = _mm_mul_ps(iq3,jq0);
1005 /* COULOMB ELECTROSTATICS */
1006 velec = _mm_mul_ps(qq30,rinv30);
1007 felec = _mm_mul_ps(velec,rinvsq30);
1009 fscal = felec;
1011 /* Calculate temporary vectorial force */
1012 tx = _mm_mul_ps(fscal,dx30);
1013 ty = _mm_mul_ps(fscal,dy30);
1014 tz = _mm_mul_ps(fscal,dz30);
1016 /* Update vectorial force */
1017 fix3 = _mm_add_ps(fix3,tx);
1018 fiy3 = _mm_add_ps(fiy3,ty);
1019 fiz3 = _mm_add_ps(fiz3,tz);
1021 fjx0 = _mm_add_ps(fjx0,tx);
1022 fjy0 = _mm_add_ps(fjy0,ty);
1023 fjz0 = _mm_add_ps(fjz0,tz);
1025 fjptrA = f+j_coord_offsetA;
1026 fjptrB = f+j_coord_offsetB;
1027 fjptrC = f+j_coord_offsetC;
1028 fjptrD = f+j_coord_offsetD;
1030 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1032 /* Inner loop uses 129 flops */
1035 if(jidx<j_index_end)
1038 /* Get j neighbor index, and coordinate index */
1039 jnrlistA = jjnr[jidx];
1040 jnrlistB = jjnr[jidx+1];
1041 jnrlistC = jjnr[jidx+2];
1042 jnrlistD = jjnr[jidx+3];
1043 /* Sign of each element will be negative for non-real atoms.
1044 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1045 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries. */
1047 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1048 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1049 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1050 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1051 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1052 j_coord_offsetA = DIM*jnrA;
1053 j_coord_offsetB = DIM*jnrB;
1054 j_coord_offsetC = DIM*jnrC;
1055 j_coord_offsetD = DIM*jnrD;
1057 /* load j atom coordinates */
1058 gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1059 x+j_coord_offsetC,x+j_coord_offsetD,
1060 &jx0,&jy0,&jz0);
1062 /* Calculate displacement vector */
1063 dx00 = _mm_sub_ps(ix0,jx0);
1064 dy00 = _mm_sub_ps(iy0,jy0);
1065 dz00 = _mm_sub_ps(iz0,jz0);
1066 dx10 = _mm_sub_ps(ix1,jx0);
1067 dy10 = _mm_sub_ps(iy1,jy0);
1068 dz10 = _mm_sub_ps(iz1,jz0);
1069 dx20 = _mm_sub_ps(ix2,jx0);
1070 dy20 = _mm_sub_ps(iy2,jy0);
1071 dz20 = _mm_sub_ps(iz2,jz0);
1072 dx30 = _mm_sub_ps(ix3,jx0);
1073 dy30 = _mm_sub_ps(iy3,jy0);
1074 dz30 = _mm_sub_ps(iz3,jz0);
1076 /* Calculate squared distance and things based on it */
1077 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1078 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1079 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1080 rsq30 = gmx_mm_calc_rsq_ps(dx30,dy30,dz30);
1082 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1083 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1084 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1085 rinv30 = gmx_mm_invsqrt_ps(rsq30);
1087 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1088 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1089 rinvsq30 = _mm_mul_ps(rinv30,rinv30);
1091 /* Load parameters for j particles */
1092 jq0 = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1093 charge+jnrC+0,charge+jnrD+0);
1094 vdwjidx0A = 2*vdwtype[jnrA+0];
1095 vdwjidx0B = 2*vdwtype[jnrB+0];
1096 vdwjidx0C = 2*vdwtype[jnrC+0];
1097 vdwjidx0D = 2*vdwtype[jnrD+0];
1099 fjx0 = _mm_setzero_ps();
1100 fjy0 = _mm_setzero_ps();
1101 fjz0 = _mm_setzero_ps();
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 r00 = _mm_mul_ps(rsq00,rinv00);
1108 r00 = _mm_andnot_ps(dummy_mask,r00);
1110 /* Compute parameters for interactions between i and j atoms */
1111 gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1112 vdwparam+vdwioffset0+vdwjidx0B,
1113 vdwparam+vdwioffset0+vdwjidx0C,
1114 vdwparam+vdwioffset0+vdwjidx0D,
1115 &c6_00,&c12_00);
1117 /* Calculate table index by multiplying r with table scale and truncate to integer */
1118 rt = _mm_mul_ps(r00,vftabscale);
1119 vfitab = _mm_cvttps_epi32(rt);
1120 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1121 vfitab = _mm_slli_epi32(vfitab,3);
1123 /* CUBIC SPLINE TABLE DISPERSION */
1124 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1125 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1126 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1127 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1128 _MM_TRANSPOSE4_PS(Y,F,G,H);
1129 Heps = _mm_mul_ps(vfeps,H);
1130 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1131 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1132 fvdw6 = _mm_mul_ps(c6_00,FF);
1134 /* CUBIC SPLINE TABLE REPULSION */
1135 vfitab = _mm_add_epi32(vfitab,ifour);
1136 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1137 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1138 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1139 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1140 _MM_TRANSPOSE4_PS(Y,F,G,H);
1141 Heps = _mm_mul_ps(vfeps,H);
1142 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1143 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1144 fvdw12 = _mm_mul_ps(c12_00,FF);
1145 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1147 fscal = fvdw;
1149 fscal = _mm_andnot_ps(dummy_mask,fscal);
1151 /* Calculate temporary vectorial force */
1152 tx = _mm_mul_ps(fscal,dx00);
1153 ty = _mm_mul_ps(fscal,dy00);
1154 tz = _mm_mul_ps(fscal,dz00);
1156 /* Update vectorial force */
1157 fix0 = _mm_add_ps(fix0,tx);
1158 fiy0 = _mm_add_ps(fiy0,ty);
1159 fiz0 = _mm_add_ps(fiz0,tz);
1161 fjx0 = _mm_add_ps(fjx0,tx);
1162 fjy0 = _mm_add_ps(fjy0,ty);
1163 fjz0 = _mm_add_ps(fjz0,tz);
1165 /**************************
1166 * CALCULATE INTERACTIONS *
1167 **************************/
1169 /* Compute parameters for interactions between i and j atoms */
1170 qq10 = _mm_mul_ps(iq1,jq0);
1172 /* COULOMB ELECTROSTATICS */
1173 velec = _mm_mul_ps(qq10,rinv10);
1174 felec = _mm_mul_ps(velec,rinvsq10);
1176 fscal = felec;
1178 fscal = _mm_andnot_ps(dummy_mask,fscal);
1180 /* Calculate temporary vectorial force */
1181 tx = _mm_mul_ps(fscal,dx10);
1182 ty = _mm_mul_ps(fscal,dy10);
1183 tz = _mm_mul_ps(fscal,dz10);
1185 /* Update vectorial force */
1186 fix1 = _mm_add_ps(fix1,tx);
1187 fiy1 = _mm_add_ps(fiy1,ty);
1188 fiz1 = _mm_add_ps(fiz1,tz);
1190 fjx0 = _mm_add_ps(fjx0,tx);
1191 fjy0 = _mm_add_ps(fjy0,ty);
1192 fjz0 = _mm_add_ps(fjz0,tz);
1194 /**************************
1195 * CALCULATE INTERACTIONS *
1196 **************************/
1198 /* Compute parameters for interactions between i and j atoms */
1199 qq20 = _mm_mul_ps(iq2,jq0);
1201 /* COULOMB ELECTROSTATICS */
1202 velec = _mm_mul_ps(qq20,rinv20);
1203 felec = _mm_mul_ps(velec,rinvsq20);
1205 fscal = felec;
1207 fscal = _mm_andnot_ps(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm_mul_ps(fscal,dx20);
1211 ty = _mm_mul_ps(fscal,dy20);
1212 tz = _mm_mul_ps(fscal,dz20);
1214 /* Update vectorial force */
1215 fix2 = _mm_add_ps(fix2,tx);
1216 fiy2 = _mm_add_ps(fiy2,ty);
1217 fiz2 = _mm_add_ps(fiz2,tz);
1219 fjx0 = _mm_add_ps(fjx0,tx);
1220 fjy0 = _mm_add_ps(fjy0,ty);
1221 fjz0 = _mm_add_ps(fjz0,tz);
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 /* Compute parameters for interactions between i and j atoms */
1228 qq30 = _mm_mul_ps(iq3,jq0);
1230 /* COULOMB ELECTROSTATICS */
1231 velec = _mm_mul_ps(qq30,rinv30);
1232 felec = _mm_mul_ps(velec,rinvsq30);
1234 fscal = felec;
1236 fscal = _mm_andnot_ps(dummy_mask,fscal);
1238 /* Calculate temporary vectorial force */
1239 tx = _mm_mul_ps(fscal,dx30);
1240 ty = _mm_mul_ps(fscal,dy30);
1241 tz = _mm_mul_ps(fscal,dz30);
1243 /* Update vectorial force */
1244 fix3 = _mm_add_ps(fix3,tx);
1245 fiy3 = _mm_add_ps(fiy3,ty);
1246 fiz3 = _mm_add_ps(fiz3,tz);
1248 fjx0 = _mm_add_ps(fjx0,tx);
1249 fjy0 = _mm_add_ps(fjy0,ty);
1250 fjz0 = _mm_add_ps(fjz0,tz);
1252 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1253 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1254 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1255 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1257 gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1259 /* Inner loop uses 130 flops */
1262 /* End of innermost loop */
1264 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1265 f+i_coord_offset,fshift+i_shift_offset);
1267 /* Increment number of inner iterations */
1268 inneriter += j_index_end - j_index_start;
1270 /* Outer loop uses 24 flops */
1273 /* Increment number of outer iterations */
1274 outeriter += nri;
1276 /* Update outer/inner flops */
1278 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*130);