2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single
53 * Electrostatics interaction: Coulomb
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
/*
 * Kernel entry point; the description lines directly above list its variant
 * (Coulomb electrostatics, Lennard-Jones VdW, Water4-Water4 geometry,
 * potential and force).
 *
 * Parameters, as used by the statements visible in this view:
 *   nlist       - neighbor list; its jindex, shift and iinr members are read.
 *   xx, ff      - rvec arrays. NOTE(review): presumably coordinates and forces;
 *                 their use is not visible in this view - confirm.
 *   fr          - force record; shift_vec, fshift, epsfac and ntype are read.
 *   mdatoms     - per-atom data; chargeA and typeA are read.
 *   kernel_data - energygrp_elec and energygrp_vdw are passed to the
 *                 potential-update helper near the end of the function.
 *   nrnb        - passed to inc_nrnb() at the end of the function.
 *
 * NOTE(review): this view looks like a lossy extraction of the generated file.
 * The return type, the braces and the declarations of several names used below
 * (scratch, charge, vdwtype, vdwparam, vdwioffset0, nvdwtype) are not visible.
 * Regenerate from the kernel generator rather than editing by hand.
 */
59 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
91 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
92 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
93 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
94 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
95 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
96 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
97 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
98 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
99 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
100 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
101 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
102 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
103 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
104 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
105 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
106 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
107 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
108 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
109 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
110 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
113 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
116 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
117 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
118 __m128 dummy_mask
,cutoff_mask
;
119 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120 __m128 one
= _mm_set1_ps(1.0);
121 __m128 two
= _mm_set1_ps(2.0);
127 jindex
= nlist
->jindex
;
129 shiftidx
= nlist
->shift
;
131 shiftvec
= fr
->shift_vec
[0];
132 fshift
= fr
->fshift
[0];
133 facel
= _mm_set1_ps(fr
->epsfac
);
134 charge
= mdatoms
->chargeA
;
135 nvdwtype
= fr
->ntype
;
137 vdwtype
= mdatoms
->typeA
;
139 /* Setup water-specific parameters */
/* The i-water parameters are taken once, from the first list entry
 * nlist->iinr[0]: atoms inr+1..inr+3 supply the charges (each scaled by facel)
 * and atom inr+0 supplies the VdW type used for the parameter-table offset. */
140 inr
= nlist
->iinr
[0];
141 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
142 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
143 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
144 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
/* The j-water parameters are read from the same atoms (inr+0..inr+3) as the
 * i-water parameters above; c6_00 and c12_00 are adjacent entries of vdwparam. */
146 jq1
= _mm_set1_ps(charge
[inr
+1]);
147 jq2
= _mm_set1_ps(charge
[inr
+2]);
148 jq3
= _mm_set1_ps(charge
[inr
+3]);
149 vdwjidx0A
= 2*vdwtype
[inr
+0];
150 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
151 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
/* Precompute the nine charge products between sites 1-3 of the i-water and
 * sites 1-3 of the j-water (the iq values already include facel). */
152 qq11
= _mm_mul_ps(iq1
,jq1
);
153 qq12
= _mm_mul_ps(iq1
,jq2
);
154 qq13
= _mm_mul_ps(iq1
,jq3
);
155 qq21
= _mm_mul_ps(iq2
,jq1
);
156 qq22
= _mm_mul_ps(iq2
,jq2
);
157 qq23
= _mm_mul_ps(iq2
,jq3
);
158 qq31
= _mm_mul_ps(iq3
,jq1
);
159 qq32
= _mm_mul_ps(iq3
,jq2
);
160 qq33
= _mm_mul_ps(iq3
,jq3
);
162 /* Avoid stupid compiler warnings */
163 jnrA
= jnrB
= jnrC
= jnrD
= 0;
/* NOTE(review): the body of the following loop over 4*DIM entries is not
 * visible in this view - confirm against the generated original. */
172 for(iidx
=0;iidx
<4*DIM
;iidx
++)
177 /* Start outer loop over neighborlists */
/* One iteration per list entry, iidx = 0..nri-1.
 * NOTE(review): nri is not assigned in the visible lines of this function. */
178 for(iidx
=0; iidx
<nri
; iidx
++)
180 /* Load shift vector for this list */
181 i_shift_offset
= DIM
*shiftidx
[iidx
];
183 /* Load limits for loop over neighbors */
184 j_index_start
= jindex
[iidx
];
185 j_index_end
= jindex
[iidx
+1];
187 /* Get outer coordinate index */
/* NOTE(review): inr is not reassigned in the visible lines of this loop; the
 * only visible assignment is the one from nlist->iinr[0] during setup. */
189 i_coord_offset
= DIM
*inr
;
191 /* Load i particle coords and add shift vector */
192 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
193 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
/* Clear the force accumulators of the four i-sites before the j loop. */
195 fix0
= _mm_setzero_ps();
196 fiy0
= _mm_setzero_ps();
197 fiz0
= _mm_setzero_ps();
198 fix1
= _mm_setzero_ps();
199 fiy1
= _mm_setzero_ps();
200 fiz1
= _mm_setzero_ps();
201 fix2
= _mm_setzero_ps();
202 fiy2
= _mm_setzero_ps();
203 fiz2
= _mm_setzero_ps();
204 fix3
= _mm_setzero_ps();
205 fiy3
= _mm_setzero_ps();
206 fiz3
= _mm_setzero_ps();
208 /* Reset potential sums */
209 velecsum
= _mm_setzero_ps();
210 vvdwsum
= _mm_setzero_ps();
212 /* Start inner kernel loop */
/* Advances four j entries per iteration and stops at the first group whose
 * fourth entry, jjnr[jidx+3], is negative; no masking is applied in this loop. */
213 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
216 /* Get j neighbor index, and coordinate index */
/* NOTE(review): jnrA..jnrD are not assigned in the visible lines of this loop. */
221 j_coord_offsetA
= DIM
*jnrA
;
222 j_coord_offsetB
= DIM
*jnrB
;
223 j_coord_offsetC
= DIM
*jnrC
;
224 j_coord_offsetD
= DIM
*jnrD
;
226 /* load j atom coordinates */
227 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
228 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
229 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
230 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
232 /* Calculate displacement vector */
/* Ten site pairs are handled: 0-0 (used for Lennard-Jones below) and the nine
 * combinations of i-sites 1-3 with j-sites 1-3 (used for Coulomb below). */
233 dx00
= _mm_sub_ps(ix0
,jx0
);
234 dy00
= _mm_sub_ps(iy0
,jy0
);
235 dz00
= _mm_sub_ps(iz0
,jz0
);
236 dx11
= _mm_sub_ps(ix1
,jx1
);
237 dy11
= _mm_sub_ps(iy1
,jy1
);
238 dz11
= _mm_sub_ps(iz1
,jz1
);
239 dx12
= _mm_sub_ps(ix1
,jx2
);
240 dy12
= _mm_sub_ps(iy1
,jy2
);
241 dz12
= _mm_sub_ps(iz1
,jz2
);
242 dx13
= _mm_sub_ps(ix1
,jx3
);
243 dy13
= _mm_sub_ps(iy1
,jy3
);
244 dz13
= _mm_sub_ps(iz1
,jz3
);
245 dx21
= _mm_sub_ps(ix2
,jx1
);
246 dy21
= _mm_sub_ps(iy2
,jy1
);
247 dz21
= _mm_sub_ps(iz2
,jz1
);
248 dx22
= _mm_sub_ps(ix2
,jx2
);
249 dy22
= _mm_sub_ps(iy2
,jy2
);
250 dz22
= _mm_sub_ps(iz2
,jz2
);
251 dx23
= _mm_sub_ps(ix2
,jx3
);
252 dy23
= _mm_sub_ps(iy2
,jy3
);
253 dz23
= _mm_sub_ps(iz2
,jz3
);
254 dx31
= _mm_sub_ps(ix3
,jx1
);
255 dy31
= _mm_sub_ps(iy3
,jy1
);
256 dz31
= _mm_sub_ps(iz3
,jz1
);
257 dx32
= _mm_sub_ps(ix3
,jx2
);
258 dy32
= _mm_sub_ps(iy3
,jy2
);
259 dz32
= _mm_sub_ps(iz3
,jz2
);
260 dx33
= _mm_sub_ps(ix3
,jx3
);
261 dy33
= _mm_sub_ps(iy3
,jy3
);
262 dz33
= _mm_sub_ps(iz3
,jz3
);
264 /* Calculate squared distance and things based on it */
265 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
266 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
267 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
268 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
269 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
270 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
271 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
272 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
273 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
274 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
/* The 0-0 pair gets no rinv: its rinvsq00 comes straight from
 * gmx_mm_inv_ps(rsq00), while the other pairs square their rinv. */
276 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
277 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
278 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
279 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
280 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
281 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
282 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
283 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
284 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
286 rinvsq00
= gmx_mm_inv_ps(rsq00
);
287 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
288 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
289 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
290 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
291 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
292 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
293 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
294 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
295 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
/* Clear the per-iteration j-site force accumulators. */
297 fjx0
= _mm_setzero_ps();
298 fjy0
= _mm_setzero_ps();
299 fjz0
= _mm_setzero_ps();
300 fjx1
= _mm_setzero_ps();
301 fjy1
= _mm_setzero_ps();
302 fjz1
= _mm_setzero_ps();
303 fjx2
= _mm_setzero_ps();
304 fjy2
= _mm_setzero_ps();
305 fjz2
= _mm_setzero_ps();
306 fjx3
= _mm_setzero_ps();
307 fjy3
= _mm_setzero_ps();
308 fjz3
= _mm_setzero_ps();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 /* LENNARD-JONES DISPERSION/REPULSION */
/* With rinvsix = rinvsq00^3: vvdw6 = c6_00*rinvsix, vvdw12 = c12_00*rinvsix^2,
 * vvdw = vvdw12/12 - vvdw6/6 and fvdw = (vvdw12 - vvdw6)*rinvsq00. */
316 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
317 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
318 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
319 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
320 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
322 /* Update potential sum for this i atom from the interaction with this j atom. */
323 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
/* NOTE(review): fscal is not assigned in the visible lines before this use,
 * here or in the Coulomb sections below - confirm against the generated original. */
327 /* Calculate temporary vectorial force */
328 tx
= _mm_mul_ps(fscal
,dx00
);
329 ty
= _mm_mul_ps(fscal
,dy00
);
330 tz
= _mm_mul_ps(fscal
,dz00
);
332 /* Update vectorial force */
/* The same tx,ty,tz is added to both the i-site and the j-site accumulator;
 * the j-site sums are later handed to gmx_mm_decrement_4rvec_4ptr_swizzle_ps. */
333 fix0
= _mm_add_ps(fix0
,tx
);
334 fiy0
= _mm_add_ps(fiy0
,ty
);
335 fiz0
= _mm_add_ps(fiz0
,tz
);
337 fjx0
= _mm_add_ps(fjx0
,tx
);
338 fjy0
= _mm_add_ps(fjy0
,ty
);
339 fjz0
= _mm_add_ps(fjz0
,tz
);
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 /* COULOMB ELECTROSTATICS */
/* For each of the nine pairs below: velec = qq*rinv and felec = velec*rinvsq. */
346 velec
= _mm_mul_ps(qq11
,rinv11
);
347 felec
= _mm_mul_ps(velec
,rinvsq11
);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum
= _mm_add_ps(velecsum
,velec
);
354 /* Calculate temporary vectorial force */
355 tx
= _mm_mul_ps(fscal
,dx11
);
356 ty
= _mm_mul_ps(fscal
,dy11
);
357 tz
= _mm_mul_ps(fscal
,dz11
);
359 /* Update vectorial force */
360 fix1
= _mm_add_ps(fix1
,tx
);
361 fiy1
= _mm_add_ps(fiy1
,ty
);
362 fiz1
= _mm_add_ps(fiz1
,tz
);
364 fjx1
= _mm_add_ps(fjx1
,tx
);
365 fjy1
= _mm_add_ps(fjy1
,ty
);
366 fjz1
= _mm_add_ps(fjz1
,tz
);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 /* COULOMB ELECTROSTATICS */
373 velec
= _mm_mul_ps(qq12
,rinv12
);
374 felec
= _mm_mul_ps(velec
,rinvsq12
);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum
= _mm_add_ps(velecsum
,velec
);
381 /* Calculate temporary vectorial force */
382 tx
= _mm_mul_ps(fscal
,dx12
);
383 ty
= _mm_mul_ps(fscal
,dy12
);
384 tz
= _mm_mul_ps(fscal
,dz12
);
386 /* Update vectorial force */
387 fix1
= _mm_add_ps(fix1
,tx
);
388 fiy1
= _mm_add_ps(fiy1
,ty
);
389 fiz1
= _mm_add_ps(fiz1
,tz
);
391 fjx2
= _mm_add_ps(fjx2
,tx
);
392 fjy2
= _mm_add_ps(fjy2
,ty
);
393 fjz2
= _mm_add_ps(fjz2
,tz
);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 /* COULOMB ELECTROSTATICS */
400 velec
= _mm_mul_ps(qq13
,rinv13
);
401 felec
= _mm_mul_ps(velec
,rinvsq13
);
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum
= _mm_add_ps(velecsum
,velec
);
408 /* Calculate temporary vectorial force */
409 tx
= _mm_mul_ps(fscal
,dx13
);
410 ty
= _mm_mul_ps(fscal
,dy13
);
411 tz
= _mm_mul_ps(fscal
,dz13
);
413 /* Update vectorial force */
414 fix1
= _mm_add_ps(fix1
,tx
);
415 fiy1
= _mm_add_ps(fiy1
,ty
);
416 fiz1
= _mm_add_ps(fiz1
,tz
);
418 fjx3
= _mm_add_ps(fjx3
,tx
);
419 fjy3
= _mm_add_ps(fjy3
,ty
);
420 fjz3
= _mm_add_ps(fjz3
,tz
);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 /* COULOMB ELECTROSTATICS */
427 velec
= _mm_mul_ps(qq21
,rinv21
);
428 felec
= _mm_mul_ps(velec
,rinvsq21
);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum
= _mm_add_ps(velecsum
,velec
);
435 /* Calculate temporary vectorial force */
436 tx
= _mm_mul_ps(fscal
,dx21
);
437 ty
= _mm_mul_ps(fscal
,dy21
);
438 tz
= _mm_mul_ps(fscal
,dz21
);
440 /* Update vectorial force */
441 fix2
= _mm_add_ps(fix2
,tx
);
442 fiy2
= _mm_add_ps(fiy2
,ty
);
443 fiz2
= _mm_add_ps(fiz2
,tz
);
445 fjx1
= _mm_add_ps(fjx1
,tx
);
446 fjy1
= _mm_add_ps(fjy1
,ty
);
447 fjz1
= _mm_add_ps(fjz1
,tz
);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 /* COULOMB ELECTROSTATICS */
454 velec
= _mm_mul_ps(qq22
,rinv22
);
455 felec
= _mm_mul_ps(velec
,rinvsq22
);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum
= _mm_add_ps(velecsum
,velec
);
462 /* Calculate temporary vectorial force */
463 tx
= _mm_mul_ps(fscal
,dx22
);
464 ty
= _mm_mul_ps(fscal
,dy22
);
465 tz
= _mm_mul_ps(fscal
,dz22
);
467 /* Update vectorial force */
468 fix2
= _mm_add_ps(fix2
,tx
);
469 fiy2
= _mm_add_ps(fiy2
,ty
);
470 fiz2
= _mm_add_ps(fiz2
,tz
);
472 fjx2
= _mm_add_ps(fjx2
,tx
);
473 fjy2
= _mm_add_ps(fjy2
,ty
);
474 fjz2
= _mm_add_ps(fjz2
,tz
);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 /* COULOMB ELECTROSTATICS */
481 velec
= _mm_mul_ps(qq23
,rinv23
);
482 felec
= _mm_mul_ps(velec
,rinvsq23
);
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum
= _mm_add_ps(velecsum
,velec
);
489 /* Calculate temporary vectorial force */
490 tx
= _mm_mul_ps(fscal
,dx23
);
491 ty
= _mm_mul_ps(fscal
,dy23
);
492 tz
= _mm_mul_ps(fscal
,dz23
);
494 /* Update vectorial force */
495 fix2
= _mm_add_ps(fix2
,tx
);
496 fiy2
= _mm_add_ps(fiy2
,ty
);
497 fiz2
= _mm_add_ps(fiz2
,tz
);
499 fjx3
= _mm_add_ps(fjx3
,tx
);
500 fjy3
= _mm_add_ps(fjy3
,ty
);
501 fjz3
= _mm_add_ps(fjz3
,tz
);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 /* COULOMB ELECTROSTATICS */
508 velec
= _mm_mul_ps(qq31
,rinv31
);
509 felec
= _mm_mul_ps(velec
,rinvsq31
);
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velecsum
= _mm_add_ps(velecsum
,velec
);
516 /* Calculate temporary vectorial force */
517 tx
= _mm_mul_ps(fscal
,dx31
);
518 ty
= _mm_mul_ps(fscal
,dy31
);
519 tz
= _mm_mul_ps(fscal
,dz31
);
521 /* Update vectorial force */
522 fix3
= _mm_add_ps(fix3
,tx
);
523 fiy3
= _mm_add_ps(fiy3
,ty
);
524 fiz3
= _mm_add_ps(fiz3
,tz
);
526 fjx1
= _mm_add_ps(fjx1
,tx
);
527 fjy1
= _mm_add_ps(fjy1
,ty
);
528 fjz1
= _mm_add_ps(fjz1
,tz
);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 /* COULOMB ELECTROSTATICS */
535 velec
= _mm_mul_ps(qq32
,rinv32
);
536 felec
= _mm_mul_ps(velec
,rinvsq32
);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum
= _mm_add_ps(velecsum
,velec
);
543 /* Calculate temporary vectorial force */
544 tx
= _mm_mul_ps(fscal
,dx32
);
545 ty
= _mm_mul_ps(fscal
,dy32
);
546 tz
= _mm_mul_ps(fscal
,dz32
);
548 /* Update vectorial force */
549 fix3
= _mm_add_ps(fix3
,tx
);
550 fiy3
= _mm_add_ps(fiy3
,ty
);
551 fiz3
= _mm_add_ps(fiz3
,tz
);
553 fjx2
= _mm_add_ps(fjx2
,tx
);
554 fjy2
= _mm_add_ps(fjy2
,ty
);
555 fjz2
= _mm_add_ps(fjz2
,tz
);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 /* COULOMB ELECTROSTATICS */
562 velec
= _mm_mul_ps(qq33
,rinv33
);
563 felec
= _mm_mul_ps(velec
,rinvsq33
);
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum
= _mm_add_ps(velecsum
,velec
);
570 /* Calculate temporary vectorial force */
571 tx
= _mm_mul_ps(fscal
,dx33
);
572 ty
= _mm_mul_ps(fscal
,dy33
);
573 tz
= _mm_mul_ps(fscal
,dz33
);
575 /* Update vectorial force */
576 fix3
= _mm_add_ps(fix3
,tx
);
577 fiy3
= _mm_add_ps(fiy3
,ty
);
578 fiz3
= _mm_add_ps(fiz3
,tz
);
580 fjx3
= _mm_add_ps(fjx3
,tx
);
581 fjy3
= _mm_add_ps(fjy3
,ty
);
582 fjz3
= _mm_add_ps(fjz3
,tz
);
/* All four j entries are real in this loop, so their accumulated forces go
 * directly to f at each entry's coordinate offset. */
584 fjptrA
= f
+j_coord_offsetA
;
585 fjptrB
= f
+j_coord_offsetB
;
586 fjptrC
= f
+j_coord_offsetC
;
587 fjptrD
= f
+j_coord_offsetD
;
589 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
590 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
591 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
593 /* Inner loop uses 287 flops */
599 /* Get j neighbor index, and coordinate index */
/* Masked pass over one group of four j entries in which negative list entries
 * are treated as dummies: their index is clamped to 0 for the coordinate load,
 * their potential and fscal lanes are cleared with dummy_mask, and their force
 * updates are redirected to scratch instead of f.
 * NOTE(review): the control flow that leads into this section is not visible
 * in this view - confirm against the generated original. */
600 jnrlistA
= jjnr
[jidx
];
601 jnrlistB
= jjnr
[jidx
+1];
602 jnrlistC
= jjnr
[jidx
+2];
603 jnrlistD
= jjnr
[jidx
+3];
604 /* Sign of each element will be negative for non-real atoms.
605 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
606 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
608 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
609 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
610 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
611 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
612 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
613 j_coord_offsetA
= DIM
*jnrA
;
614 j_coord_offsetB
= DIM
*jnrB
;
615 j_coord_offsetC
= DIM
*jnrC
;
616 j_coord_offsetD
= DIM
*jnrD
;
618 /* load j atom coordinates */
619 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
620 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
621 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
622 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
624 /* Calculate displacement vector */
625 dx00
= _mm_sub_ps(ix0
,jx0
);
626 dy00
= _mm_sub_ps(iy0
,jy0
);
627 dz00
= _mm_sub_ps(iz0
,jz0
);
628 dx11
= _mm_sub_ps(ix1
,jx1
);
629 dy11
= _mm_sub_ps(iy1
,jy1
);
630 dz11
= _mm_sub_ps(iz1
,jz1
);
631 dx12
= _mm_sub_ps(ix1
,jx2
);
632 dy12
= _mm_sub_ps(iy1
,jy2
);
633 dz12
= _mm_sub_ps(iz1
,jz2
);
634 dx13
= _mm_sub_ps(ix1
,jx3
);
635 dy13
= _mm_sub_ps(iy1
,jy3
);
636 dz13
= _mm_sub_ps(iz1
,jz3
);
637 dx21
= _mm_sub_ps(ix2
,jx1
);
638 dy21
= _mm_sub_ps(iy2
,jy1
);
639 dz21
= _mm_sub_ps(iz2
,jz1
);
640 dx22
= _mm_sub_ps(ix2
,jx2
);
641 dy22
= _mm_sub_ps(iy2
,jy2
);
642 dz22
= _mm_sub_ps(iz2
,jz2
);
643 dx23
= _mm_sub_ps(ix2
,jx3
);
644 dy23
= _mm_sub_ps(iy2
,jy3
);
645 dz23
= _mm_sub_ps(iz2
,jz3
);
646 dx31
= _mm_sub_ps(ix3
,jx1
);
647 dy31
= _mm_sub_ps(iy3
,jy1
);
648 dz31
= _mm_sub_ps(iz3
,jz1
);
649 dx32
= _mm_sub_ps(ix3
,jx2
);
650 dy32
= _mm_sub_ps(iy3
,jy2
);
651 dz32
= _mm_sub_ps(iz3
,jz2
);
652 dx33
= _mm_sub_ps(ix3
,jx3
);
653 dy33
= _mm_sub_ps(iy3
,jy3
);
654 dz33
= _mm_sub_ps(iz3
,jz3
);
656 /* Calculate squared distance and things based on it */
657 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
658 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
659 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
660 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
661 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
662 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
663 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
664 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
665 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
666 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
668 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
669 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
670 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
671 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
672 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
673 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
674 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
675 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
676 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
678 rinvsq00
= gmx_mm_inv_ps(rsq00
);
679 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
680 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
681 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
682 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
683 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
684 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
685 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
686 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
687 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
689 fjx0
= _mm_setzero_ps();
690 fjy0
= _mm_setzero_ps();
691 fjz0
= _mm_setzero_ps();
692 fjx1
= _mm_setzero_ps();
693 fjy1
= _mm_setzero_ps();
694 fjz1
= _mm_setzero_ps();
695 fjx2
= _mm_setzero_ps();
696 fjy2
= _mm_setzero_ps();
697 fjz2
= _mm_setzero_ps();
698 fjx3
= _mm_setzero_ps();
699 fjy3
= _mm_setzero_ps();
700 fjz3
= _mm_setzero_ps();
702 /**************************
703 * CALCULATE INTERACTIONS *
704 **************************/
706 /* LENNARD-JONES DISPERSION/REPULSION */
708 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
709 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
710 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
711 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
712 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
714 /* Update potential sum for this i atom from the interaction with this j atom. */
715 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
716 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
/* Clear fscal in the dummy lanes so those entries contribute no force.
 * NOTE(review): no assignment of fscal from fvdw/felec is visible before this
 * mask, here or in the Coulomb sections below - confirm against the original. */
720 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
722 /* Calculate temporary vectorial force */
723 tx
= _mm_mul_ps(fscal
,dx00
);
724 ty
= _mm_mul_ps(fscal
,dy00
);
725 tz
= _mm_mul_ps(fscal
,dz00
);
727 /* Update vectorial force */
728 fix0
= _mm_add_ps(fix0
,tx
);
729 fiy0
= _mm_add_ps(fiy0
,ty
);
730 fiz0
= _mm_add_ps(fiz0
,tz
);
732 fjx0
= _mm_add_ps(fjx0
,tx
);
733 fjy0
= _mm_add_ps(fjy0
,ty
);
734 fjz0
= _mm_add_ps(fjz0
,tz
);
736 /**************************
737 * CALCULATE INTERACTIONS *
738 **************************/
740 /* COULOMB ELECTROSTATICS */
/* Same pattern for each of the nine pairs below: velec = qq*rinv,
 * felec = velec*rinvsq, then velec and fscal are masked with dummy_mask. */
741 velec
= _mm_mul_ps(qq11
,rinv11
);
742 felec
= _mm_mul_ps(velec
,rinvsq11
);
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velec
= _mm_andnot_ps(dummy_mask
,velec
);
746 velecsum
= _mm_add_ps(velecsum
,velec
);
750 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
752 /* Calculate temporary vectorial force */
753 tx
= _mm_mul_ps(fscal
,dx11
);
754 ty
= _mm_mul_ps(fscal
,dy11
);
755 tz
= _mm_mul_ps(fscal
,dz11
);
757 /* Update vectorial force */
758 fix1
= _mm_add_ps(fix1
,tx
);
759 fiy1
= _mm_add_ps(fiy1
,ty
);
760 fiz1
= _mm_add_ps(fiz1
,tz
);
762 fjx1
= _mm_add_ps(fjx1
,tx
);
763 fjy1
= _mm_add_ps(fjy1
,ty
);
764 fjz1
= _mm_add_ps(fjz1
,tz
);
766 /**************************
767 * CALCULATE INTERACTIONS *
768 **************************/
770 /* COULOMB ELECTROSTATICS */
771 velec
= _mm_mul_ps(qq12
,rinv12
);
772 felec
= _mm_mul_ps(velec
,rinvsq12
);
774 /* Update potential sum for this i atom from the interaction with this j atom. */
775 velec
= _mm_andnot_ps(dummy_mask
,velec
);
776 velecsum
= _mm_add_ps(velecsum
,velec
);
780 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
782 /* Calculate temporary vectorial force */
783 tx
= _mm_mul_ps(fscal
,dx12
);
784 ty
= _mm_mul_ps(fscal
,dy12
);
785 tz
= _mm_mul_ps(fscal
,dz12
);
787 /* Update vectorial force */
788 fix1
= _mm_add_ps(fix1
,tx
);
789 fiy1
= _mm_add_ps(fiy1
,ty
);
790 fiz1
= _mm_add_ps(fiz1
,tz
);
792 fjx2
= _mm_add_ps(fjx2
,tx
);
793 fjy2
= _mm_add_ps(fjy2
,ty
);
794 fjz2
= _mm_add_ps(fjz2
,tz
);
796 /**************************
797 * CALCULATE INTERACTIONS *
798 **************************/
800 /* COULOMB ELECTROSTATICS */
801 velec
= _mm_mul_ps(qq13
,rinv13
);
802 felec
= _mm_mul_ps(velec
,rinvsq13
);
804 /* Update potential sum for this i atom from the interaction with this j atom. */
805 velec
= _mm_andnot_ps(dummy_mask
,velec
);
806 velecsum
= _mm_add_ps(velecsum
,velec
);
810 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
812 /* Calculate temporary vectorial force */
813 tx
= _mm_mul_ps(fscal
,dx13
);
814 ty
= _mm_mul_ps(fscal
,dy13
);
815 tz
= _mm_mul_ps(fscal
,dz13
);
817 /* Update vectorial force */
818 fix1
= _mm_add_ps(fix1
,tx
);
819 fiy1
= _mm_add_ps(fiy1
,ty
);
820 fiz1
= _mm_add_ps(fiz1
,tz
);
822 fjx3
= _mm_add_ps(fjx3
,tx
);
823 fjy3
= _mm_add_ps(fjy3
,ty
);
824 fjz3
= _mm_add_ps(fjz3
,tz
);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 /* COULOMB ELECTROSTATICS */
831 velec
= _mm_mul_ps(qq21
,rinv21
);
832 felec
= _mm_mul_ps(velec
,rinvsq21
);
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velec
= _mm_andnot_ps(dummy_mask
,velec
);
836 velecsum
= _mm_add_ps(velecsum
,velec
);
840 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
842 /* Calculate temporary vectorial force */
843 tx
= _mm_mul_ps(fscal
,dx21
);
844 ty
= _mm_mul_ps(fscal
,dy21
);
845 tz
= _mm_mul_ps(fscal
,dz21
);
847 /* Update vectorial force */
848 fix2
= _mm_add_ps(fix2
,tx
);
849 fiy2
= _mm_add_ps(fiy2
,ty
);
850 fiz2
= _mm_add_ps(fiz2
,tz
);
852 fjx1
= _mm_add_ps(fjx1
,tx
);
853 fjy1
= _mm_add_ps(fjy1
,ty
);
854 fjz1
= _mm_add_ps(fjz1
,tz
);
856 /**************************
857 * CALCULATE INTERACTIONS *
858 **************************/
860 /* COULOMB ELECTROSTATICS */
861 velec
= _mm_mul_ps(qq22
,rinv22
);
862 felec
= _mm_mul_ps(velec
,rinvsq22
);
864 /* Update potential sum for this i atom from the interaction with this j atom. */
865 velec
= _mm_andnot_ps(dummy_mask
,velec
);
866 velecsum
= _mm_add_ps(velecsum
,velec
);
870 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
872 /* Calculate temporary vectorial force */
873 tx
= _mm_mul_ps(fscal
,dx22
);
874 ty
= _mm_mul_ps(fscal
,dy22
);
875 tz
= _mm_mul_ps(fscal
,dz22
);
877 /* Update vectorial force */
878 fix2
= _mm_add_ps(fix2
,tx
);
879 fiy2
= _mm_add_ps(fiy2
,ty
);
880 fiz2
= _mm_add_ps(fiz2
,tz
);
882 fjx2
= _mm_add_ps(fjx2
,tx
);
883 fjy2
= _mm_add_ps(fjy2
,ty
);
884 fjz2
= _mm_add_ps(fjz2
,tz
);
886 /**************************
887 * CALCULATE INTERACTIONS *
888 **************************/
890 /* COULOMB ELECTROSTATICS */
891 velec
= _mm_mul_ps(qq23
,rinv23
);
892 felec
= _mm_mul_ps(velec
,rinvsq23
);
894 /* Update potential sum for this i atom from the interaction with this j atom. */
895 velec
= _mm_andnot_ps(dummy_mask
,velec
);
896 velecsum
= _mm_add_ps(velecsum
,velec
);
900 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
902 /* Calculate temporary vectorial force */
903 tx
= _mm_mul_ps(fscal
,dx23
);
904 ty
= _mm_mul_ps(fscal
,dy23
);
905 tz
= _mm_mul_ps(fscal
,dz23
);
907 /* Update vectorial force */
908 fix2
= _mm_add_ps(fix2
,tx
);
909 fiy2
= _mm_add_ps(fiy2
,ty
);
910 fiz2
= _mm_add_ps(fiz2
,tz
);
912 fjx3
= _mm_add_ps(fjx3
,tx
);
913 fjy3
= _mm_add_ps(fjy3
,ty
);
914 fjz3
= _mm_add_ps(fjz3
,tz
);
916 /**************************
917 * CALCULATE INTERACTIONS *
918 **************************/
920 /* COULOMB ELECTROSTATICS */
921 velec
= _mm_mul_ps(qq31
,rinv31
);
922 felec
= _mm_mul_ps(velec
,rinvsq31
);
924 /* Update potential sum for this i atom from the interaction with this j atom. */
925 velec
= _mm_andnot_ps(dummy_mask
,velec
);
926 velecsum
= _mm_add_ps(velecsum
,velec
);
930 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
932 /* Calculate temporary vectorial force */
933 tx
= _mm_mul_ps(fscal
,dx31
);
934 ty
= _mm_mul_ps(fscal
,dy31
);
935 tz
= _mm_mul_ps(fscal
,dz31
);
937 /* Update vectorial force */
938 fix3
= _mm_add_ps(fix3
,tx
);
939 fiy3
= _mm_add_ps(fiy3
,ty
);
940 fiz3
= _mm_add_ps(fiz3
,tz
);
942 fjx1
= _mm_add_ps(fjx1
,tx
);
943 fjy1
= _mm_add_ps(fjy1
,ty
);
944 fjz1
= _mm_add_ps(fjz1
,tz
);
946 /**************************
947 * CALCULATE INTERACTIONS *
948 **************************/
950 /* COULOMB ELECTROSTATICS */
951 velec
= _mm_mul_ps(qq32
,rinv32
);
952 felec
= _mm_mul_ps(velec
,rinvsq32
);
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec
= _mm_andnot_ps(dummy_mask
,velec
);
956 velecsum
= _mm_add_ps(velecsum
,velec
);
960 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
962 /* Calculate temporary vectorial force */
963 tx
= _mm_mul_ps(fscal
,dx32
);
964 ty
= _mm_mul_ps(fscal
,dy32
);
965 tz
= _mm_mul_ps(fscal
,dz32
);
967 /* Update vectorial force */
968 fix3
= _mm_add_ps(fix3
,tx
);
969 fiy3
= _mm_add_ps(fiy3
,ty
);
970 fiz3
= _mm_add_ps(fiz3
,tz
);
972 fjx2
= _mm_add_ps(fjx2
,tx
);
973 fjy2
= _mm_add_ps(fjy2
,ty
);
974 fjz2
= _mm_add_ps(fjz2
,tz
);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 /* COULOMB ELECTROSTATICS */
981 velec
= _mm_mul_ps(qq33
,rinv33
);
982 felec
= _mm_mul_ps(velec
,rinvsq33
);
984 /* Update potential sum for this i atom from the interaction with this j atom. */
985 velec
= _mm_andnot_ps(dummy_mask
,velec
);
986 velecsum
= _mm_add_ps(velecsum
,velec
);
990 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
992 /* Calculate temporary vectorial force */
993 tx
= _mm_mul_ps(fscal
,dx33
);
994 ty
= _mm_mul_ps(fscal
,dy33
);
995 tz
= _mm_mul_ps(fscal
,dz33
);
997 /* Update vectorial force */
998 fix3
= _mm_add_ps(fix3
,tx
);
999 fiy3
= _mm_add_ps(fiy3
,ty
);
1000 fiz3
= _mm_add_ps(fiz3
,tz
);
1002 fjx3
= _mm_add_ps(fjx3
,tx
);
1003 fjy3
= _mm_add_ps(fjy3
,ty
);
1004 fjz3
= _mm_add_ps(fjz3
,tz
);
/* Real entries update f at their coordinate offset; dummy (negative) entries
 * are pointed at scratch so that f is not touched for them. */
1006 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1007 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1008 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1009 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1011 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1012 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1013 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1015 /* Inner loop uses 287 flops */
1018 /* End of innermost loop */
/* Hand the accumulated forces of the four i-sites to the i-force update helper,
 * together with the destinations f+i_coord_offset and fshift+i_shift_offset. */
1020 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1021 f
+i_coord_offset
,fshift
+i_shift_offset
);
1024 /* Update potential energies */
/* NOTE(review): ggid is not assigned in the visible lines of this function. */
1025 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1026 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1028 /* Increment number of inner iterations */
1029 inneriter
+= j_index_end
- j_index_start
;
1031 /* Outer loop uses 26 flops */
1034 /* Increment number of outer iterations */
/* NOTE(review): the statement that updates outeriter is not visible in this view. */
1037 /* Update outer/inner flops */
/* Reports outeriter*26 + inneriter*287 under eNR_NBKERNEL_ELEC_VDW_W4W4_VF,
 * matching the per-loop flop counts quoted in the comments above. */
1039 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*287);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
 * Electrostatics interaction: Coulomb
 * VdW interaction:            LennardJones
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        Force
 */
1049 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_single
1050 (t_nblist
* gmx_restrict nlist
,
1051 rvec
* gmx_restrict xx
,
1052 rvec
* gmx_restrict ff
,
1053 t_forcerec
* gmx_restrict fr
,
1054 t_mdatoms
* gmx_restrict mdatoms
,
1055 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1056 t_nrnb
* gmx_restrict nrnb
)
/* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 * just 0 for non-waters.
 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
 * jnr indices corresponding to data put in the four positions in the SIMD register.
 */
1063 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1064 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1065 int jnrA
,jnrB
,jnrC
,jnrD
;
1066 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
1067 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
1068 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1069 real rcutoff_scalar
;
1070 real
*shiftvec
,*fshift
,*x
,*f
;
1071 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1072 real scratch
[4*DIM
];
1073 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1075 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1077 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1079 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1081 __m128 ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1082 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1083 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1084 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1085 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1086 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1087 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1088 int vdwjidx3A
,vdwjidx3B
,vdwjidx3C
,vdwjidx3D
;
1089 __m128 jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1090 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1091 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1092 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1093 __m128 dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1094 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1095 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1096 __m128 dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1097 __m128 dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1098 __m128 dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1099 __m128 dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1100 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1103 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1106 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1107 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1108 __m128 dummy_mask
,cutoff_mask
;
1109 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1110 __m128 one
= _mm_set1_ps(1.0);
1111 __m128 two
= _mm_set1_ps(2.0);
1117 jindex
= nlist
->jindex
;
1119 shiftidx
= nlist
->shift
;
1121 shiftvec
= fr
->shift_vec
[0];
1122 fshift
= fr
->fshift
[0];
1123 facel
= _mm_set1_ps(fr
->epsfac
);
1124 charge
= mdatoms
->chargeA
;
1125 nvdwtype
= fr
->ntype
;
1126 vdwparam
= fr
->nbfp
;
1127 vdwtype
= mdatoms
->typeA
;
1129 /* Setup water-specific parameters */
1130 inr
= nlist
->iinr
[0];
1131 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1132 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1133 iq3
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+3]));
1134 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1136 jq1
= _mm_set1_ps(charge
[inr
+1]);
1137 jq2
= _mm_set1_ps(charge
[inr
+2]);
1138 jq3
= _mm_set1_ps(charge
[inr
+3]);
1139 vdwjidx0A
= 2*vdwtype
[inr
+0];
1140 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1141 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1142 qq11
= _mm_mul_ps(iq1
,jq1
);
1143 qq12
= _mm_mul_ps(iq1
,jq2
);
1144 qq13
= _mm_mul_ps(iq1
,jq3
);
1145 qq21
= _mm_mul_ps(iq2
,jq1
);
1146 qq22
= _mm_mul_ps(iq2
,jq2
);
1147 qq23
= _mm_mul_ps(iq2
,jq3
);
1148 qq31
= _mm_mul_ps(iq3
,jq1
);
1149 qq32
= _mm_mul_ps(iq3
,jq2
);
1150 qq33
= _mm_mul_ps(iq3
,jq3
);
1152 /* Avoid stupid compiler warnings */
1153 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1154 j_coord_offsetA
= 0;
1155 j_coord_offsetB
= 0;
1156 j_coord_offsetC
= 0;
1157 j_coord_offsetD
= 0;
1162 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1164 scratch
[iidx
] = 0.0;
1167 /* Start outer loop over neighborlists */
1168 for(iidx
=0; iidx
<nri
; iidx
++)
1170 /* Load shift vector for this list */
1171 i_shift_offset
= DIM
*shiftidx
[iidx
];
1173 /* Load limits for loop over neighbors */
1174 j_index_start
= jindex
[iidx
];
1175 j_index_end
= jindex
[iidx
+1];
1177 /* Get outer coordinate index */
1179 i_coord_offset
= DIM
*inr
;
1181 /* Load i particle coords and add shift vector */
1182 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1183 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1185 fix0
= _mm_setzero_ps();
1186 fiy0
= _mm_setzero_ps();
1187 fiz0
= _mm_setzero_ps();
1188 fix1
= _mm_setzero_ps();
1189 fiy1
= _mm_setzero_ps();
1190 fiz1
= _mm_setzero_ps();
1191 fix2
= _mm_setzero_ps();
1192 fiy2
= _mm_setzero_ps();
1193 fiz2
= _mm_setzero_ps();
1194 fix3
= _mm_setzero_ps();
1195 fiy3
= _mm_setzero_ps();
1196 fiz3
= _mm_setzero_ps();
1198 /* Start inner kernel loop */
1199 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1202 /* Get j neighbor index, and coordinate index */
1204 jnrB
= jjnr
[jidx
+1];
1205 jnrC
= jjnr
[jidx
+2];
1206 jnrD
= jjnr
[jidx
+3];
1207 j_coord_offsetA
= DIM
*jnrA
;
1208 j_coord_offsetB
= DIM
*jnrB
;
1209 j_coord_offsetC
= DIM
*jnrC
;
1210 j_coord_offsetD
= DIM
*jnrD
;
1212 /* load j atom coordinates */
1213 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1214 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1215 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1216 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1218 /* Calculate displacement vector */
1219 dx00
= _mm_sub_ps(ix0
,jx0
);
1220 dy00
= _mm_sub_ps(iy0
,jy0
);
1221 dz00
= _mm_sub_ps(iz0
,jz0
);
1222 dx11
= _mm_sub_ps(ix1
,jx1
);
1223 dy11
= _mm_sub_ps(iy1
,jy1
);
1224 dz11
= _mm_sub_ps(iz1
,jz1
);
1225 dx12
= _mm_sub_ps(ix1
,jx2
);
1226 dy12
= _mm_sub_ps(iy1
,jy2
);
1227 dz12
= _mm_sub_ps(iz1
,jz2
);
1228 dx13
= _mm_sub_ps(ix1
,jx3
);
1229 dy13
= _mm_sub_ps(iy1
,jy3
);
1230 dz13
= _mm_sub_ps(iz1
,jz3
);
1231 dx21
= _mm_sub_ps(ix2
,jx1
);
1232 dy21
= _mm_sub_ps(iy2
,jy1
);
1233 dz21
= _mm_sub_ps(iz2
,jz1
);
1234 dx22
= _mm_sub_ps(ix2
,jx2
);
1235 dy22
= _mm_sub_ps(iy2
,jy2
);
1236 dz22
= _mm_sub_ps(iz2
,jz2
);
1237 dx23
= _mm_sub_ps(ix2
,jx3
);
1238 dy23
= _mm_sub_ps(iy2
,jy3
);
1239 dz23
= _mm_sub_ps(iz2
,jz3
);
1240 dx31
= _mm_sub_ps(ix3
,jx1
);
1241 dy31
= _mm_sub_ps(iy3
,jy1
);
1242 dz31
= _mm_sub_ps(iz3
,jz1
);
1243 dx32
= _mm_sub_ps(ix3
,jx2
);
1244 dy32
= _mm_sub_ps(iy3
,jy2
);
1245 dz32
= _mm_sub_ps(iz3
,jz2
);
1246 dx33
= _mm_sub_ps(ix3
,jx3
);
1247 dy33
= _mm_sub_ps(iy3
,jy3
);
1248 dz33
= _mm_sub_ps(iz3
,jz3
);
1250 /* Calculate squared distance and things based on it */
1251 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1252 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1253 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1254 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1255 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1256 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1257 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1258 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1259 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1260 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1262 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1263 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1264 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1265 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1266 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1267 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1268 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1269 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1270 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1272 rinvsq00
= gmx_mm_inv_ps(rsq00
);
1273 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1274 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1275 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1276 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1277 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1278 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1279 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1280 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1281 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1283 fjx0
= _mm_setzero_ps();
1284 fjy0
= _mm_setzero_ps();
1285 fjz0
= _mm_setzero_ps();
1286 fjx1
= _mm_setzero_ps();
1287 fjy1
= _mm_setzero_ps();
1288 fjz1
= _mm_setzero_ps();
1289 fjx2
= _mm_setzero_ps();
1290 fjy2
= _mm_setzero_ps();
1291 fjz2
= _mm_setzero_ps();
1292 fjx3
= _mm_setzero_ps();
1293 fjy3
= _mm_setzero_ps();
1294 fjz3
= _mm_setzero_ps();
1296 /**************************
1297 * CALCULATE INTERACTIONS *
1298 **************************/
1300 /* LENNARD-JONES DISPERSION/REPULSION */
1302 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1303 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1307 /* Calculate temporary vectorial force */
1308 tx
= _mm_mul_ps(fscal
,dx00
);
1309 ty
= _mm_mul_ps(fscal
,dy00
);
1310 tz
= _mm_mul_ps(fscal
,dz00
);
1312 /* Update vectorial force */
1313 fix0
= _mm_add_ps(fix0
,tx
);
1314 fiy0
= _mm_add_ps(fiy0
,ty
);
1315 fiz0
= _mm_add_ps(fiz0
,tz
);
1317 fjx0
= _mm_add_ps(fjx0
,tx
);
1318 fjy0
= _mm_add_ps(fjy0
,ty
);
1319 fjz0
= _mm_add_ps(fjz0
,tz
);
1321 /**************************
1322 * CALCULATE INTERACTIONS *
1323 **************************/
1325 /* COULOMB ELECTROSTATICS */
1326 velec
= _mm_mul_ps(qq11
,rinv11
);
1327 felec
= _mm_mul_ps(velec
,rinvsq11
);
1331 /* Calculate temporary vectorial force */
1332 tx
= _mm_mul_ps(fscal
,dx11
);
1333 ty
= _mm_mul_ps(fscal
,dy11
);
1334 tz
= _mm_mul_ps(fscal
,dz11
);
1336 /* Update vectorial force */
1337 fix1
= _mm_add_ps(fix1
,tx
);
1338 fiy1
= _mm_add_ps(fiy1
,ty
);
1339 fiz1
= _mm_add_ps(fiz1
,tz
);
1341 fjx1
= _mm_add_ps(fjx1
,tx
);
1342 fjy1
= _mm_add_ps(fjy1
,ty
);
1343 fjz1
= _mm_add_ps(fjz1
,tz
);
1345 /**************************
1346 * CALCULATE INTERACTIONS *
1347 **************************/
1349 /* COULOMB ELECTROSTATICS */
1350 velec
= _mm_mul_ps(qq12
,rinv12
);
1351 felec
= _mm_mul_ps(velec
,rinvsq12
);
1355 /* Calculate temporary vectorial force */
1356 tx
= _mm_mul_ps(fscal
,dx12
);
1357 ty
= _mm_mul_ps(fscal
,dy12
);
1358 tz
= _mm_mul_ps(fscal
,dz12
);
1360 /* Update vectorial force */
1361 fix1
= _mm_add_ps(fix1
,tx
);
1362 fiy1
= _mm_add_ps(fiy1
,ty
);
1363 fiz1
= _mm_add_ps(fiz1
,tz
);
1365 fjx2
= _mm_add_ps(fjx2
,tx
);
1366 fjy2
= _mm_add_ps(fjy2
,ty
);
1367 fjz2
= _mm_add_ps(fjz2
,tz
);
1369 /**************************
1370 * CALCULATE INTERACTIONS *
1371 **************************/
1373 /* COULOMB ELECTROSTATICS */
1374 velec
= _mm_mul_ps(qq13
,rinv13
);
1375 felec
= _mm_mul_ps(velec
,rinvsq13
);
1379 /* Calculate temporary vectorial force */
1380 tx
= _mm_mul_ps(fscal
,dx13
);
1381 ty
= _mm_mul_ps(fscal
,dy13
);
1382 tz
= _mm_mul_ps(fscal
,dz13
);
1384 /* Update vectorial force */
1385 fix1
= _mm_add_ps(fix1
,tx
);
1386 fiy1
= _mm_add_ps(fiy1
,ty
);
1387 fiz1
= _mm_add_ps(fiz1
,tz
);
1389 fjx3
= _mm_add_ps(fjx3
,tx
);
1390 fjy3
= _mm_add_ps(fjy3
,ty
);
1391 fjz3
= _mm_add_ps(fjz3
,tz
);
1393 /**************************
1394 * CALCULATE INTERACTIONS *
1395 **************************/
1397 /* COULOMB ELECTROSTATICS */
1398 velec
= _mm_mul_ps(qq21
,rinv21
);
1399 felec
= _mm_mul_ps(velec
,rinvsq21
);
1403 /* Calculate temporary vectorial force */
1404 tx
= _mm_mul_ps(fscal
,dx21
);
1405 ty
= _mm_mul_ps(fscal
,dy21
);
1406 tz
= _mm_mul_ps(fscal
,dz21
);
1408 /* Update vectorial force */
1409 fix2
= _mm_add_ps(fix2
,tx
);
1410 fiy2
= _mm_add_ps(fiy2
,ty
);
1411 fiz2
= _mm_add_ps(fiz2
,tz
);
1413 fjx1
= _mm_add_ps(fjx1
,tx
);
1414 fjy1
= _mm_add_ps(fjy1
,ty
);
1415 fjz1
= _mm_add_ps(fjz1
,tz
);
1417 /**************************
1418 * CALCULATE INTERACTIONS *
1419 **************************/
1421 /* COULOMB ELECTROSTATICS */
1422 velec
= _mm_mul_ps(qq22
,rinv22
);
1423 felec
= _mm_mul_ps(velec
,rinvsq22
);
1427 /* Calculate temporary vectorial force */
1428 tx
= _mm_mul_ps(fscal
,dx22
);
1429 ty
= _mm_mul_ps(fscal
,dy22
);
1430 tz
= _mm_mul_ps(fscal
,dz22
);
1432 /* Update vectorial force */
1433 fix2
= _mm_add_ps(fix2
,tx
);
1434 fiy2
= _mm_add_ps(fiy2
,ty
);
1435 fiz2
= _mm_add_ps(fiz2
,tz
);
1437 fjx2
= _mm_add_ps(fjx2
,tx
);
1438 fjy2
= _mm_add_ps(fjy2
,ty
);
1439 fjz2
= _mm_add_ps(fjz2
,tz
);
1441 /**************************
1442 * CALCULATE INTERACTIONS *
1443 **************************/
1445 /* COULOMB ELECTROSTATICS */
1446 velec
= _mm_mul_ps(qq23
,rinv23
);
1447 felec
= _mm_mul_ps(velec
,rinvsq23
);
1451 /* Calculate temporary vectorial force */
1452 tx
= _mm_mul_ps(fscal
,dx23
);
1453 ty
= _mm_mul_ps(fscal
,dy23
);
1454 tz
= _mm_mul_ps(fscal
,dz23
);
1456 /* Update vectorial force */
1457 fix2
= _mm_add_ps(fix2
,tx
);
1458 fiy2
= _mm_add_ps(fiy2
,ty
);
1459 fiz2
= _mm_add_ps(fiz2
,tz
);
1461 fjx3
= _mm_add_ps(fjx3
,tx
);
1462 fjy3
= _mm_add_ps(fjy3
,ty
);
1463 fjz3
= _mm_add_ps(fjz3
,tz
);
1465 /**************************
1466 * CALCULATE INTERACTIONS *
1467 **************************/
1469 /* COULOMB ELECTROSTATICS */
1470 velec
= _mm_mul_ps(qq31
,rinv31
);
1471 felec
= _mm_mul_ps(velec
,rinvsq31
);
1475 /* Calculate temporary vectorial force */
1476 tx
= _mm_mul_ps(fscal
,dx31
);
1477 ty
= _mm_mul_ps(fscal
,dy31
);
1478 tz
= _mm_mul_ps(fscal
,dz31
);
1480 /* Update vectorial force */
1481 fix3
= _mm_add_ps(fix3
,tx
);
1482 fiy3
= _mm_add_ps(fiy3
,ty
);
1483 fiz3
= _mm_add_ps(fiz3
,tz
);
1485 fjx1
= _mm_add_ps(fjx1
,tx
);
1486 fjy1
= _mm_add_ps(fjy1
,ty
);
1487 fjz1
= _mm_add_ps(fjz1
,tz
);
1489 /**************************
1490 * CALCULATE INTERACTIONS *
1491 **************************/
1493 /* COULOMB ELECTROSTATICS */
1494 velec
= _mm_mul_ps(qq32
,rinv32
);
1495 felec
= _mm_mul_ps(velec
,rinvsq32
);
1499 /* Calculate temporary vectorial force */
1500 tx
= _mm_mul_ps(fscal
,dx32
);
1501 ty
= _mm_mul_ps(fscal
,dy32
);
1502 tz
= _mm_mul_ps(fscal
,dz32
);
1504 /* Update vectorial force */
1505 fix3
= _mm_add_ps(fix3
,tx
);
1506 fiy3
= _mm_add_ps(fiy3
,ty
);
1507 fiz3
= _mm_add_ps(fiz3
,tz
);
1509 fjx2
= _mm_add_ps(fjx2
,tx
);
1510 fjy2
= _mm_add_ps(fjy2
,ty
);
1511 fjz2
= _mm_add_ps(fjz2
,tz
);
1513 /**************************
1514 * CALCULATE INTERACTIONS *
1515 **************************/
1517 /* COULOMB ELECTROSTATICS */
1518 velec
= _mm_mul_ps(qq33
,rinv33
);
1519 felec
= _mm_mul_ps(velec
,rinvsq33
);
1523 /* Calculate temporary vectorial force */
1524 tx
= _mm_mul_ps(fscal
,dx33
);
1525 ty
= _mm_mul_ps(fscal
,dy33
);
1526 tz
= _mm_mul_ps(fscal
,dz33
);
1528 /* Update vectorial force */
1529 fix3
= _mm_add_ps(fix3
,tx
);
1530 fiy3
= _mm_add_ps(fiy3
,ty
);
1531 fiz3
= _mm_add_ps(fiz3
,tz
);
1533 fjx3
= _mm_add_ps(fjx3
,tx
);
1534 fjy3
= _mm_add_ps(fjy3
,ty
);
1535 fjz3
= _mm_add_ps(fjz3
,tz
);
1537 fjptrA
= f
+j_coord_offsetA
;
1538 fjptrB
= f
+j_coord_offsetB
;
1539 fjptrC
= f
+j_coord_offsetC
;
1540 fjptrD
= f
+j_coord_offsetD
;
1542 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1543 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1544 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1546 /* Inner loop uses 273 flops */
1549 if(jidx
<j_index_end
)
1552 /* Get j neighbor index, and coordinate index */
1553 jnrlistA
= jjnr
[jidx
];
1554 jnrlistB
= jjnr
[jidx
+1];
1555 jnrlistC
= jjnr
[jidx
+2];
1556 jnrlistD
= jjnr
[jidx
+3];
/* Sign of each element will be negative for non-real atoms.
 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 */
1561 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1562 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1563 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1564 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1565 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1566 j_coord_offsetA
= DIM
*jnrA
;
1567 j_coord_offsetB
= DIM
*jnrB
;
1568 j_coord_offsetC
= DIM
*jnrC
;
1569 j_coord_offsetD
= DIM
*jnrD
;
1571 /* load j atom coordinates */
1572 gmx_mm_load_4rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1573 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1574 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1575 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1577 /* Calculate displacement vector */
1578 dx00
= _mm_sub_ps(ix0
,jx0
);
1579 dy00
= _mm_sub_ps(iy0
,jy0
);
1580 dz00
= _mm_sub_ps(iz0
,jz0
);
1581 dx11
= _mm_sub_ps(ix1
,jx1
);
1582 dy11
= _mm_sub_ps(iy1
,jy1
);
1583 dz11
= _mm_sub_ps(iz1
,jz1
);
1584 dx12
= _mm_sub_ps(ix1
,jx2
);
1585 dy12
= _mm_sub_ps(iy1
,jy2
);
1586 dz12
= _mm_sub_ps(iz1
,jz2
);
1587 dx13
= _mm_sub_ps(ix1
,jx3
);
1588 dy13
= _mm_sub_ps(iy1
,jy3
);
1589 dz13
= _mm_sub_ps(iz1
,jz3
);
1590 dx21
= _mm_sub_ps(ix2
,jx1
);
1591 dy21
= _mm_sub_ps(iy2
,jy1
);
1592 dz21
= _mm_sub_ps(iz2
,jz1
);
1593 dx22
= _mm_sub_ps(ix2
,jx2
);
1594 dy22
= _mm_sub_ps(iy2
,jy2
);
1595 dz22
= _mm_sub_ps(iz2
,jz2
);
1596 dx23
= _mm_sub_ps(ix2
,jx3
);
1597 dy23
= _mm_sub_ps(iy2
,jy3
);
1598 dz23
= _mm_sub_ps(iz2
,jz3
);
1599 dx31
= _mm_sub_ps(ix3
,jx1
);
1600 dy31
= _mm_sub_ps(iy3
,jy1
);
1601 dz31
= _mm_sub_ps(iz3
,jz1
);
1602 dx32
= _mm_sub_ps(ix3
,jx2
);
1603 dy32
= _mm_sub_ps(iy3
,jy2
);
1604 dz32
= _mm_sub_ps(iz3
,jz2
);
1605 dx33
= _mm_sub_ps(ix3
,jx3
);
1606 dy33
= _mm_sub_ps(iy3
,jy3
);
1607 dz33
= _mm_sub_ps(iz3
,jz3
);
1609 /* Calculate squared distance and things based on it */
1610 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1611 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1612 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1613 rsq13
= gmx_mm_calc_rsq_ps(dx13
,dy13
,dz13
);
1614 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1615 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1616 rsq23
= gmx_mm_calc_rsq_ps(dx23
,dy23
,dz23
);
1617 rsq31
= gmx_mm_calc_rsq_ps(dx31
,dy31
,dz31
);
1618 rsq32
= gmx_mm_calc_rsq_ps(dx32
,dy32
,dz32
);
1619 rsq33
= gmx_mm_calc_rsq_ps(dx33
,dy33
,dz33
);
1621 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1622 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1623 rinv13
= gmx_mm_invsqrt_ps(rsq13
);
1624 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1625 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1626 rinv23
= gmx_mm_invsqrt_ps(rsq23
);
1627 rinv31
= gmx_mm_invsqrt_ps(rsq31
);
1628 rinv32
= gmx_mm_invsqrt_ps(rsq32
);
1629 rinv33
= gmx_mm_invsqrt_ps(rsq33
);
1631 rinvsq00
= gmx_mm_inv_ps(rsq00
);
1632 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1633 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1634 rinvsq13
= _mm_mul_ps(rinv13
,rinv13
);
1635 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1636 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1637 rinvsq23
= _mm_mul_ps(rinv23
,rinv23
);
1638 rinvsq31
= _mm_mul_ps(rinv31
,rinv31
);
1639 rinvsq32
= _mm_mul_ps(rinv32
,rinv32
);
1640 rinvsq33
= _mm_mul_ps(rinv33
,rinv33
);
1642 fjx0
= _mm_setzero_ps();
1643 fjy0
= _mm_setzero_ps();
1644 fjz0
= _mm_setzero_ps();
1645 fjx1
= _mm_setzero_ps();
1646 fjy1
= _mm_setzero_ps();
1647 fjz1
= _mm_setzero_ps();
1648 fjx2
= _mm_setzero_ps();
1649 fjy2
= _mm_setzero_ps();
1650 fjz2
= _mm_setzero_ps();
1651 fjx3
= _mm_setzero_ps();
1652 fjy3
= _mm_setzero_ps();
1653 fjz3
= _mm_setzero_ps();
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 /* LENNARD-JONES DISPERSION/REPULSION */
1661 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1662 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1666 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1668 /* Calculate temporary vectorial force */
1669 tx
= _mm_mul_ps(fscal
,dx00
);
1670 ty
= _mm_mul_ps(fscal
,dy00
);
1671 tz
= _mm_mul_ps(fscal
,dz00
);
1673 /* Update vectorial force */
1674 fix0
= _mm_add_ps(fix0
,tx
);
1675 fiy0
= _mm_add_ps(fiy0
,ty
);
1676 fiz0
= _mm_add_ps(fiz0
,tz
);
1678 fjx0
= _mm_add_ps(fjx0
,tx
);
1679 fjy0
= _mm_add_ps(fjy0
,ty
);
1680 fjz0
= _mm_add_ps(fjz0
,tz
);
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 /* COULOMB ELECTROSTATICS */
1687 velec
= _mm_mul_ps(qq11
,rinv11
);
1688 felec
= _mm_mul_ps(velec
,rinvsq11
);
1692 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1694 /* Calculate temporary vectorial force */
1695 tx
= _mm_mul_ps(fscal
,dx11
);
1696 ty
= _mm_mul_ps(fscal
,dy11
);
1697 tz
= _mm_mul_ps(fscal
,dz11
);
1699 /* Update vectorial force */
1700 fix1
= _mm_add_ps(fix1
,tx
);
1701 fiy1
= _mm_add_ps(fiy1
,ty
);
1702 fiz1
= _mm_add_ps(fiz1
,tz
);
1704 fjx1
= _mm_add_ps(fjx1
,tx
);
1705 fjy1
= _mm_add_ps(fjy1
,ty
);
1706 fjz1
= _mm_add_ps(fjz1
,tz
);
1708 /**************************
1709 * CALCULATE INTERACTIONS *
1710 **************************/
1712 /* COULOMB ELECTROSTATICS */
1713 velec
= _mm_mul_ps(qq12
,rinv12
);
1714 felec
= _mm_mul_ps(velec
,rinvsq12
);
1718 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1720 /* Calculate temporary vectorial force */
1721 tx
= _mm_mul_ps(fscal
,dx12
);
1722 ty
= _mm_mul_ps(fscal
,dy12
);
1723 tz
= _mm_mul_ps(fscal
,dz12
);
1725 /* Update vectorial force */
1726 fix1
= _mm_add_ps(fix1
,tx
);
1727 fiy1
= _mm_add_ps(fiy1
,ty
);
1728 fiz1
= _mm_add_ps(fiz1
,tz
);
1730 fjx2
= _mm_add_ps(fjx2
,tx
);
1731 fjy2
= _mm_add_ps(fjy2
,ty
);
1732 fjz2
= _mm_add_ps(fjz2
,tz
);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 /* COULOMB ELECTROSTATICS */
1739 velec
= _mm_mul_ps(qq13
,rinv13
);
1740 felec
= _mm_mul_ps(velec
,rinvsq13
);
1744 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1746 /* Calculate temporary vectorial force */
1747 tx
= _mm_mul_ps(fscal
,dx13
);
1748 ty
= _mm_mul_ps(fscal
,dy13
);
1749 tz
= _mm_mul_ps(fscal
,dz13
);
1751 /* Update vectorial force */
1752 fix1
= _mm_add_ps(fix1
,tx
);
1753 fiy1
= _mm_add_ps(fiy1
,ty
);
1754 fiz1
= _mm_add_ps(fiz1
,tz
);
1756 fjx3
= _mm_add_ps(fjx3
,tx
);
1757 fjy3
= _mm_add_ps(fjy3
,ty
);
1758 fjz3
= _mm_add_ps(fjz3
,tz
);
1760 /**************************
1761 * CALCULATE INTERACTIONS *
1762 **************************/
1764 /* COULOMB ELECTROSTATICS */
1765 velec
= _mm_mul_ps(qq21
,rinv21
);
1766 felec
= _mm_mul_ps(velec
,rinvsq21
);
1770 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1772 /* Calculate temporary vectorial force */
1773 tx
= _mm_mul_ps(fscal
,dx21
);
1774 ty
= _mm_mul_ps(fscal
,dy21
);
1775 tz
= _mm_mul_ps(fscal
,dz21
);
1777 /* Update vectorial force */
1778 fix2
= _mm_add_ps(fix2
,tx
);
1779 fiy2
= _mm_add_ps(fiy2
,ty
);
1780 fiz2
= _mm_add_ps(fiz2
,tz
);
1782 fjx1
= _mm_add_ps(fjx1
,tx
);
1783 fjy1
= _mm_add_ps(fjy1
,ty
);
1784 fjz1
= _mm_add_ps(fjz1
,tz
);
1786 /**************************
1787 * CALCULATE INTERACTIONS *
1788 **************************/
1790 /* COULOMB ELECTROSTATICS */
1791 velec
= _mm_mul_ps(qq22
,rinv22
);
1792 felec
= _mm_mul_ps(velec
,rinvsq22
);
1796 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1798 /* Calculate temporary vectorial force */
1799 tx
= _mm_mul_ps(fscal
,dx22
);
1800 ty
= _mm_mul_ps(fscal
,dy22
);
1801 tz
= _mm_mul_ps(fscal
,dz22
);
1803 /* Update vectorial force */
1804 fix2
= _mm_add_ps(fix2
,tx
);
1805 fiy2
= _mm_add_ps(fiy2
,ty
);
1806 fiz2
= _mm_add_ps(fiz2
,tz
);
1808 fjx2
= _mm_add_ps(fjx2
,tx
);
1809 fjy2
= _mm_add_ps(fjy2
,ty
);
1810 fjz2
= _mm_add_ps(fjz2
,tz
);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 /* COULOMB ELECTROSTATICS */
1817 velec
= _mm_mul_ps(qq23
,rinv23
);
1818 felec
= _mm_mul_ps(velec
,rinvsq23
);
1822 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1824 /* Calculate temporary vectorial force */
1825 tx
= _mm_mul_ps(fscal
,dx23
);
1826 ty
= _mm_mul_ps(fscal
,dy23
);
1827 tz
= _mm_mul_ps(fscal
,dz23
);
1829 /* Update vectorial force */
1830 fix2
= _mm_add_ps(fix2
,tx
);
1831 fiy2
= _mm_add_ps(fiy2
,ty
);
1832 fiz2
= _mm_add_ps(fiz2
,tz
);
1834 fjx3
= _mm_add_ps(fjx3
,tx
);
1835 fjy3
= _mm_add_ps(fjy3
,ty
);
1836 fjz3
= _mm_add_ps(fjz3
,tz
);
1838 /**************************
1839 * CALCULATE INTERACTIONS *
1840 **************************/
1842 /* COULOMB ELECTROSTATICS */
1843 velec
= _mm_mul_ps(qq31
,rinv31
);
1844 felec
= _mm_mul_ps(velec
,rinvsq31
);
1848 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1850 /* Calculate temporary vectorial force */
1851 tx
= _mm_mul_ps(fscal
,dx31
);
1852 ty
= _mm_mul_ps(fscal
,dy31
);
1853 tz
= _mm_mul_ps(fscal
,dz31
);
1855 /* Update vectorial force */
1856 fix3
= _mm_add_ps(fix3
,tx
);
1857 fiy3
= _mm_add_ps(fiy3
,ty
);
1858 fiz3
= _mm_add_ps(fiz3
,tz
);
1860 fjx1
= _mm_add_ps(fjx1
,tx
);
1861 fjy1
= _mm_add_ps(fjy1
,ty
);
1862 fjz1
= _mm_add_ps(fjz1
,tz
);
1864 /**************************
1865 * CALCULATE INTERACTIONS *
1866 **************************/
1868 /* COULOMB ELECTROSTATICS */
1869 velec
= _mm_mul_ps(qq32
,rinv32
);
1870 felec
= _mm_mul_ps(velec
,rinvsq32
);
1874 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1876 /* Calculate temporary vectorial force */
1877 tx
= _mm_mul_ps(fscal
,dx32
);
1878 ty
= _mm_mul_ps(fscal
,dy32
);
1879 tz
= _mm_mul_ps(fscal
,dz32
);
1881 /* Update vectorial force */
1882 fix3
= _mm_add_ps(fix3
,tx
);
1883 fiy3
= _mm_add_ps(fiy3
,ty
);
1884 fiz3
= _mm_add_ps(fiz3
,tz
);
1886 fjx2
= _mm_add_ps(fjx2
,tx
);
1887 fjy2
= _mm_add_ps(fjy2
,ty
);
1888 fjz2
= _mm_add_ps(fjz2
,tz
);
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1894 /* COULOMB ELECTROSTATICS */
1895 velec
= _mm_mul_ps(qq33
,rinv33
);
1896 felec
= _mm_mul_ps(velec
,rinvsq33
);
1900 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1902 /* Calculate temporary vectorial force */
1903 tx
= _mm_mul_ps(fscal
,dx33
);
1904 ty
= _mm_mul_ps(fscal
,dy33
);
1905 tz
= _mm_mul_ps(fscal
,dz33
);
1907 /* Update vectorial force */
1908 fix3
= _mm_add_ps(fix3
,tx
);
1909 fiy3
= _mm_add_ps(fiy3
,ty
);
1910 fiz3
= _mm_add_ps(fiz3
,tz
);
1912 fjx3
= _mm_add_ps(fjx3
,tx
);
1913 fjy3
= _mm_add_ps(fjy3
,ty
);
1914 fjz3
= _mm_add_ps(fjz3
,tz
);
1916 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1917 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1918 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1919 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1921 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1922 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,
1923 fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1925 /* Inner loop uses 273 flops */
1928 /* End of innermost loop */
1930 gmx_mm_update_iforce_4atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1931 f
+i_coord_offset
,fshift
+i_shift_offset
);
1933 /* Increment number of inner iterations */
1934 inneriter
+= j_index_end
- j_index_start
;
1936 /* Outer loop uses 24 flops */
1939 /* Increment number of outer iterations */
1942 /* Update outer/inner flops */
1944 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*273);