/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS sse2_single kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_single.h"
49 #include "kernelutil_x86_sse2_single.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single
53 * Electrostatics interaction: Coulomb
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_sse2_single
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int jnrA
,jnrB
,jnrC
,jnrD
;
76 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
77 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
78 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
80 real
*shiftvec
,*fshift
,*x
,*f
;
81 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
83 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
85 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
87 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
89 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
90 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
91 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
92 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
93 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
94 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
95 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
96 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
97 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
98 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
99 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
100 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
101 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
102 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
103 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
104 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
105 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
108 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
111 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
112 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
113 __m128 dummy_mask
,cutoff_mask
;
114 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
115 __m128 one
= _mm_set1_ps(1.0);
116 __m128 two
= _mm_set1_ps(2.0);
122 jindex
= nlist
->jindex
;
124 shiftidx
= nlist
->shift
;
126 shiftvec
= fr
->shift_vec
[0];
127 fshift
= fr
->fshift
[0];
128 facel
= _mm_set1_ps(fr
->epsfac
);
129 charge
= mdatoms
->chargeA
;
130 nvdwtype
= fr
->ntype
;
132 vdwtype
= mdatoms
->typeA
;
134 /* Setup water-specific parameters */
135 inr
= nlist
->iinr
[0];
136 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
137 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
138 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
139 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
141 jq0
= _mm_set1_ps(charge
[inr
+0]);
142 jq1
= _mm_set1_ps(charge
[inr
+1]);
143 jq2
= _mm_set1_ps(charge
[inr
+2]);
144 vdwjidx0A
= 2*vdwtype
[inr
+0];
145 qq00
= _mm_mul_ps(iq0
,jq0
);
146 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
147 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
148 qq01
= _mm_mul_ps(iq0
,jq1
);
149 qq02
= _mm_mul_ps(iq0
,jq2
);
150 qq10
= _mm_mul_ps(iq1
,jq0
);
151 qq11
= _mm_mul_ps(iq1
,jq1
);
152 qq12
= _mm_mul_ps(iq1
,jq2
);
153 qq20
= _mm_mul_ps(iq2
,jq0
);
154 qq21
= _mm_mul_ps(iq2
,jq1
);
155 qq22
= _mm_mul_ps(iq2
,jq2
);
157 /* Avoid stupid compiler warnings */
158 jnrA
= jnrB
= jnrC
= jnrD
= 0;
167 for(iidx
=0;iidx
<4*DIM
;iidx
++)
172 /* Start outer loop over neighborlists */
173 for(iidx
=0; iidx
<nri
; iidx
++)
175 /* Load shift vector for this list */
176 i_shift_offset
= DIM
*shiftidx
[iidx
];
178 /* Load limits for loop over neighbors */
179 j_index_start
= jindex
[iidx
];
180 j_index_end
= jindex
[iidx
+1];
182 /* Get outer coordinate index */
184 i_coord_offset
= DIM
*inr
;
186 /* Load i particle coords and add shift vector */
187 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
188 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
190 fix0
= _mm_setzero_ps();
191 fiy0
= _mm_setzero_ps();
192 fiz0
= _mm_setzero_ps();
193 fix1
= _mm_setzero_ps();
194 fiy1
= _mm_setzero_ps();
195 fiz1
= _mm_setzero_ps();
196 fix2
= _mm_setzero_ps();
197 fiy2
= _mm_setzero_ps();
198 fiz2
= _mm_setzero_ps();
200 /* Reset potential sums */
201 velecsum
= _mm_setzero_ps();
202 vvdwsum
= _mm_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
208 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA
= DIM
*jnrA
;
214 j_coord_offsetB
= DIM
*jnrB
;
215 j_coord_offsetC
= DIM
*jnrC
;
216 j_coord_offsetD
= DIM
*jnrD
;
218 /* load j atom coordinates */
219 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
220 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
221 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
223 /* Calculate displacement vector */
224 dx00
= _mm_sub_ps(ix0
,jx0
);
225 dy00
= _mm_sub_ps(iy0
,jy0
);
226 dz00
= _mm_sub_ps(iz0
,jz0
);
227 dx01
= _mm_sub_ps(ix0
,jx1
);
228 dy01
= _mm_sub_ps(iy0
,jy1
);
229 dz01
= _mm_sub_ps(iz0
,jz1
);
230 dx02
= _mm_sub_ps(ix0
,jx2
);
231 dy02
= _mm_sub_ps(iy0
,jy2
);
232 dz02
= _mm_sub_ps(iz0
,jz2
);
233 dx10
= _mm_sub_ps(ix1
,jx0
);
234 dy10
= _mm_sub_ps(iy1
,jy0
);
235 dz10
= _mm_sub_ps(iz1
,jz0
);
236 dx11
= _mm_sub_ps(ix1
,jx1
);
237 dy11
= _mm_sub_ps(iy1
,jy1
);
238 dz11
= _mm_sub_ps(iz1
,jz1
);
239 dx12
= _mm_sub_ps(ix1
,jx2
);
240 dy12
= _mm_sub_ps(iy1
,jy2
);
241 dz12
= _mm_sub_ps(iz1
,jz2
);
242 dx20
= _mm_sub_ps(ix2
,jx0
);
243 dy20
= _mm_sub_ps(iy2
,jy0
);
244 dz20
= _mm_sub_ps(iz2
,jz0
);
245 dx21
= _mm_sub_ps(ix2
,jx1
);
246 dy21
= _mm_sub_ps(iy2
,jy1
);
247 dz21
= _mm_sub_ps(iz2
,jz1
);
248 dx22
= _mm_sub_ps(ix2
,jx2
);
249 dy22
= _mm_sub_ps(iy2
,jy2
);
250 dz22
= _mm_sub_ps(iz2
,jz2
);
252 /* Calculate squared distance and things based on it */
253 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
254 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
255 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
256 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
257 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
258 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
259 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
260 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
261 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
263 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
264 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
265 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
266 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
267 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
268 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
269 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
270 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
271 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
273 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
274 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
275 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
276 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
277 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
278 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
279 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
280 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
281 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
283 fjx0
= _mm_setzero_ps();
284 fjy0
= _mm_setzero_ps();
285 fjz0
= _mm_setzero_ps();
286 fjx1
= _mm_setzero_ps();
287 fjy1
= _mm_setzero_ps();
288 fjz1
= _mm_setzero_ps();
289 fjx2
= _mm_setzero_ps();
290 fjy2
= _mm_setzero_ps();
291 fjz2
= _mm_setzero_ps();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 /* COULOMB ELECTROSTATICS */
298 velec
= _mm_mul_ps(qq00
,rinv00
);
299 felec
= _mm_mul_ps(velec
,rinvsq00
);
301 /* LENNARD-JONES DISPERSION/REPULSION */
303 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
304 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
305 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
306 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
307 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
309 /* Update potential sum for this i atom from the interaction with this j atom. */
310 velecsum
= _mm_add_ps(velecsum
,velec
);
311 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
313 fscal
= _mm_add_ps(felec
,fvdw
);
315 /* Calculate temporary vectorial force */
316 tx
= _mm_mul_ps(fscal
,dx00
);
317 ty
= _mm_mul_ps(fscal
,dy00
);
318 tz
= _mm_mul_ps(fscal
,dz00
);
320 /* Update vectorial force */
321 fix0
= _mm_add_ps(fix0
,tx
);
322 fiy0
= _mm_add_ps(fiy0
,ty
);
323 fiz0
= _mm_add_ps(fiz0
,tz
);
325 fjx0
= _mm_add_ps(fjx0
,tx
);
326 fjy0
= _mm_add_ps(fjy0
,ty
);
327 fjz0
= _mm_add_ps(fjz0
,tz
);
329 /**************************
330 * CALCULATE INTERACTIONS *
331 **************************/
333 /* COULOMB ELECTROSTATICS */
334 velec
= _mm_mul_ps(qq01
,rinv01
);
335 felec
= _mm_mul_ps(velec
,rinvsq01
);
337 /* Update potential sum for this i atom from the interaction with this j atom. */
338 velecsum
= _mm_add_ps(velecsum
,velec
);
342 /* Calculate temporary vectorial force */
343 tx
= _mm_mul_ps(fscal
,dx01
);
344 ty
= _mm_mul_ps(fscal
,dy01
);
345 tz
= _mm_mul_ps(fscal
,dz01
);
347 /* Update vectorial force */
348 fix0
= _mm_add_ps(fix0
,tx
);
349 fiy0
= _mm_add_ps(fiy0
,ty
);
350 fiz0
= _mm_add_ps(fiz0
,tz
);
352 fjx1
= _mm_add_ps(fjx1
,tx
);
353 fjy1
= _mm_add_ps(fjy1
,ty
);
354 fjz1
= _mm_add_ps(fjz1
,tz
);
356 /**************************
357 * CALCULATE INTERACTIONS *
358 **************************/
360 /* COULOMB ELECTROSTATICS */
361 velec
= _mm_mul_ps(qq02
,rinv02
);
362 felec
= _mm_mul_ps(velec
,rinvsq02
);
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum
= _mm_add_ps(velecsum
,velec
);
369 /* Calculate temporary vectorial force */
370 tx
= _mm_mul_ps(fscal
,dx02
);
371 ty
= _mm_mul_ps(fscal
,dy02
);
372 tz
= _mm_mul_ps(fscal
,dz02
);
374 /* Update vectorial force */
375 fix0
= _mm_add_ps(fix0
,tx
);
376 fiy0
= _mm_add_ps(fiy0
,ty
);
377 fiz0
= _mm_add_ps(fiz0
,tz
);
379 fjx2
= _mm_add_ps(fjx2
,tx
);
380 fjy2
= _mm_add_ps(fjy2
,ty
);
381 fjz2
= _mm_add_ps(fjz2
,tz
);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 /* COULOMB ELECTROSTATICS */
388 velec
= _mm_mul_ps(qq10
,rinv10
);
389 felec
= _mm_mul_ps(velec
,rinvsq10
);
391 /* Update potential sum for this i atom from the interaction with this j atom. */
392 velecsum
= _mm_add_ps(velecsum
,velec
);
396 /* Calculate temporary vectorial force */
397 tx
= _mm_mul_ps(fscal
,dx10
);
398 ty
= _mm_mul_ps(fscal
,dy10
);
399 tz
= _mm_mul_ps(fscal
,dz10
);
401 /* Update vectorial force */
402 fix1
= _mm_add_ps(fix1
,tx
);
403 fiy1
= _mm_add_ps(fiy1
,ty
);
404 fiz1
= _mm_add_ps(fiz1
,tz
);
406 fjx0
= _mm_add_ps(fjx0
,tx
);
407 fjy0
= _mm_add_ps(fjy0
,ty
);
408 fjz0
= _mm_add_ps(fjz0
,tz
);
410 /**************************
411 * CALCULATE INTERACTIONS *
412 **************************/
414 /* COULOMB ELECTROSTATICS */
415 velec
= _mm_mul_ps(qq11
,rinv11
);
416 felec
= _mm_mul_ps(velec
,rinvsq11
);
418 /* Update potential sum for this i atom from the interaction with this j atom. */
419 velecsum
= _mm_add_ps(velecsum
,velec
);
423 /* Calculate temporary vectorial force */
424 tx
= _mm_mul_ps(fscal
,dx11
);
425 ty
= _mm_mul_ps(fscal
,dy11
);
426 tz
= _mm_mul_ps(fscal
,dz11
);
428 /* Update vectorial force */
429 fix1
= _mm_add_ps(fix1
,tx
);
430 fiy1
= _mm_add_ps(fiy1
,ty
);
431 fiz1
= _mm_add_ps(fiz1
,tz
);
433 fjx1
= _mm_add_ps(fjx1
,tx
);
434 fjy1
= _mm_add_ps(fjy1
,ty
);
435 fjz1
= _mm_add_ps(fjz1
,tz
);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 /* COULOMB ELECTROSTATICS */
442 velec
= _mm_mul_ps(qq12
,rinv12
);
443 felec
= _mm_mul_ps(velec
,rinvsq12
);
445 /* Update potential sum for this i atom from the interaction with this j atom. */
446 velecsum
= _mm_add_ps(velecsum
,velec
);
450 /* Calculate temporary vectorial force */
451 tx
= _mm_mul_ps(fscal
,dx12
);
452 ty
= _mm_mul_ps(fscal
,dy12
);
453 tz
= _mm_mul_ps(fscal
,dz12
);
455 /* Update vectorial force */
456 fix1
= _mm_add_ps(fix1
,tx
);
457 fiy1
= _mm_add_ps(fiy1
,ty
);
458 fiz1
= _mm_add_ps(fiz1
,tz
);
460 fjx2
= _mm_add_ps(fjx2
,tx
);
461 fjy2
= _mm_add_ps(fjy2
,ty
);
462 fjz2
= _mm_add_ps(fjz2
,tz
);
464 /**************************
465 * CALCULATE INTERACTIONS *
466 **************************/
468 /* COULOMB ELECTROSTATICS */
469 velec
= _mm_mul_ps(qq20
,rinv20
);
470 felec
= _mm_mul_ps(velec
,rinvsq20
);
472 /* Update potential sum for this i atom from the interaction with this j atom. */
473 velecsum
= _mm_add_ps(velecsum
,velec
);
477 /* Calculate temporary vectorial force */
478 tx
= _mm_mul_ps(fscal
,dx20
);
479 ty
= _mm_mul_ps(fscal
,dy20
);
480 tz
= _mm_mul_ps(fscal
,dz20
);
482 /* Update vectorial force */
483 fix2
= _mm_add_ps(fix2
,tx
);
484 fiy2
= _mm_add_ps(fiy2
,ty
);
485 fiz2
= _mm_add_ps(fiz2
,tz
);
487 fjx0
= _mm_add_ps(fjx0
,tx
);
488 fjy0
= _mm_add_ps(fjy0
,ty
);
489 fjz0
= _mm_add_ps(fjz0
,tz
);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 /* COULOMB ELECTROSTATICS */
496 velec
= _mm_mul_ps(qq21
,rinv21
);
497 felec
= _mm_mul_ps(velec
,rinvsq21
);
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum
= _mm_add_ps(velecsum
,velec
);
504 /* Calculate temporary vectorial force */
505 tx
= _mm_mul_ps(fscal
,dx21
);
506 ty
= _mm_mul_ps(fscal
,dy21
);
507 tz
= _mm_mul_ps(fscal
,dz21
);
509 /* Update vectorial force */
510 fix2
= _mm_add_ps(fix2
,tx
);
511 fiy2
= _mm_add_ps(fiy2
,ty
);
512 fiz2
= _mm_add_ps(fiz2
,tz
);
514 fjx1
= _mm_add_ps(fjx1
,tx
);
515 fjy1
= _mm_add_ps(fjy1
,ty
);
516 fjz1
= _mm_add_ps(fjz1
,tz
);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 /* COULOMB ELECTROSTATICS */
523 velec
= _mm_mul_ps(qq22
,rinv22
);
524 felec
= _mm_mul_ps(velec
,rinvsq22
);
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velecsum
= _mm_add_ps(velecsum
,velec
);
531 /* Calculate temporary vectorial force */
532 tx
= _mm_mul_ps(fscal
,dx22
);
533 ty
= _mm_mul_ps(fscal
,dy22
);
534 tz
= _mm_mul_ps(fscal
,dz22
);
536 /* Update vectorial force */
537 fix2
= _mm_add_ps(fix2
,tx
);
538 fiy2
= _mm_add_ps(fiy2
,ty
);
539 fiz2
= _mm_add_ps(fiz2
,tz
);
541 fjx2
= _mm_add_ps(fjx2
,tx
);
542 fjy2
= _mm_add_ps(fjy2
,ty
);
543 fjz2
= _mm_add_ps(fjz2
,tz
);
545 fjptrA
= f
+j_coord_offsetA
;
546 fjptrB
= f
+j_coord_offsetB
;
547 fjptrC
= f
+j_coord_offsetC
;
548 fjptrD
= f
+j_coord_offsetD
;
550 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
551 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
553 /* Inner loop uses 264 flops */
559 /* Get j neighbor index, and coordinate index */
560 jnrlistA
= jjnr
[jidx
];
561 jnrlistB
= jjnr
[jidx
+1];
562 jnrlistC
= jjnr
[jidx
+2];
563 jnrlistD
= jjnr
[jidx
+3];
564 /* Sign of each element will be negative for non-real atoms.
565 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
566 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
568 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
569 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
570 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
571 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
572 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
573 j_coord_offsetA
= DIM
*jnrA
;
574 j_coord_offsetB
= DIM
*jnrB
;
575 j_coord_offsetC
= DIM
*jnrC
;
576 j_coord_offsetD
= DIM
*jnrD
;
578 /* load j atom coordinates */
579 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
580 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
581 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
583 /* Calculate displacement vector */
584 dx00
= _mm_sub_ps(ix0
,jx0
);
585 dy00
= _mm_sub_ps(iy0
,jy0
);
586 dz00
= _mm_sub_ps(iz0
,jz0
);
587 dx01
= _mm_sub_ps(ix0
,jx1
);
588 dy01
= _mm_sub_ps(iy0
,jy1
);
589 dz01
= _mm_sub_ps(iz0
,jz1
);
590 dx02
= _mm_sub_ps(ix0
,jx2
);
591 dy02
= _mm_sub_ps(iy0
,jy2
);
592 dz02
= _mm_sub_ps(iz0
,jz2
);
593 dx10
= _mm_sub_ps(ix1
,jx0
);
594 dy10
= _mm_sub_ps(iy1
,jy0
);
595 dz10
= _mm_sub_ps(iz1
,jz0
);
596 dx11
= _mm_sub_ps(ix1
,jx1
);
597 dy11
= _mm_sub_ps(iy1
,jy1
);
598 dz11
= _mm_sub_ps(iz1
,jz1
);
599 dx12
= _mm_sub_ps(ix1
,jx2
);
600 dy12
= _mm_sub_ps(iy1
,jy2
);
601 dz12
= _mm_sub_ps(iz1
,jz2
);
602 dx20
= _mm_sub_ps(ix2
,jx0
);
603 dy20
= _mm_sub_ps(iy2
,jy0
);
604 dz20
= _mm_sub_ps(iz2
,jz0
);
605 dx21
= _mm_sub_ps(ix2
,jx1
);
606 dy21
= _mm_sub_ps(iy2
,jy1
);
607 dz21
= _mm_sub_ps(iz2
,jz1
);
608 dx22
= _mm_sub_ps(ix2
,jx2
);
609 dy22
= _mm_sub_ps(iy2
,jy2
);
610 dz22
= _mm_sub_ps(iz2
,jz2
);
612 /* Calculate squared distance and things based on it */
613 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
614 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
615 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
616 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
617 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
618 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
619 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
620 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
621 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
623 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
624 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
625 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
626 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
627 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
628 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
629 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
630 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
631 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
633 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
634 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
635 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
636 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
637 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
638 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
639 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
640 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
641 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
643 fjx0
= _mm_setzero_ps();
644 fjy0
= _mm_setzero_ps();
645 fjz0
= _mm_setzero_ps();
646 fjx1
= _mm_setzero_ps();
647 fjy1
= _mm_setzero_ps();
648 fjz1
= _mm_setzero_ps();
649 fjx2
= _mm_setzero_ps();
650 fjy2
= _mm_setzero_ps();
651 fjz2
= _mm_setzero_ps();
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 /* COULOMB ELECTROSTATICS */
658 velec
= _mm_mul_ps(qq00
,rinv00
);
659 felec
= _mm_mul_ps(velec
,rinvsq00
);
661 /* LENNARD-JONES DISPERSION/REPULSION */
663 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
664 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
665 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
666 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
667 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
669 /* Update potential sum for this i atom from the interaction with this j atom. */
670 velec
= _mm_andnot_ps(dummy_mask
,velec
);
671 velecsum
= _mm_add_ps(velecsum
,velec
);
672 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
673 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
675 fscal
= _mm_add_ps(felec
,fvdw
);
677 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
679 /* Calculate temporary vectorial force */
680 tx
= _mm_mul_ps(fscal
,dx00
);
681 ty
= _mm_mul_ps(fscal
,dy00
);
682 tz
= _mm_mul_ps(fscal
,dz00
);
684 /* Update vectorial force */
685 fix0
= _mm_add_ps(fix0
,tx
);
686 fiy0
= _mm_add_ps(fiy0
,ty
);
687 fiz0
= _mm_add_ps(fiz0
,tz
);
689 fjx0
= _mm_add_ps(fjx0
,tx
);
690 fjy0
= _mm_add_ps(fjy0
,ty
);
691 fjz0
= _mm_add_ps(fjz0
,tz
);
693 /**************************
694 * CALCULATE INTERACTIONS *
695 **************************/
697 /* COULOMB ELECTROSTATICS */
698 velec
= _mm_mul_ps(qq01
,rinv01
);
699 felec
= _mm_mul_ps(velec
,rinvsq01
);
701 /* Update potential sum for this i atom from the interaction with this j atom. */
702 velec
= _mm_andnot_ps(dummy_mask
,velec
);
703 velecsum
= _mm_add_ps(velecsum
,velec
);
707 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
709 /* Calculate temporary vectorial force */
710 tx
= _mm_mul_ps(fscal
,dx01
);
711 ty
= _mm_mul_ps(fscal
,dy01
);
712 tz
= _mm_mul_ps(fscal
,dz01
);
714 /* Update vectorial force */
715 fix0
= _mm_add_ps(fix0
,tx
);
716 fiy0
= _mm_add_ps(fiy0
,ty
);
717 fiz0
= _mm_add_ps(fiz0
,tz
);
719 fjx1
= _mm_add_ps(fjx1
,tx
);
720 fjy1
= _mm_add_ps(fjy1
,ty
);
721 fjz1
= _mm_add_ps(fjz1
,tz
);
723 /**************************
724 * CALCULATE INTERACTIONS *
725 **************************/
727 /* COULOMB ELECTROSTATICS */
728 velec
= _mm_mul_ps(qq02
,rinv02
);
729 felec
= _mm_mul_ps(velec
,rinvsq02
);
731 /* Update potential sum for this i atom from the interaction with this j atom. */
732 velec
= _mm_andnot_ps(dummy_mask
,velec
);
733 velecsum
= _mm_add_ps(velecsum
,velec
);
737 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
739 /* Calculate temporary vectorial force */
740 tx
= _mm_mul_ps(fscal
,dx02
);
741 ty
= _mm_mul_ps(fscal
,dy02
);
742 tz
= _mm_mul_ps(fscal
,dz02
);
744 /* Update vectorial force */
745 fix0
= _mm_add_ps(fix0
,tx
);
746 fiy0
= _mm_add_ps(fiy0
,ty
);
747 fiz0
= _mm_add_ps(fiz0
,tz
);
749 fjx2
= _mm_add_ps(fjx2
,tx
);
750 fjy2
= _mm_add_ps(fjy2
,ty
);
751 fjz2
= _mm_add_ps(fjz2
,tz
);
753 /**************************
754 * CALCULATE INTERACTIONS *
755 **************************/
757 /* COULOMB ELECTROSTATICS */
758 velec
= _mm_mul_ps(qq10
,rinv10
);
759 felec
= _mm_mul_ps(velec
,rinvsq10
);
761 /* Update potential sum for this i atom from the interaction with this j atom. */
762 velec
= _mm_andnot_ps(dummy_mask
,velec
);
763 velecsum
= _mm_add_ps(velecsum
,velec
);
767 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
769 /* Calculate temporary vectorial force */
770 tx
= _mm_mul_ps(fscal
,dx10
);
771 ty
= _mm_mul_ps(fscal
,dy10
);
772 tz
= _mm_mul_ps(fscal
,dz10
);
774 /* Update vectorial force */
775 fix1
= _mm_add_ps(fix1
,tx
);
776 fiy1
= _mm_add_ps(fiy1
,ty
);
777 fiz1
= _mm_add_ps(fiz1
,tz
);
779 fjx0
= _mm_add_ps(fjx0
,tx
);
780 fjy0
= _mm_add_ps(fjy0
,ty
);
781 fjz0
= _mm_add_ps(fjz0
,tz
);
783 /**************************
784 * CALCULATE INTERACTIONS *
785 **************************/
787 /* COULOMB ELECTROSTATICS */
788 velec
= _mm_mul_ps(qq11
,rinv11
);
789 felec
= _mm_mul_ps(velec
,rinvsq11
);
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 velec
= _mm_andnot_ps(dummy_mask
,velec
);
793 velecsum
= _mm_add_ps(velecsum
,velec
);
797 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
799 /* Calculate temporary vectorial force */
800 tx
= _mm_mul_ps(fscal
,dx11
);
801 ty
= _mm_mul_ps(fscal
,dy11
);
802 tz
= _mm_mul_ps(fscal
,dz11
);
804 /* Update vectorial force */
805 fix1
= _mm_add_ps(fix1
,tx
);
806 fiy1
= _mm_add_ps(fiy1
,ty
);
807 fiz1
= _mm_add_ps(fiz1
,tz
);
809 fjx1
= _mm_add_ps(fjx1
,tx
);
810 fjy1
= _mm_add_ps(fjy1
,ty
);
811 fjz1
= _mm_add_ps(fjz1
,tz
);
813 /**************************
814 * CALCULATE INTERACTIONS *
815 **************************/
817 /* COULOMB ELECTROSTATICS */
818 velec
= _mm_mul_ps(qq12
,rinv12
);
819 felec
= _mm_mul_ps(velec
,rinvsq12
);
821 /* Update potential sum for this i atom from the interaction with this j atom. */
822 velec
= _mm_andnot_ps(dummy_mask
,velec
);
823 velecsum
= _mm_add_ps(velecsum
,velec
);
827 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
829 /* Calculate temporary vectorial force */
830 tx
= _mm_mul_ps(fscal
,dx12
);
831 ty
= _mm_mul_ps(fscal
,dy12
);
832 tz
= _mm_mul_ps(fscal
,dz12
);
834 /* Update vectorial force */
835 fix1
= _mm_add_ps(fix1
,tx
);
836 fiy1
= _mm_add_ps(fiy1
,ty
);
837 fiz1
= _mm_add_ps(fiz1
,tz
);
839 fjx2
= _mm_add_ps(fjx2
,tx
);
840 fjy2
= _mm_add_ps(fjy2
,ty
);
841 fjz2
= _mm_add_ps(fjz2
,tz
);
843 /**************************
844 * CALCULATE INTERACTIONS *
845 **************************/
847 /* COULOMB ELECTROSTATICS */
848 velec
= _mm_mul_ps(qq20
,rinv20
);
849 felec
= _mm_mul_ps(velec
,rinvsq20
);
851 /* Update potential sum for this i atom from the interaction with this j atom. */
852 velec
= _mm_andnot_ps(dummy_mask
,velec
);
853 velecsum
= _mm_add_ps(velecsum
,velec
);
857 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
859 /* Calculate temporary vectorial force */
860 tx
= _mm_mul_ps(fscal
,dx20
);
861 ty
= _mm_mul_ps(fscal
,dy20
);
862 tz
= _mm_mul_ps(fscal
,dz20
);
864 /* Update vectorial force */
865 fix2
= _mm_add_ps(fix2
,tx
);
866 fiy2
= _mm_add_ps(fiy2
,ty
);
867 fiz2
= _mm_add_ps(fiz2
,tz
);
869 fjx0
= _mm_add_ps(fjx0
,tx
);
870 fjy0
= _mm_add_ps(fjy0
,ty
);
871 fjz0
= _mm_add_ps(fjz0
,tz
);
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 /* COULOMB ELECTROSTATICS */
878 velec
= _mm_mul_ps(qq21
,rinv21
);
879 felec
= _mm_mul_ps(velec
,rinvsq21
);
881 /* Update potential sum for this i atom from the interaction with this j atom. */
882 velec
= _mm_andnot_ps(dummy_mask
,velec
);
883 velecsum
= _mm_add_ps(velecsum
,velec
);
887 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
889 /* Calculate temporary vectorial force */
890 tx
= _mm_mul_ps(fscal
,dx21
);
891 ty
= _mm_mul_ps(fscal
,dy21
);
892 tz
= _mm_mul_ps(fscal
,dz21
);
894 /* Update vectorial force */
895 fix2
= _mm_add_ps(fix2
,tx
);
896 fiy2
= _mm_add_ps(fiy2
,ty
);
897 fiz2
= _mm_add_ps(fiz2
,tz
);
899 fjx1
= _mm_add_ps(fjx1
,tx
);
900 fjy1
= _mm_add_ps(fjy1
,ty
);
901 fjz1
= _mm_add_ps(fjz1
,tz
);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 /* COULOMB ELECTROSTATICS */
908 velec
= _mm_mul_ps(qq22
,rinv22
);
909 felec
= _mm_mul_ps(velec
,rinvsq22
);
911 /* Update potential sum for this i atom from the interaction with this j atom. */
912 velec
= _mm_andnot_ps(dummy_mask
,velec
);
913 velecsum
= _mm_add_ps(velecsum
,velec
);
917 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
919 /* Calculate temporary vectorial force */
920 tx
= _mm_mul_ps(fscal
,dx22
);
921 ty
= _mm_mul_ps(fscal
,dy22
);
922 tz
= _mm_mul_ps(fscal
,dz22
);
924 /* Update vectorial force */
925 fix2
= _mm_add_ps(fix2
,tx
);
926 fiy2
= _mm_add_ps(fiy2
,ty
);
927 fiz2
= _mm_add_ps(fiz2
,tz
);
929 fjx2
= _mm_add_ps(fjx2
,tx
);
930 fjy2
= _mm_add_ps(fjy2
,ty
);
931 fjz2
= _mm_add_ps(fjz2
,tz
);
933 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
934 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
935 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
936 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
938 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
939 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
941 /* Inner loop uses 264 flops */
944 /* End of innermost loop */
946 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
947 f
+i_coord_offset
,fshift
+i_shift_offset
);
950 /* Update potential energies */
951 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
952 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
954 /* Increment number of inner iterations */
955 inneriter
+= j_index_end
- j_index_start
;
957 /* Outer loop uses 20 flops */
960 /* Increment number of outer iterations */
963 /* Update outer/inner flops */
965 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*264);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
 * Electrostatics interaction: Coulomb
 * VdW interaction:            LennardJones
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        Force
 */
975 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_sse2_single
976 (t_nblist
* gmx_restrict nlist
,
977 rvec
* gmx_restrict xx
,
978 rvec
* gmx_restrict ff
,
979 t_forcerec
* gmx_restrict fr
,
980 t_mdatoms
* gmx_restrict mdatoms
,
981 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
982 t_nrnb
* gmx_restrict nrnb
)
984 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
985 * just 0 for non-waters.
986 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
987 * jnr indices corresponding to data put in the four positions in the SIMD register.
989 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
990 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
991 int jnrA
,jnrB
,jnrC
,jnrD
;
992 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
993 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
994 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
996 real
*shiftvec
,*fshift
,*x
,*f
;
997 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
999 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1001 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1003 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1005 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1006 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1007 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1008 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1009 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1010 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1011 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1012 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1013 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1014 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1015 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1016 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1017 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1018 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1019 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1020 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1021 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1024 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1027 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1028 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1029 __m128 dummy_mask
,cutoff_mask
;
1030 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1031 __m128 one
= _mm_set1_ps(1.0);
1032 __m128 two
= _mm_set1_ps(2.0);
1038 jindex
= nlist
->jindex
;
1040 shiftidx
= nlist
->shift
;
1042 shiftvec
= fr
->shift_vec
[0];
1043 fshift
= fr
->fshift
[0];
1044 facel
= _mm_set1_ps(fr
->epsfac
);
1045 charge
= mdatoms
->chargeA
;
1046 nvdwtype
= fr
->ntype
;
1047 vdwparam
= fr
->nbfp
;
1048 vdwtype
= mdatoms
->typeA
;
1050 /* Setup water-specific parameters */
1051 inr
= nlist
->iinr
[0];
1052 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1053 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1054 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1055 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1057 jq0
= _mm_set1_ps(charge
[inr
+0]);
1058 jq1
= _mm_set1_ps(charge
[inr
+1]);
1059 jq2
= _mm_set1_ps(charge
[inr
+2]);
1060 vdwjidx0A
= 2*vdwtype
[inr
+0];
1061 qq00
= _mm_mul_ps(iq0
,jq0
);
1062 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1063 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1064 qq01
= _mm_mul_ps(iq0
,jq1
);
1065 qq02
= _mm_mul_ps(iq0
,jq2
);
1066 qq10
= _mm_mul_ps(iq1
,jq0
);
1067 qq11
= _mm_mul_ps(iq1
,jq1
);
1068 qq12
= _mm_mul_ps(iq1
,jq2
);
1069 qq20
= _mm_mul_ps(iq2
,jq0
);
1070 qq21
= _mm_mul_ps(iq2
,jq1
);
1071 qq22
= _mm_mul_ps(iq2
,jq2
);
1073 /* Avoid stupid compiler warnings */
1074 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1075 j_coord_offsetA
= 0;
1076 j_coord_offsetB
= 0;
1077 j_coord_offsetC
= 0;
1078 j_coord_offsetD
= 0;
1083 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1085 scratch
[iidx
] = 0.0;
1088 /* Start outer loop over neighborlists */
1089 for(iidx
=0; iidx
<nri
; iidx
++)
1091 /* Load shift vector for this list */
1092 i_shift_offset
= DIM
*shiftidx
[iidx
];
1094 /* Load limits for loop over neighbors */
1095 j_index_start
= jindex
[iidx
];
1096 j_index_end
= jindex
[iidx
+1];
1098 /* Get outer coordinate index */
1100 i_coord_offset
= DIM
*inr
;
1102 /* Load i particle coords and add shift vector */
1103 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1104 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1106 fix0
= _mm_setzero_ps();
1107 fiy0
= _mm_setzero_ps();
1108 fiz0
= _mm_setzero_ps();
1109 fix1
= _mm_setzero_ps();
1110 fiy1
= _mm_setzero_ps();
1111 fiz1
= _mm_setzero_ps();
1112 fix2
= _mm_setzero_ps();
1113 fiy2
= _mm_setzero_ps();
1114 fiz2
= _mm_setzero_ps();
1116 /* Start inner kernel loop */
1117 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1120 /* Get j neighbor index, and coordinate index */
1122 jnrB
= jjnr
[jidx
+1];
1123 jnrC
= jjnr
[jidx
+2];
1124 jnrD
= jjnr
[jidx
+3];
1125 j_coord_offsetA
= DIM
*jnrA
;
1126 j_coord_offsetB
= DIM
*jnrB
;
1127 j_coord_offsetC
= DIM
*jnrC
;
1128 j_coord_offsetD
= DIM
*jnrD
;
1130 /* load j atom coordinates */
1131 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1132 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1133 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1135 /* Calculate displacement vector */
1136 dx00
= _mm_sub_ps(ix0
,jx0
);
1137 dy00
= _mm_sub_ps(iy0
,jy0
);
1138 dz00
= _mm_sub_ps(iz0
,jz0
);
1139 dx01
= _mm_sub_ps(ix0
,jx1
);
1140 dy01
= _mm_sub_ps(iy0
,jy1
);
1141 dz01
= _mm_sub_ps(iz0
,jz1
);
1142 dx02
= _mm_sub_ps(ix0
,jx2
);
1143 dy02
= _mm_sub_ps(iy0
,jy2
);
1144 dz02
= _mm_sub_ps(iz0
,jz2
);
1145 dx10
= _mm_sub_ps(ix1
,jx0
);
1146 dy10
= _mm_sub_ps(iy1
,jy0
);
1147 dz10
= _mm_sub_ps(iz1
,jz0
);
1148 dx11
= _mm_sub_ps(ix1
,jx1
);
1149 dy11
= _mm_sub_ps(iy1
,jy1
);
1150 dz11
= _mm_sub_ps(iz1
,jz1
);
1151 dx12
= _mm_sub_ps(ix1
,jx2
);
1152 dy12
= _mm_sub_ps(iy1
,jy2
);
1153 dz12
= _mm_sub_ps(iz1
,jz2
);
1154 dx20
= _mm_sub_ps(ix2
,jx0
);
1155 dy20
= _mm_sub_ps(iy2
,jy0
);
1156 dz20
= _mm_sub_ps(iz2
,jz0
);
1157 dx21
= _mm_sub_ps(ix2
,jx1
);
1158 dy21
= _mm_sub_ps(iy2
,jy1
);
1159 dz21
= _mm_sub_ps(iz2
,jz1
);
1160 dx22
= _mm_sub_ps(ix2
,jx2
);
1161 dy22
= _mm_sub_ps(iy2
,jy2
);
1162 dz22
= _mm_sub_ps(iz2
,jz2
);
1164 /* Calculate squared distance and things based on it */
1165 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1166 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1167 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1168 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1169 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1170 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1171 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1172 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1173 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1175 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1176 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1177 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1178 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1179 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1180 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1181 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1182 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1183 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1185 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1186 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1187 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1188 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1189 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1190 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1191 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1192 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1193 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1195 fjx0
= _mm_setzero_ps();
1196 fjy0
= _mm_setzero_ps();
1197 fjz0
= _mm_setzero_ps();
1198 fjx1
= _mm_setzero_ps();
1199 fjy1
= _mm_setzero_ps();
1200 fjz1
= _mm_setzero_ps();
1201 fjx2
= _mm_setzero_ps();
1202 fjy2
= _mm_setzero_ps();
1203 fjz2
= _mm_setzero_ps();
1205 /**************************
1206 * CALCULATE INTERACTIONS *
1207 **************************/
1209 /* COULOMB ELECTROSTATICS */
1210 velec
= _mm_mul_ps(qq00
,rinv00
);
1211 felec
= _mm_mul_ps(velec
,rinvsq00
);
1213 /* LENNARD-JONES DISPERSION/REPULSION */
1215 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1216 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1218 fscal
= _mm_add_ps(felec
,fvdw
);
1220 /* Calculate temporary vectorial force */
1221 tx
= _mm_mul_ps(fscal
,dx00
);
1222 ty
= _mm_mul_ps(fscal
,dy00
);
1223 tz
= _mm_mul_ps(fscal
,dz00
);
1225 /* Update vectorial force */
1226 fix0
= _mm_add_ps(fix0
,tx
);
1227 fiy0
= _mm_add_ps(fiy0
,ty
);
1228 fiz0
= _mm_add_ps(fiz0
,tz
);
1230 fjx0
= _mm_add_ps(fjx0
,tx
);
1231 fjy0
= _mm_add_ps(fjy0
,ty
);
1232 fjz0
= _mm_add_ps(fjz0
,tz
);
1234 /**************************
1235 * CALCULATE INTERACTIONS *
1236 **************************/
1238 /* COULOMB ELECTROSTATICS */
1239 velec
= _mm_mul_ps(qq01
,rinv01
);
1240 felec
= _mm_mul_ps(velec
,rinvsq01
);
1244 /* Calculate temporary vectorial force */
1245 tx
= _mm_mul_ps(fscal
,dx01
);
1246 ty
= _mm_mul_ps(fscal
,dy01
);
1247 tz
= _mm_mul_ps(fscal
,dz01
);
1249 /* Update vectorial force */
1250 fix0
= _mm_add_ps(fix0
,tx
);
1251 fiy0
= _mm_add_ps(fiy0
,ty
);
1252 fiz0
= _mm_add_ps(fiz0
,tz
);
1254 fjx1
= _mm_add_ps(fjx1
,tx
);
1255 fjy1
= _mm_add_ps(fjy1
,ty
);
1256 fjz1
= _mm_add_ps(fjz1
,tz
);
1258 /**************************
1259 * CALCULATE INTERACTIONS *
1260 **************************/
1262 /* COULOMB ELECTROSTATICS */
1263 velec
= _mm_mul_ps(qq02
,rinv02
);
1264 felec
= _mm_mul_ps(velec
,rinvsq02
);
1268 /* Calculate temporary vectorial force */
1269 tx
= _mm_mul_ps(fscal
,dx02
);
1270 ty
= _mm_mul_ps(fscal
,dy02
);
1271 tz
= _mm_mul_ps(fscal
,dz02
);
1273 /* Update vectorial force */
1274 fix0
= _mm_add_ps(fix0
,tx
);
1275 fiy0
= _mm_add_ps(fiy0
,ty
);
1276 fiz0
= _mm_add_ps(fiz0
,tz
);
1278 fjx2
= _mm_add_ps(fjx2
,tx
);
1279 fjy2
= _mm_add_ps(fjy2
,ty
);
1280 fjz2
= _mm_add_ps(fjz2
,tz
);
1282 /**************************
1283 * CALCULATE INTERACTIONS *
1284 **************************/
1286 /* COULOMB ELECTROSTATICS */
1287 velec
= _mm_mul_ps(qq10
,rinv10
);
1288 felec
= _mm_mul_ps(velec
,rinvsq10
);
1292 /* Calculate temporary vectorial force */
1293 tx
= _mm_mul_ps(fscal
,dx10
);
1294 ty
= _mm_mul_ps(fscal
,dy10
);
1295 tz
= _mm_mul_ps(fscal
,dz10
);
1297 /* Update vectorial force */
1298 fix1
= _mm_add_ps(fix1
,tx
);
1299 fiy1
= _mm_add_ps(fiy1
,ty
);
1300 fiz1
= _mm_add_ps(fiz1
,tz
);
1302 fjx0
= _mm_add_ps(fjx0
,tx
);
1303 fjy0
= _mm_add_ps(fjy0
,ty
);
1304 fjz0
= _mm_add_ps(fjz0
,tz
);
1306 /**************************
1307 * CALCULATE INTERACTIONS *
1308 **************************/
1310 /* COULOMB ELECTROSTATICS */
1311 velec
= _mm_mul_ps(qq11
,rinv11
);
1312 felec
= _mm_mul_ps(velec
,rinvsq11
);
1316 /* Calculate temporary vectorial force */
1317 tx
= _mm_mul_ps(fscal
,dx11
);
1318 ty
= _mm_mul_ps(fscal
,dy11
);
1319 tz
= _mm_mul_ps(fscal
,dz11
);
1321 /* Update vectorial force */
1322 fix1
= _mm_add_ps(fix1
,tx
);
1323 fiy1
= _mm_add_ps(fiy1
,ty
);
1324 fiz1
= _mm_add_ps(fiz1
,tz
);
1326 fjx1
= _mm_add_ps(fjx1
,tx
);
1327 fjy1
= _mm_add_ps(fjy1
,ty
);
1328 fjz1
= _mm_add_ps(fjz1
,tz
);
1330 /**************************
1331 * CALCULATE INTERACTIONS *
1332 **************************/
1334 /* COULOMB ELECTROSTATICS */
1335 velec
= _mm_mul_ps(qq12
,rinv12
);
1336 felec
= _mm_mul_ps(velec
,rinvsq12
);
1340 /* Calculate temporary vectorial force */
1341 tx
= _mm_mul_ps(fscal
,dx12
);
1342 ty
= _mm_mul_ps(fscal
,dy12
);
1343 tz
= _mm_mul_ps(fscal
,dz12
);
1345 /* Update vectorial force */
1346 fix1
= _mm_add_ps(fix1
,tx
);
1347 fiy1
= _mm_add_ps(fiy1
,ty
);
1348 fiz1
= _mm_add_ps(fiz1
,tz
);
1350 fjx2
= _mm_add_ps(fjx2
,tx
);
1351 fjy2
= _mm_add_ps(fjy2
,ty
);
1352 fjz2
= _mm_add_ps(fjz2
,tz
);
1354 /**************************
1355 * CALCULATE INTERACTIONS *
1356 **************************/
1358 /* COULOMB ELECTROSTATICS */
1359 velec
= _mm_mul_ps(qq20
,rinv20
);
1360 felec
= _mm_mul_ps(velec
,rinvsq20
);
1364 /* Calculate temporary vectorial force */
1365 tx
= _mm_mul_ps(fscal
,dx20
);
1366 ty
= _mm_mul_ps(fscal
,dy20
);
1367 tz
= _mm_mul_ps(fscal
,dz20
);
1369 /* Update vectorial force */
1370 fix2
= _mm_add_ps(fix2
,tx
);
1371 fiy2
= _mm_add_ps(fiy2
,ty
);
1372 fiz2
= _mm_add_ps(fiz2
,tz
);
1374 fjx0
= _mm_add_ps(fjx0
,tx
);
1375 fjy0
= _mm_add_ps(fjy0
,ty
);
1376 fjz0
= _mm_add_ps(fjz0
,tz
);
1378 /**************************
1379 * CALCULATE INTERACTIONS *
1380 **************************/
1382 /* COULOMB ELECTROSTATICS */
1383 velec
= _mm_mul_ps(qq21
,rinv21
);
1384 felec
= _mm_mul_ps(velec
,rinvsq21
);
1388 /* Calculate temporary vectorial force */
1389 tx
= _mm_mul_ps(fscal
,dx21
);
1390 ty
= _mm_mul_ps(fscal
,dy21
);
1391 tz
= _mm_mul_ps(fscal
,dz21
);
1393 /* Update vectorial force */
1394 fix2
= _mm_add_ps(fix2
,tx
);
1395 fiy2
= _mm_add_ps(fiy2
,ty
);
1396 fiz2
= _mm_add_ps(fiz2
,tz
);
1398 fjx1
= _mm_add_ps(fjx1
,tx
);
1399 fjy1
= _mm_add_ps(fjy1
,ty
);
1400 fjz1
= _mm_add_ps(fjz1
,tz
);
1402 /**************************
1403 * CALCULATE INTERACTIONS *
1404 **************************/
1406 /* COULOMB ELECTROSTATICS */
1407 velec
= _mm_mul_ps(qq22
,rinv22
);
1408 felec
= _mm_mul_ps(velec
,rinvsq22
);
1412 /* Calculate temporary vectorial force */
1413 tx
= _mm_mul_ps(fscal
,dx22
);
1414 ty
= _mm_mul_ps(fscal
,dy22
);
1415 tz
= _mm_mul_ps(fscal
,dz22
);
1417 /* Update vectorial force */
1418 fix2
= _mm_add_ps(fix2
,tx
);
1419 fiy2
= _mm_add_ps(fiy2
,ty
);
1420 fiz2
= _mm_add_ps(fiz2
,tz
);
1422 fjx2
= _mm_add_ps(fjx2
,tx
);
1423 fjy2
= _mm_add_ps(fjy2
,ty
);
1424 fjz2
= _mm_add_ps(fjz2
,tz
);
1426 fjptrA
= f
+j_coord_offsetA
;
1427 fjptrB
= f
+j_coord_offsetB
;
1428 fjptrC
= f
+j_coord_offsetC
;
1429 fjptrD
= f
+j_coord_offsetD
;
1431 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1432 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1434 /* Inner loop uses 250 flops */
1437 if(jidx
<j_index_end
)
1440 /* Get j neighbor index, and coordinate index */
1441 jnrlistA
= jjnr
[jidx
];
1442 jnrlistB
= jjnr
[jidx
+1];
1443 jnrlistC
= jjnr
[jidx
+2];
1444 jnrlistD
= jjnr
[jidx
+3];
1445 /* Sign of each element will be negative for non-real atoms.
1446 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1447 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1449 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1450 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1451 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1452 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1453 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1454 j_coord_offsetA
= DIM
*jnrA
;
1455 j_coord_offsetB
= DIM
*jnrB
;
1456 j_coord_offsetC
= DIM
*jnrC
;
1457 j_coord_offsetD
= DIM
*jnrD
;
1459 /* load j atom coordinates */
1460 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1461 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1462 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1464 /* Calculate displacement vector */
1465 dx00
= _mm_sub_ps(ix0
,jx0
);
1466 dy00
= _mm_sub_ps(iy0
,jy0
);
1467 dz00
= _mm_sub_ps(iz0
,jz0
);
1468 dx01
= _mm_sub_ps(ix0
,jx1
);
1469 dy01
= _mm_sub_ps(iy0
,jy1
);
1470 dz01
= _mm_sub_ps(iz0
,jz1
);
1471 dx02
= _mm_sub_ps(ix0
,jx2
);
1472 dy02
= _mm_sub_ps(iy0
,jy2
);
1473 dz02
= _mm_sub_ps(iz0
,jz2
);
1474 dx10
= _mm_sub_ps(ix1
,jx0
);
1475 dy10
= _mm_sub_ps(iy1
,jy0
);
1476 dz10
= _mm_sub_ps(iz1
,jz0
);
1477 dx11
= _mm_sub_ps(ix1
,jx1
);
1478 dy11
= _mm_sub_ps(iy1
,jy1
);
1479 dz11
= _mm_sub_ps(iz1
,jz1
);
1480 dx12
= _mm_sub_ps(ix1
,jx2
);
1481 dy12
= _mm_sub_ps(iy1
,jy2
);
1482 dz12
= _mm_sub_ps(iz1
,jz2
);
1483 dx20
= _mm_sub_ps(ix2
,jx0
);
1484 dy20
= _mm_sub_ps(iy2
,jy0
);
1485 dz20
= _mm_sub_ps(iz2
,jz0
);
1486 dx21
= _mm_sub_ps(ix2
,jx1
);
1487 dy21
= _mm_sub_ps(iy2
,jy1
);
1488 dz21
= _mm_sub_ps(iz2
,jz1
);
1489 dx22
= _mm_sub_ps(ix2
,jx2
);
1490 dy22
= _mm_sub_ps(iy2
,jy2
);
1491 dz22
= _mm_sub_ps(iz2
,jz2
);
1493 /* Calculate squared distance and things based on it */
1494 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1495 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1496 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1497 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1498 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1499 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1500 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1501 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1502 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1504 rinv00
= gmx_mm_invsqrt_ps(rsq00
);
1505 rinv01
= gmx_mm_invsqrt_ps(rsq01
);
1506 rinv02
= gmx_mm_invsqrt_ps(rsq02
);
1507 rinv10
= gmx_mm_invsqrt_ps(rsq10
);
1508 rinv11
= gmx_mm_invsqrt_ps(rsq11
);
1509 rinv12
= gmx_mm_invsqrt_ps(rsq12
);
1510 rinv20
= gmx_mm_invsqrt_ps(rsq20
);
1511 rinv21
= gmx_mm_invsqrt_ps(rsq21
);
1512 rinv22
= gmx_mm_invsqrt_ps(rsq22
);
1514 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1515 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1516 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1517 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1518 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1519 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1520 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1521 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1522 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1524 fjx0
= _mm_setzero_ps();
1525 fjy0
= _mm_setzero_ps();
1526 fjz0
= _mm_setzero_ps();
1527 fjx1
= _mm_setzero_ps();
1528 fjy1
= _mm_setzero_ps();
1529 fjz1
= _mm_setzero_ps();
1530 fjx2
= _mm_setzero_ps();
1531 fjy2
= _mm_setzero_ps();
1532 fjz2
= _mm_setzero_ps();
1534 /**************************
1535 * CALCULATE INTERACTIONS *
1536 **************************/
1538 /* COULOMB ELECTROSTATICS */
1539 velec
= _mm_mul_ps(qq00
,rinv00
);
1540 felec
= _mm_mul_ps(velec
,rinvsq00
);
1542 /* LENNARD-JONES DISPERSION/REPULSION */
1544 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1545 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1547 fscal
= _mm_add_ps(felec
,fvdw
);
1549 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1551 /* Calculate temporary vectorial force */
1552 tx
= _mm_mul_ps(fscal
,dx00
);
1553 ty
= _mm_mul_ps(fscal
,dy00
);
1554 tz
= _mm_mul_ps(fscal
,dz00
);
1556 /* Update vectorial force */
1557 fix0
= _mm_add_ps(fix0
,tx
);
1558 fiy0
= _mm_add_ps(fiy0
,ty
);
1559 fiz0
= _mm_add_ps(fiz0
,tz
);
1561 fjx0
= _mm_add_ps(fjx0
,tx
);
1562 fjy0
= _mm_add_ps(fjy0
,ty
);
1563 fjz0
= _mm_add_ps(fjz0
,tz
);
1565 /**************************
1566 * CALCULATE INTERACTIONS *
1567 **************************/
1569 /* COULOMB ELECTROSTATICS */
1570 velec
= _mm_mul_ps(qq01
,rinv01
);
1571 felec
= _mm_mul_ps(velec
,rinvsq01
);
1575 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1577 /* Calculate temporary vectorial force */
1578 tx
= _mm_mul_ps(fscal
,dx01
);
1579 ty
= _mm_mul_ps(fscal
,dy01
);
1580 tz
= _mm_mul_ps(fscal
,dz01
);
1582 /* Update vectorial force */
1583 fix0
= _mm_add_ps(fix0
,tx
);
1584 fiy0
= _mm_add_ps(fiy0
,ty
);
1585 fiz0
= _mm_add_ps(fiz0
,tz
);
1587 fjx1
= _mm_add_ps(fjx1
,tx
);
1588 fjy1
= _mm_add_ps(fjy1
,ty
);
1589 fjz1
= _mm_add_ps(fjz1
,tz
);
1591 /**************************
1592 * CALCULATE INTERACTIONS *
1593 **************************/
1595 /* COULOMB ELECTROSTATICS */
1596 velec
= _mm_mul_ps(qq02
,rinv02
);
1597 felec
= _mm_mul_ps(velec
,rinvsq02
);
1601 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1603 /* Calculate temporary vectorial force */
1604 tx
= _mm_mul_ps(fscal
,dx02
);
1605 ty
= _mm_mul_ps(fscal
,dy02
);
1606 tz
= _mm_mul_ps(fscal
,dz02
);
1608 /* Update vectorial force */
1609 fix0
= _mm_add_ps(fix0
,tx
);
1610 fiy0
= _mm_add_ps(fiy0
,ty
);
1611 fiz0
= _mm_add_ps(fiz0
,tz
);
1613 fjx2
= _mm_add_ps(fjx2
,tx
);
1614 fjy2
= _mm_add_ps(fjy2
,ty
);
1615 fjz2
= _mm_add_ps(fjz2
,tz
);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 /* COULOMB ELECTROSTATICS */
1622 velec
= _mm_mul_ps(qq10
,rinv10
);
1623 felec
= _mm_mul_ps(velec
,rinvsq10
);
1627 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1629 /* Calculate temporary vectorial force */
1630 tx
= _mm_mul_ps(fscal
,dx10
);
1631 ty
= _mm_mul_ps(fscal
,dy10
);
1632 tz
= _mm_mul_ps(fscal
,dz10
);
1634 /* Update vectorial force */
1635 fix1
= _mm_add_ps(fix1
,tx
);
1636 fiy1
= _mm_add_ps(fiy1
,ty
);
1637 fiz1
= _mm_add_ps(fiz1
,tz
);
1639 fjx0
= _mm_add_ps(fjx0
,tx
);
1640 fjy0
= _mm_add_ps(fjy0
,ty
);
1641 fjz0
= _mm_add_ps(fjz0
,tz
);
1643 /**************************
1644 * CALCULATE INTERACTIONS *
1645 **************************/
1647 /* COULOMB ELECTROSTATICS */
1648 velec
= _mm_mul_ps(qq11
,rinv11
);
1649 felec
= _mm_mul_ps(velec
,rinvsq11
);
1653 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1655 /* Calculate temporary vectorial force */
1656 tx
= _mm_mul_ps(fscal
,dx11
);
1657 ty
= _mm_mul_ps(fscal
,dy11
);
1658 tz
= _mm_mul_ps(fscal
,dz11
);
1660 /* Update vectorial force */
1661 fix1
= _mm_add_ps(fix1
,tx
);
1662 fiy1
= _mm_add_ps(fiy1
,ty
);
1663 fiz1
= _mm_add_ps(fiz1
,tz
);
1665 fjx1
= _mm_add_ps(fjx1
,tx
);
1666 fjy1
= _mm_add_ps(fjy1
,ty
);
1667 fjz1
= _mm_add_ps(fjz1
,tz
);
1669 /**************************
1670 * CALCULATE INTERACTIONS *
1671 **************************/
1673 /* COULOMB ELECTROSTATICS */
1674 velec
= _mm_mul_ps(qq12
,rinv12
);
1675 felec
= _mm_mul_ps(velec
,rinvsq12
);
1679 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1681 /* Calculate temporary vectorial force */
1682 tx
= _mm_mul_ps(fscal
,dx12
);
1683 ty
= _mm_mul_ps(fscal
,dy12
);
1684 tz
= _mm_mul_ps(fscal
,dz12
);
1686 /* Update vectorial force */
1687 fix1
= _mm_add_ps(fix1
,tx
);
1688 fiy1
= _mm_add_ps(fiy1
,ty
);
1689 fiz1
= _mm_add_ps(fiz1
,tz
);
1691 fjx2
= _mm_add_ps(fjx2
,tx
);
1692 fjy2
= _mm_add_ps(fjy2
,ty
);
1693 fjz2
= _mm_add_ps(fjz2
,tz
);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 /* COULOMB ELECTROSTATICS */
1700 velec
= _mm_mul_ps(qq20
,rinv20
);
1701 felec
= _mm_mul_ps(velec
,rinvsq20
);
1705 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1707 /* Calculate temporary vectorial force */
1708 tx
= _mm_mul_ps(fscal
,dx20
);
1709 ty
= _mm_mul_ps(fscal
,dy20
);
1710 tz
= _mm_mul_ps(fscal
,dz20
);
1712 /* Update vectorial force */
1713 fix2
= _mm_add_ps(fix2
,tx
);
1714 fiy2
= _mm_add_ps(fiy2
,ty
);
1715 fiz2
= _mm_add_ps(fiz2
,tz
);
1717 fjx0
= _mm_add_ps(fjx0
,tx
);
1718 fjy0
= _mm_add_ps(fjy0
,ty
);
1719 fjz0
= _mm_add_ps(fjz0
,tz
);
1721 /**************************
1722 * CALCULATE INTERACTIONS *
1723 **************************/
1725 /* COULOMB ELECTROSTATICS */
1726 velec
= _mm_mul_ps(qq21
,rinv21
);
1727 felec
= _mm_mul_ps(velec
,rinvsq21
);
1731 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1733 /* Calculate temporary vectorial force */
1734 tx
= _mm_mul_ps(fscal
,dx21
);
1735 ty
= _mm_mul_ps(fscal
,dy21
);
1736 tz
= _mm_mul_ps(fscal
,dz21
);
1738 /* Update vectorial force */
1739 fix2
= _mm_add_ps(fix2
,tx
);
1740 fiy2
= _mm_add_ps(fiy2
,ty
);
1741 fiz2
= _mm_add_ps(fiz2
,tz
);
1743 fjx1
= _mm_add_ps(fjx1
,tx
);
1744 fjy1
= _mm_add_ps(fjy1
,ty
);
1745 fjz1
= _mm_add_ps(fjz1
,tz
);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 /* COULOMB ELECTROSTATICS */
1752 velec
= _mm_mul_ps(qq22
,rinv22
);
1753 felec
= _mm_mul_ps(velec
,rinvsq22
);
1757 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1759 /* Calculate temporary vectorial force */
1760 tx
= _mm_mul_ps(fscal
,dx22
);
1761 ty
= _mm_mul_ps(fscal
,dy22
);
1762 tz
= _mm_mul_ps(fscal
,dz22
);
1764 /* Update vectorial force */
1765 fix2
= _mm_add_ps(fix2
,tx
);
1766 fiy2
= _mm_add_ps(fiy2
,ty
);
1767 fiz2
= _mm_add_ps(fiz2
,tz
);
1769 fjx2
= _mm_add_ps(fjx2
,tx
);
1770 fjy2
= _mm_add_ps(fjy2
,ty
);
1771 fjz2
= _mm_add_ps(fjz2
,tz
);
1773 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1774 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1775 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1776 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1778 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1779 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1781 /* Inner loop uses 250 flops */
1784 /* End of innermost loop */
1786 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1787 f
+i_coord_offset
,fshift
+i_shift_offset
);
1789 /* Increment number of inner iterations */
1790 inneriter
+= j_index_end
- j_index_start
;
1792 /* Outer loop uses 18 flops */
1795 /* Increment number of outer iterations */
1798 /* Update outer/inner flops */
1800 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*250);