/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
 */
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single
51 * Electrostatics interaction: ReactionField
52 * VdW interaction: LennardJones
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_single
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
73 int jnrA
,jnrB
,jnrC
,jnrD
;
74 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
75 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
81 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
83 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
85 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
87 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
89 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
90 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
91 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
92 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
93 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
94 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
95 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
96 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
97 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
98 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
101 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
106 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
109 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
110 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
111 __m128 dummy_mask
,cutoff_mask
;
112 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
113 __m128 one
= _mm_set1_ps(1.0);
114 __m128 two
= _mm_set1_ps(2.0);
120 jindex
= nlist
->jindex
;
122 shiftidx
= nlist
->shift
;
124 shiftvec
= fr
->shift_vec
[0];
125 fshift
= fr
->fshift
[0];
126 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
127 charge
= mdatoms
->chargeA
;
128 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
129 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
130 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
131 nvdwtype
= fr
->ntype
;
133 vdwtype
= mdatoms
->typeA
;
135 /* Setup water-specific parameters */
136 inr
= nlist
->iinr
[0];
137 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
138 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
139 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
140 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
142 jq0
= _mm_set1_ps(charge
[inr
+0]);
143 jq1
= _mm_set1_ps(charge
[inr
+1]);
144 jq2
= _mm_set1_ps(charge
[inr
+2]);
145 vdwjidx0A
= 2*vdwtype
[inr
+0];
146 qq00
= _mm_mul_ps(iq0
,jq0
);
147 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
148 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
149 qq01
= _mm_mul_ps(iq0
,jq1
);
150 qq02
= _mm_mul_ps(iq0
,jq2
);
151 qq10
= _mm_mul_ps(iq1
,jq0
);
152 qq11
= _mm_mul_ps(iq1
,jq1
);
153 qq12
= _mm_mul_ps(iq1
,jq2
);
154 qq20
= _mm_mul_ps(iq2
,jq0
);
155 qq21
= _mm_mul_ps(iq2
,jq1
);
156 qq22
= _mm_mul_ps(iq2
,jq2
);
158 /* Avoid stupid compiler warnings */
159 jnrA
= jnrB
= jnrC
= jnrD
= 0;
168 for(iidx
=0;iidx
<4*DIM
;iidx
++)
173 /* Start outer loop over neighborlists */
174 for(iidx
=0; iidx
<nri
; iidx
++)
176 /* Load shift vector for this list */
177 i_shift_offset
= DIM
*shiftidx
[iidx
];
179 /* Load limits for loop over neighbors */
180 j_index_start
= jindex
[iidx
];
181 j_index_end
= jindex
[iidx
+1];
183 /* Get outer coordinate index */
185 i_coord_offset
= DIM
*inr
;
187 /* Load i particle coords and add shift vector */
188 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
189 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
191 fix0
= _mm_setzero_ps();
192 fiy0
= _mm_setzero_ps();
193 fiz0
= _mm_setzero_ps();
194 fix1
= _mm_setzero_ps();
195 fiy1
= _mm_setzero_ps();
196 fiz1
= _mm_setzero_ps();
197 fix2
= _mm_setzero_ps();
198 fiy2
= _mm_setzero_ps();
199 fiz2
= _mm_setzero_ps();
201 /* Reset potential sums */
202 velecsum
= _mm_setzero_ps();
203 vvdwsum
= _mm_setzero_ps();
205 /* Start inner kernel loop */
206 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
209 /* Get j neighbor index, and coordinate index */
214 j_coord_offsetA
= DIM
*jnrA
;
215 j_coord_offsetB
= DIM
*jnrB
;
216 j_coord_offsetC
= DIM
*jnrC
;
217 j_coord_offsetD
= DIM
*jnrD
;
219 /* load j atom coordinates */
220 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
221 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
222 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
224 /* Calculate displacement vector */
225 dx00
= _mm_sub_ps(ix0
,jx0
);
226 dy00
= _mm_sub_ps(iy0
,jy0
);
227 dz00
= _mm_sub_ps(iz0
,jz0
);
228 dx01
= _mm_sub_ps(ix0
,jx1
);
229 dy01
= _mm_sub_ps(iy0
,jy1
);
230 dz01
= _mm_sub_ps(iz0
,jz1
);
231 dx02
= _mm_sub_ps(ix0
,jx2
);
232 dy02
= _mm_sub_ps(iy0
,jy2
);
233 dz02
= _mm_sub_ps(iz0
,jz2
);
234 dx10
= _mm_sub_ps(ix1
,jx0
);
235 dy10
= _mm_sub_ps(iy1
,jy0
);
236 dz10
= _mm_sub_ps(iz1
,jz0
);
237 dx11
= _mm_sub_ps(ix1
,jx1
);
238 dy11
= _mm_sub_ps(iy1
,jy1
);
239 dz11
= _mm_sub_ps(iz1
,jz1
);
240 dx12
= _mm_sub_ps(ix1
,jx2
);
241 dy12
= _mm_sub_ps(iy1
,jy2
);
242 dz12
= _mm_sub_ps(iz1
,jz2
);
243 dx20
= _mm_sub_ps(ix2
,jx0
);
244 dy20
= _mm_sub_ps(iy2
,jy0
);
245 dz20
= _mm_sub_ps(iz2
,jz0
);
246 dx21
= _mm_sub_ps(ix2
,jx1
);
247 dy21
= _mm_sub_ps(iy2
,jy1
);
248 dz21
= _mm_sub_ps(iz2
,jz1
);
249 dx22
= _mm_sub_ps(ix2
,jx2
);
250 dy22
= _mm_sub_ps(iy2
,jy2
);
251 dz22
= _mm_sub_ps(iz2
,jz2
);
253 /* Calculate squared distance and things based on it */
254 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
255 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
256 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
257 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
258 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
259 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
260 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
261 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
262 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
264 rinv00
= sse41_invsqrt_f(rsq00
);
265 rinv01
= sse41_invsqrt_f(rsq01
);
266 rinv02
= sse41_invsqrt_f(rsq02
);
267 rinv10
= sse41_invsqrt_f(rsq10
);
268 rinv11
= sse41_invsqrt_f(rsq11
);
269 rinv12
= sse41_invsqrt_f(rsq12
);
270 rinv20
= sse41_invsqrt_f(rsq20
);
271 rinv21
= sse41_invsqrt_f(rsq21
);
272 rinv22
= sse41_invsqrt_f(rsq22
);
274 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
275 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
276 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
277 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
278 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
279 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
280 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
281 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
282 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
284 fjx0
= _mm_setzero_ps();
285 fjy0
= _mm_setzero_ps();
286 fjz0
= _mm_setzero_ps();
287 fjx1
= _mm_setzero_ps();
288 fjy1
= _mm_setzero_ps();
289 fjz1
= _mm_setzero_ps();
290 fjx2
= _mm_setzero_ps();
291 fjy2
= _mm_setzero_ps();
292 fjz2
= _mm_setzero_ps();
294 /**************************
295 * CALCULATE INTERACTIONS *
296 **************************/
298 /* REACTION-FIELD ELECTROSTATICS */
299 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_add_ps(rinv00
,_mm_mul_ps(krf
,rsq00
)),crf
));
300 felec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_mul_ps(rinv00
,rinvsq00
),krf2
));
302 /* LENNARD-JONES DISPERSION/REPULSION */
304 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
305 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
306 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
307 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
308 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
310 /* Update potential sum for this i atom from the interaction with this j atom. */
311 velecsum
= _mm_add_ps(velecsum
,velec
);
312 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
314 fscal
= _mm_add_ps(felec
,fvdw
);
316 /* Calculate temporary vectorial force */
317 tx
= _mm_mul_ps(fscal
,dx00
);
318 ty
= _mm_mul_ps(fscal
,dy00
);
319 tz
= _mm_mul_ps(fscal
,dz00
);
321 /* Update vectorial force */
322 fix0
= _mm_add_ps(fix0
,tx
);
323 fiy0
= _mm_add_ps(fiy0
,ty
);
324 fiz0
= _mm_add_ps(fiz0
,tz
);
326 fjx0
= _mm_add_ps(fjx0
,tx
);
327 fjy0
= _mm_add_ps(fjy0
,ty
);
328 fjz0
= _mm_add_ps(fjz0
,tz
);
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 /* REACTION-FIELD ELECTROSTATICS */
335 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_add_ps(rinv01
,_mm_mul_ps(krf
,rsq01
)),crf
));
336 felec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_mul_ps(rinv01
,rinvsq01
),krf2
));
338 /* Update potential sum for this i atom from the interaction with this j atom. */
339 velecsum
= _mm_add_ps(velecsum
,velec
);
343 /* Calculate temporary vectorial force */
344 tx
= _mm_mul_ps(fscal
,dx01
);
345 ty
= _mm_mul_ps(fscal
,dy01
);
346 tz
= _mm_mul_ps(fscal
,dz01
);
348 /* Update vectorial force */
349 fix0
= _mm_add_ps(fix0
,tx
);
350 fiy0
= _mm_add_ps(fiy0
,ty
);
351 fiz0
= _mm_add_ps(fiz0
,tz
);
353 fjx1
= _mm_add_ps(fjx1
,tx
);
354 fjy1
= _mm_add_ps(fjy1
,ty
);
355 fjz1
= _mm_add_ps(fjz1
,tz
);
357 /**************************
358 * CALCULATE INTERACTIONS *
359 **************************/
361 /* REACTION-FIELD ELECTROSTATICS */
362 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_add_ps(rinv02
,_mm_mul_ps(krf
,rsq02
)),crf
));
363 felec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_mul_ps(rinv02
,rinvsq02
),krf2
));
365 /* Update potential sum for this i atom from the interaction with this j atom. */
366 velecsum
= _mm_add_ps(velecsum
,velec
);
370 /* Calculate temporary vectorial force */
371 tx
= _mm_mul_ps(fscal
,dx02
);
372 ty
= _mm_mul_ps(fscal
,dy02
);
373 tz
= _mm_mul_ps(fscal
,dz02
);
375 /* Update vectorial force */
376 fix0
= _mm_add_ps(fix0
,tx
);
377 fiy0
= _mm_add_ps(fiy0
,ty
);
378 fiz0
= _mm_add_ps(fiz0
,tz
);
380 fjx2
= _mm_add_ps(fjx2
,tx
);
381 fjy2
= _mm_add_ps(fjy2
,ty
);
382 fjz2
= _mm_add_ps(fjz2
,tz
);
384 /**************************
385 * CALCULATE INTERACTIONS *
386 **************************/
388 /* REACTION-FIELD ELECTROSTATICS */
389 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_add_ps(rinv10
,_mm_mul_ps(krf
,rsq10
)),crf
));
390 felec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_mul_ps(rinv10
,rinvsq10
),krf2
));
392 /* Update potential sum for this i atom from the interaction with this j atom. */
393 velecsum
= _mm_add_ps(velecsum
,velec
);
397 /* Calculate temporary vectorial force */
398 tx
= _mm_mul_ps(fscal
,dx10
);
399 ty
= _mm_mul_ps(fscal
,dy10
);
400 tz
= _mm_mul_ps(fscal
,dz10
);
402 /* Update vectorial force */
403 fix1
= _mm_add_ps(fix1
,tx
);
404 fiy1
= _mm_add_ps(fiy1
,ty
);
405 fiz1
= _mm_add_ps(fiz1
,tz
);
407 fjx0
= _mm_add_ps(fjx0
,tx
);
408 fjy0
= _mm_add_ps(fjy0
,ty
);
409 fjz0
= _mm_add_ps(fjz0
,tz
);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
415 /* REACTION-FIELD ELECTROSTATICS */
416 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
417 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum
= _mm_add_ps(velecsum
,velec
);
424 /* Calculate temporary vectorial force */
425 tx
= _mm_mul_ps(fscal
,dx11
);
426 ty
= _mm_mul_ps(fscal
,dy11
);
427 tz
= _mm_mul_ps(fscal
,dz11
);
429 /* Update vectorial force */
430 fix1
= _mm_add_ps(fix1
,tx
);
431 fiy1
= _mm_add_ps(fiy1
,ty
);
432 fiz1
= _mm_add_ps(fiz1
,tz
);
434 fjx1
= _mm_add_ps(fjx1
,tx
);
435 fjy1
= _mm_add_ps(fjy1
,ty
);
436 fjz1
= _mm_add_ps(fjz1
,tz
);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 /* REACTION-FIELD ELECTROSTATICS */
443 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
444 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velecsum
= _mm_add_ps(velecsum
,velec
);
451 /* Calculate temporary vectorial force */
452 tx
= _mm_mul_ps(fscal
,dx12
);
453 ty
= _mm_mul_ps(fscal
,dy12
);
454 tz
= _mm_mul_ps(fscal
,dz12
);
456 /* Update vectorial force */
457 fix1
= _mm_add_ps(fix1
,tx
);
458 fiy1
= _mm_add_ps(fiy1
,ty
);
459 fiz1
= _mm_add_ps(fiz1
,tz
);
461 fjx2
= _mm_add_ps(fjx2
,tx
);
462 fjy2
= _mm_add_ps(fjy2
,ty
);
463 fjz2
= _mm_add_ps(fjz2
,tz
);
465 /**************************
466 * CALCULATE INTERACTIONS *
467 **************************/
469 /* REACTION-FIELD ELECTROSTATICS */
470 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_add_ps(rinv20
,_mm_mul_ps(krf
,rsq20
)),crf
));
471 felec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_mul_ps(rinv20
,rinvsq20
),krf2
));
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velecsum
= _mm_add_ps(velecsum
,velec
);
478 /* Calculate temporary vectorial force */
479 tx
= _mm_mul_ps(fscal
,dx20
);
480 ty
= _mm_mul_ps(fscal
,dy20
);
481 tz
= _mm_mul_ps(fscal
,dz20
);
483 /* Update vectorial force */
484 fix2
= _mm_add_ps(fix2
,tx
);
485 fiy2
= _mm_add_ps(fiy2
,ty
);
486 fiz2
= _mm_add_ps(fiz2
,tz
);
488 fjx0
= _mm_add_ps(fjx0
,tx
);
489 fjy0
= _mm_add_ps(fjy0
,ty
);
490 fjz0
= _mm_add_ps(fjz0
,tz
);
492 /**************************
493 * CALCULATE INTERACTIONS *
494 **************************/
496 /* REACTION-FIELD ELECTROSTATICS */
497 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
498 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
500 /* Update potential sum for this i atom from the interaction with this j atom. */
501 velecsum
= _mm_add_ps(velecsum
,velec
);
505 /* Calculate temporary vectorial force */
506 tx
= _mm_mul_ps(fscal
,dx21
);
507 ty
= _mm_mul_ps(fscal
,dy21
);
508 tz
= _mm_mul_ps(fscal
,dz21
);
510 /* Update vectorial force */
511 fix2
= _mm_add_ps(fix2
,tx
);
512 fiy2
= _mm_add_ps(fiy2
,ty
);
513 fiz2
= _mm_add_ps(fiz2
,tz
);
515 fjx1
= _mm_add_ps(fjx1
,tx
);
516 fjy1
= _mm_add_ps(fjy1
,ty
);
517 fjz1
= _mm_add_ps(fjz1
,tz
);
519 /**************************
520 * CALCULATE INTERACTIONS *
521 **************************/
523 /* REACTION-FIELD ELECTROSTATICS */
524 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
525 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velecsum
= _mm_add_ps(velecsum
,velec
);
532 /* Calculate temporary vectorial force */
533 tx
= _mm_mul_ps(fscal
,dx22
);
534 ty
= _mm_mul_ps(fscal
,dy22
);
535 tz
= _mm_mul_ps(fscal
,dz22
);
537 /* Update vectorial force */
538 fix2
= _mm_add_ps(fix2
,tx
);
539 fiy2
= _mm_add_ps(fiy2
,ty
);
540 fiz2
= _mm_add_ps(fiz2
,tz
);
542 fjx2
= _mm_add_ps(fjx2
,tx
);
543 fjy2
= _mm_add_ps(fjy2
,ty
);
544 fjz2
= _mm_add_ps(fjz2
,tz
);
546 fjptrA
= f
+j_coord_offsetA
;
547 fjptrB
= f
+j_coord_offsetB
;
548 fjptrC
= f
+j_coord_offsetC
;
549 fjptrD
= f
+j_coord_offsetD
;
551 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
552 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
554 /* Inner loop uses 300 flops */
560 /* Get j neighbor index, and coordinate index */
561 jnrlistA
= jjnr
[jidx
];
562 jnrlistB
= jjnr
[jidx
+1];
563 jnrlistC
= jjnr
[jidx
+2];
564 jnrlistD
= jjnr
[jidx
+3];
565 /* Sign of each element will be negative for non-real atoms.
566 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
567 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
569 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
570 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
571 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
572 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
573 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
574 j_coord_offsetA
= DIM
*jnrA
;
575 j_coord_offsetB
= DIM
*jnrB
;
576 j_coord_offsetC
= DIM
*jnrC
;
577 j_coord_offsetD
= DIM
*jnrD
;
579 /* load j atom coordinates */
580 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
581 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
582 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
584 /* Calculate displacement vector */
585 dx00
= _mm_sub_ps(ix0
,jx0
);
586 dy00
= _mm_sub_ps(iy0
,jy0
);
587 dz00
= _mm_sub_ps(iz0
,jz0
);
588 dx01
= _mm_sub_ps(ix0
,jx1
);
589 dy01
= _mm_sub_ps(iy0
,jy1
);
590 dz01
= _mm_sub_ps(iz0
,jz1
);
591 dx02
= _mm_sub_ps(ix0
,jx2
);
592 dy02
= _mm_sub_ps(iy0
,jy2
);
593 dz02
= _mm_sub_ps(iz0
,jz2
);
594 dx10
= _mm_sub_ps(ix1
,jx0
);
595 dy10
= _mm_sub_ps(iy1
,jy0
);
596 dz10
= _mm_sub_ps(iz1
,jz0
);
597 dx11
= _mm_sub_ps(ix1
,jx1
);
598 dy11
= _mm_sub_ps(iy1
,jy1
);
599 dz11
= _mm_sub_ps(iz1
,jz1
);
600 dx12
= _mm_sub_ps(ix1
,jx2
);
601 dy12
= _mm_sub_ps(iy1
,jy2
);
602 dz12
= _mm_sub_ps(iz1
,jz2
);
603 dx20
= _mm_sub_ps(ix2
,jx0
);
604 dy20
= _mm_sub_ps(iy2
,jy0
);
605 dz20
= _mm_sub_ps(iz2
,jz0
);
606 dx21
= _mm_sub_ps(ix2
,jx1
);
607 dy21
= _mm_sub_ps(iy2
,jy1
);
608 dz21
= _mm_sub_ps(iz2
,jz1
);
609 dx22
= _mm_sub_ps(ix2
,jx2
);
610 dy22
= _mm_sub_ps(iy2
,jy2
);
611 dz22
= _mm_sub_ps(iz2
,jz2
);
613 /* Calculate squared distance and things based on it */
614 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
615 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
616 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
617 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
618 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
619 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
620 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
621 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
622 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
624 rinv00
= sse41_invsqrt_f(rsq00
);
625 rinv01
= sse41_invsqrt_f(rsq01
);
626 rinv02
= sse41_invsqrt_f(rsq02
);
627 rinv10
= sse41_invsqrt_f(rsq10
);
628 rinv11
= sse41_invsqrt_f(rsq11
);
629 rinv12
= sse41_invsqrt_f(rsq12
);
630 rinv20
= sse41_invsqrt_f(rsq20
);
631 rinv21
= sse41_invsqrt_f(rsq21
);
632 rinv22
= sse41_invsqrt_f(rsq22
);
634 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
635 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
636 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
637 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
638 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
639 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
640 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
641 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
642 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
644 fjx0
= _mm_setzero_ps();
645 fjy0
= _mm_setzero_ps();
646 fjz0
= _mm_setzero_ps();
647 fjx1
= _mm_setzero_ps();
648 fjy1
= _mm_setzero_ps();
649 fjz1
= _mm_setzero_ps();
650 fjx2
= _mm_setzero_ps();
651 fjy2
= _mm_setzero_ps();
652 fjz2
= _mm_setzero_ps();
654 /**************************
655 * CALCULATE INTERACTIONS *
656 **************************/
658 /* REACTION-FIELD ELECTROSTATICS */
659 velec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_add_ps(rinv00
,_mm_mul_ps(krf
,rsq00
)),crf
));
660 felec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_mul_ps(rinv00
,rinvsq00
),krf2
));
662 /* LENNARD-JONES DISPERSION/REPULSION */
664 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
665 vvdw6
= _mm_mul_ps(c6_00
,rinvsix
);
666 vvdw12
= _mm_mul_ps(c12_00
,_mm_mul_ps(rinvsix
,rinvsix
));
667 vvdw
= _mm_sub_ps( _mm_mul_ps(vvdw12
,one_twelfth
) , _mm_mul_ps(vvdw6
,one_sixth
) );
668 fvdw
= _mm_mul_ps(_mm_sub_ps(vvdw12
,vvdw6
),rinvsq00
);
670 /* Update potential sum for this i atom from the interaction with this j atom. */
671 velec
= _mm_andnot_ps(dummy_mask
,velec
);
672 velecsum
= _mm_add_ps(velecsum
,velec
);
673 vvdw
= _mm_andnot_ps(dummy_mask
,vvdw
);
674 vvdwsum
= _mm_add_ps(vvdwsum
,vvdw
);
676 fscal
= _mm_add_ps(felec
,fvdw
);
678 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
680 /* Calculate temporary vectorial force */
681 tx
= _mm_mul_ps(fscal
,dx00
);
682 ty
= _mm_mul_ps(fscal
,dy00
);
683 tz
= _mm_mul_ps(fscal
,dz00
);
685 /* Update vectorial force */
686 fix0
= _mm_add_ps(fix0
,tx
);
687 fiy0
= _mm_add_ps(fiy0
,ty
);
688 fiz0
= _mm_add_ps(fiz0
,tz
);
690 fjx0
= _mm_add_ps(fjx0
,tx
);
691 fjy0
= _mm_add_ps(fjy0
,ty
);
692 fjz0
= _mm_add_ps(fjz0
,tz
);
694 /**************************
695 * CALCULATE INTERACTIONS *
696 **************************/
698 /* REACTION-FIELD ELECTROSTATICS */
699 velec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_add_ps(rinv01
,_mm_mul_ps(krf
,rsq01
)),crf
));
700 felec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_mul_ps(rinv01
,rinvsq01
),krf2
));
702 /* Update potential sum for this i atom from the interaction with this j atom. */
703 velec
= _mm_andnot_ps(dummy_mask
,velec
);
704 velecsum
= _mm_add_ps(velecsum
,velec
);
708 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
710 /* Calculate temporary vectorial force */
711 tx
= _mm_mul_ps(fscal
,dx01
);
712 ty
= _mm_mul_ps(fscal
,dy01
);
713 tz
= _mm_mul_ps(fscal
,dz01
);
715 /* Update vectorial force */
716 fix0
= _mm_add_ps(fix0
,tx
);
717 fiy0
= _mm_add_ps(fiy0
,ty
);
718 fiz0
= _mm_add_ps(fiz0
,tz
);
720 fjx1
= _mm_add_ps(fjx1
,tx
);
721 fjy1
= _mm_add_ps(fjy1
,ty
);
722 fjz1
= _mm_add_ps(fjz1
,tz
);
724 /**************************
725 * CALCULATE INTERACTIONS *
726 **************************/
728 /* REACTION-FIELD ELECTROSTATICS */
729 velec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_add_ps(rinv02
,_mm_mul_ps(krf
,rsq02
)),crf
));
730 felec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_mul_ps(rinv02
,rinvsq02
),krf2
));
732 /* Update potential sum for this i atom from the interaction with this j atom. */
733 velec
= _mm_andnot_ps(dummy_mask
,velec
);
734 velecsum
= _mm_add_ps(velecsum
,velec
);
738 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
740 /* Calculate temporary vectorial force */
741 tx
= _mm_mul_ps(fscal
,dx02
);
742 ty
= _mm_mul_ps(fscal
,dy02
);
743 tz
= _mm_mul_ps(fscal
,dz02
);
745 /* Update vectorial force */
746 fix0
= _mm_add_ps(fix0
,tx
);
747 fiy0
= _mm_add_ps(fiy0
,ty
);
748 fiz0
= _mm_add_ps(fiz0
,tz
);
750 fjx2
= _mm_add_ps(fjx2
,tx
);
751 fjy2
= _mm_add_ps(fjy2
,ty
);
752 fjz2
= _mm_add_ps(fjz2
,tz
);
754 /**************************
755 * CALCULATE INTERACTIONS *
756 **************************/
758 /* REACTION-FIELD ELECTROSTATICS */
759 velec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_add_ps(rinv10
,_mm_mul_ps(krf
,rsq10
)),crf
));
760 felec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_mul_ps(rinv10
,rinvsq10
),krf2
));
762 /* Update potential sum for this i atom from the interaction with this j atom. */
763 velec
= _mm_andnot_ps(dummy_mask
,velec
);
764 velecsum
= _mm_add_ps(velecsum
,velec
);
768 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
770 /* Calculate temporary vectorial force */
771 tx
= _mm_mul_ps(fscal
,dx10
);
772 ty
= _mm_mul_ps(fscal
,dy10
);
773 tz
= _mm_mul_ps(fscal
,dz10
);
775 /* Update vectorial force */
776 fix1
= _mm_add_ps(fix1
,tx
);
777 fiy1
= _mm_add_ps(fiy1
,ty
);
778 fiz1
= _mm_add_ps(fiz1
,tz
);
780 fjx0
= _mm_add_ps(fjx0
,tx
);
781 fjy0
= _mm_add_ps(fjy0
,ty
);
782 fjz0
= _mm_add_ps(fjz0
,tz
);
784 /**************************
785 * CALCULATE INTERACTIONS *
786 **************************/
788 /* REACTION-FIELD ELECTROSTATICS */
789 velec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_add_ps(rinv11
,_mm_mul_ps(krf
,rsq11
)),crf
));
790 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
792 /* Update potential sum for this i atom from the interaction with this j atom. */
793 velec
= _mm_andnot_ps(dummy_mask
,velec
);
794 velecsum
= _mm_add_ps(velecsum
,velec
);
798 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
800 /* Calculate temporary vectorial force */
801 tx
= _mm_mul_ps(fscal
,dx11
);
802 ty
= _mm_mul_ps(fscal
,dy11
);
803 tz
= _mm_mul_ps(fscal
,dz11
);
805 /* Update vectorial force */
806 fix1
= _mm_add_ps(fix1
,tx
);
807 fiy1
= _mm_add_ps(fiy1
,ty
);
808 fiz1
= _mm_add_ps(fiz1
,tz
);
810 fjx1
= _mm_add_ps(fjx1
,tx
);
811 fjy1
= _mm_add_ps(fjy1
,ty
);
812 fjz1
= _mm_add_ps(fjz1
,tz
);
814 /**************************
815 * CALCULATE INTERACTIONS *
816 **************************/
818 /* REACTION-FIELD ELECTROSTATICS */
819 velec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_add_ps(rinv12
,_mm_mul_ps(krf
,rsq12
)),crf
));
820 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
822 /* Update potential sum for this i atom from the interaction with this j atom. */
823 velec
= _mm_andnot_ps(dummy_mask
,velec
);
824 velecsum
= _mm_add_ps(velecsum
,velec
);
828 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
830 /* Calculate temporary vectorial force */
831 tx
= _mm_mul_ps(fscal
,dx12
);
832 ty
= _mm_mul_ps(fscal
,dy12
);
833 tz
= _mm_mul_ps(fscal
,dz12
);
835 /* Update vectorial force */
836 fix1
= _mm_add_ps(fix1
,tx
);
837 fiy1
= _mm_add_ps(fiy1
,ty
);
838 fiz1
= _mm_add_ps(fiz1
,tz
);
840 fjx2
= _mm_add_ps(fjx2
,tx
);
841 fjy2
= _mm_add_ps(fjy2
,ty
);
842 fjz2
= _mm_add_ps(fjz2
,tz
);
844 /**************************
845 * CALCULATE INTERACTIONS *
846 **************************/
848 /* REACTION-FIELD ELECTROSTATICS */
849 velec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_add_ps(rinv20
,_mm_mul_ps(krf
,rsq20
)),crf
));
850 felec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_mul_ps(rinv20
,rinvsq20
),krf2
));
852 /* Update potential sum for this i atom from the interaction with this j atom. */
853 velec
= _mm_andnot_ps(dummy_mask
,velec
);
854 velecsum
= _mm_add_ps(velecsum
,velec
);
858 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
860 /* Calculate temporary vectorial force */
861 tx
= _mm_mul_ps(fscal
,dx20
);
862 ty
= _mm_mul_ps(fscal
,dy20
);
863 tz
= _mm_mul_ps(fscal
,dz20
);
865 /* Update vectorial force */
866 fix2
= _mm_add_ps(fix2
,tx
);
867 fiy2
= _mm_add_ps(fiy2
,ty
);
868 fiz2
= _mm_add_ps(fiz2
,tz
);
870 fjx0
= _mm_add_ps(fjx0
,tx
);
871 fjy0
= _mm_add_ps(fjy0
,ty
);
872 fjz0
= _mm_add_ps(fjz0
,tz
);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 /* REACTION-FIELD ELECTROSTATICS */
879 velec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_add_ps(rinv21
,_mm_mul_ps(krf
,rsq21
)),crf
));
880 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
882 /* Update potential sum for this i atom from the interaction with this j atom. */
883 velec
= _mm_andnot_ps(dummy_mask
,velec
);
884 velecsum
= _mm_add_ps(velecsum
,velec
);
888 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
890 /* Calculate temporary vectorial force */
891 tx
= _mm_mul_ps(fscal
,dx21
);
892 ty
= _mm_mul_ps(fscal
,dy21
);
893 tz
= _mm_mul_ps(fscal
,dz21
);
895 /* Update vectorial force */
896 fix2
= _mm_add_ps(fix2
,tx
);
897 fiy2
= _mm_add_ps(fiy2
,ty
);
898 fiz2
= _mm_add_ps(fiz2
,tz
);
900 fjx1
= _mm_add_ps(fjx1
,tx
);
901 fjy1
= _mm_add_ps(fjy1
,ty
);
902 fjz1
= _mm_add_ps(fjz1
,tz
);
904 /**************************
905 * CALCULATE INTERACTIONS *
906 **************************/
908 /* REACTION-FIELD ELECTROSTATICS */
909 velec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_add_ps(rinv22
,_mm_mul_ps(krf
,rsq22
)),crf
));
910 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
912 /* Update potential sum for this i atom from the interaction with this j atom. */
913 velec
= _mm_andnot_ps(dummy_mask
,velec
);
914 velecsum
= _mm_add_ps(velecsum
,velec
);
918 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
920 /* Calculate temporary vectorial force */
921 tx
= _mm_mul_ps(fscal
,dx22
);
922 ty
= _mm_mul_ps(fscal
,dy22
);
923 tz
= _mm_mul_ps(fscal
,dz22
);
925 /* Update vectorial force */
926 fix2
= _mm_add_ps(fix2
,tx
);
927 fiy2
= _mm_add_ps(fiy2
,ty
);
928 fiz2
= _mm_add_ps(fiz2
,tz
);
930 fjx2
= _mm_add_ps(fjx2
,tx
);
931 fjy2
= _mm_add_ps(fjy2
,ty
);
932 fjz2
= _mm_add_ps(fjz2
,tz
);
934 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
935 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
936 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
937 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
939 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
940 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
942 /* Inner loop uses 300 flops */
945 /* End of innermost loop */
947 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
948 f
+i_coord_offset
,fshift
+i_shift_offset
);
951 /* Update potential energies */
952 gmx_mm_update_1pot_ps(velecsum
,kernel_data
->energygrp_elec
+ggid
);
953 gmx_mm_update_1pot_ps(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
955 /* Increment number of inner iterations */
956 inneriter
+= j_index_end
- j_index_start
;
958 /* Outer loop uses 20 flops */
961 /* Increment number of outer iterations */
964 /* Update outer/inner flops */
966 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*300);
969 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
970 * Electrostatics interaction: ReactionField
971 * VdW interaction: LennardJones
972 * Geometry: Water3-Water3
973 * Calculate force/pot: Force
976 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_single
977 (t_nblist
* gmx_restrict nlist
,
978 rvec
* gmx_restrict xx
,
979 rvec
* gmx_restrict ff
,
980 struct t_forcerec
* gmx_restrict fr
,
981 t_mdatoms
* gmx_restrict mdatoms
,
982 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
983 t_nrnb
* gmx_restrict nrnb
)
985 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
986 * just 0 for non-waters.
987 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
988 * jnr indices corresponding to data put in the four positions in the SIMD register.
990 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
991 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
992 int jnrA
,jnrB
,jnrC
,jnrD
;
993 int jnrlistA
,jnrlistB
,jnrlistC
,jnrlistD
;
994 int j_coord_offsetA
,j_coord_offsetB
,j_coord_offsetC
,j_coord_offsetD
;
995 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
997 real
*shiftvec
,*fshift
,*x
,*f
;
998 real
*fjptrA
,*fjptrB
,*fjptrC
,*fjptrD
;
1000 __m128 tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1002 __m128 ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1004 __m128 ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1006 __m128 ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1007 int vdwjidx0A
,vdwjidx0B
,vdwjidx0C
,vdwjidx0D
;
1008 __m128 jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1009 int vdwjidx1A
,vdwjidx1B
,vdwjidx1C
,vdwjidx1D
;
1010 __m128 jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1011 int vdwjidx2A
,vdwjidx2B
,vdwjidx2C
,vdwjidx2D
;
1012 __m128 jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1013 __m128 dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1014 __m128 dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1015 __m128 dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1016 __m128 dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1017 __m128 dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1018 __m128 dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1019 __m128 dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1020 __m128 dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1021 __m128 dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1022 __m128 velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1025 __m128 rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1028 __m128 one_sixth
= _mm_set1_ps(1.0/6.0);
1029 __m128 one_twelfth
= _mm_set1_ps(1.0/12.0);
1030 __m128 dummy_mask
,cutoff_mask
;
1031 __m128 signbit
= _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1032 __m128 one
= _mm_set1_ps(1.0);
1033 __m128 two
= _mm_set1_ps(2.0);
1039 jindex
= nlist
->jindex
;
1041 shiftidx
= nlist
->shift
;
1043 shiftvec
= fr
->shift_vec
[0];
1044 fshift
= fr
->fshift
[0];
1045 facel
= _mm_set1_ps(fr
->ic
->epsfac
);
1046 charge
= mdatoms
->chargeA
;
1047 krf
= _mm_set1_ps(fr
->ic
->k_rf
);
1048 krf2
= _mm_set1_ps(fr
->ic
->k_rf
*2.0);
1049 crf
= _mm_set1_ps(fr
->ic
->c_rf
);
1050 nvdwtype
= fr
->ntype
;
1051 vdwparam
= fr
->nbfp
;
1052 vdwtype
= mdatoms
->typeA
;
1054 /* Setup water-specific parameters */
1055 inr
= nlist
->iinr
[0];
1056 iq0
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+0]));
1057 iq1
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+1]));
1058 iq2
= _mm_mul_ps(facel
,_mm_set1_ps(charge
[inr
+2]));
1059 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1061 jq0
= _mm_set1_ps(charge
[inr
+0]);
1062 jq1
= _mm_set1_ps(charge
[inr
+1]);
1063 jq2
= _mm_set1_ps(charge
[inr
+2]);
1064 vdwjidx0A
= 2*vdwtype
[inr
+0];
1065 qq00
= _mm_mul_ps(iq0
,jq0
);
1066 c6_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1067 c12_00
= _mm_set1_ps(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1068 qq01
= _mm_mul_ps(iq0
,jq1
);
1069 qq02
= _mm_mul_ps(iq0
,jq2
);
1070 qq10
= _mm_mul_ps(iq1
,jq0
);
1071 qq11
= _mm_mul_ps(iq1
,jq1
);
1072 qq12
= _mm_mul_ps(iq1
,jq2
);
1073 qq20
= _mm_mul_ps(iq2
,jq0
);
1074 qq21
= _mm_mul_ps(iq2
,jq1
);
1075 qq22
= _mm_mul_ps(iq2
,jq2
);
1077 /* Avoid stupid compiler warnings */
1078 jnrA
= jnrB
= jnrC
= jnrD
= 0;
1079 j_coord_offsetA
= 0;
1080 j_coord_offsetB
= 0;
1081 j_coord_offsetC
= 0;
1082 j_coord_offsetD
= 0;
1087 for(iidx
=0;iidx
<4*DIM
;iidx
++)
1089 scratch
[iidx
] = 0.0;
1092 /* Start outer loop over neighborlists */
1093 for(iidx
=0; iidx
<nri
; iidx
++)
1095 /* Load shift vector for this list */
1096 i_shift_offset
= DIM
*shiftidx
[iidx
];
1098 /* Load limits for loop over neighbors */
1099 j_index_start
= jindex
[iidx
];
1100 j_index_end
= jindex
[iidx
+1];
1102 /* Get outer coordinate index */
1104 i_coord_offset
= DIM
*inr
;
1106 /* Load i particle coords and add shift vector */
1107 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1108 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1110 fix0
= _mm_setzero_ps();
1111 fiy0
= _mm_setzero_ps();
1112 fiz0
= _mm_setzero_ps();
1113 fix1
= _mm_setzero_ps();
1114 fiy1
= _mm_setzero_ps();
1115 fiz1
= _mm_setzero_ps();
1116 fix2
= _mm_setzero_ps();
1117 fiy2
= _mm_setzero_ps();
1118 fiz2
= _mm_setzero_ps();
1120 /* Start inner kernel loop */
1121 for(jidx
=j_index_start
; jidx
<j_index_end
&& jjnr
[jidx
+3]>=0; jidx
+=4)
1124 /* Get j neighbor index, and coordinate index */
1126 jnrB
= jjnr
[jidx
+1];
1127 jnrC
= jjnr
[jidx
+2];
1128 jnrD
= jjnr
[jidx
+3];
1129 j_coord_offsetA
= DIM
*jnrA
;
1130 j_coord_offsetB
= DIM
*jnrB
;
1131 j_coord_offsetC
= DIM
*jnrC
;
1132 j_coord_offsetD
= DIM
*jnrD
;
1134 /* load j atom coordinates */
1135 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1136 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1137 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1139 /* Calculate displacement vector */
1140 dx00
= _mm_sub_ps(ix0
,jx0
);
1141 dy00
= _mm_sub_ps(iy0
,jy0
);
1142 dz00
= _mm_sub_ps(iz0
,jz0
);
1143 dx01
= _mm_sub_ps(ix0
,jx1
);
1144 dy01
= _mm_sub_ps(iy0
,jy1
);
1145 dz01
= _mm_sub_ps(iz0
,jz1
);
1146 dx02
= _mm_sub_ps(ix0
,jx2
);
1147 dy02
= _mm_sub_ps(iy0
,jy2
);
1148 dz02
= _mm_sub_ps(iz0
,jz2
);
1149 dx10
= _mm_sub_ps(ix1
,jx0
);
1150 dy10
= _mm_sub_ps(iy1
,jy0
);
1151 dz10
= _mm_sub_ps(iz1
,jz0
);
1152 dx11
= _mm_sub_ps(ix1
,jx1
);
1153 dy11
= _mm_sub_ps(iy1
,jy1
);
1154 dz11
= _mm_sub_ps(iz1
,jz1
);
1155 dx12
= _mm_sub_ps(ix1
,jx2
);
1156 dy12
= _mm_sub_ps(iy1
,jy2
);
1157 dz12
= _mm_sub_ps(iz1
,jz2
);
1158 dx20
= _mm_sub_ps(ix2
,jx0
);
1159 dy20
= _mm_sub_ps(iy2
,jy0
);
1160 dz20
= _mm_sub_ps(iz2
,jz0
);
1161 dx21
= _mm_sub_ps(ix2
,jx1
);
1162 dy21
= _mm_sub_ps(iy2
,jy1
);
1163 dz21
= _mm_sub_ps(iz2
,jz1
);
1164 dx22
= _mm_sub_ps(ix2
,jx2
);
1165 dy22
= _mm_sub_ps(iy2
,jy2
);
1166 dz22
= _mm_sub_ps(iz2
,jz2
);
1168 /* Calculate squared distance and things based on it */
1169 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1170 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1171 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1172 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1173 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1174 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1175 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1176 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1177 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1179 rinv00
= sse41_invsqrt_f(rsq00
);
1180 rinv01
= sse41_invsqrt_f(rsq01
);
1181 rinv02
= sse41_invsqrt_f(rsq02
);
1182 rinv10
= sse41_invsqrt_f(rsq10
);
1183 rinv11
= sse41_invsqrt_f(rsq11
);
1184 rinv12
= sse41_invsqrt_f(rsq12
);
1185 rinv20
= sse41_invsqrt_f(rsq20
);
1186 rinv21
= sse41_invsqrt_f(rsq21
);
1187 rinv22
= sse41_invsqrt_f(rsq22
);
1189 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1190 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1191 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1192 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1193 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1194 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1195 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1196 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1197 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1199 fjx0
= _mm_setzero_ps();
1200 fjy0
= _mm_setzero_ps();
1201 fjz0
= _mm_setzero_ps();
1202 fjx1
= _mm_setzero_ps();
1203 fjy1
= _mm_setzero_ps();
1204 fjz1
= _mm_setzero_ps();
1205 fjx2
= _mm_setzero_ps();
1206 fjy2
= _mm_setzero_ps();
1207 fjz2
= _mm_setzero_ps();
1209 /**************************
1210 * CALCULATE INTERACTIONS *
1211 **************************/
1213 /* REACTION-FIELD ELECTROSTATICS */
1214 felec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_mul_ps(rinv00
,rinvsq00
),krf2
));
1216 /* LENNARD-JONES DISPERSION/REPULSION */
1218 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1219 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1221 fscal
= _mm_add_ps(felec
,fvdw
);
1223 /* Calculate temporary vectorial force */
1224 tx
= _mm_mul_ps(fscal
,dx00
);
1225 ty
= _mm_mul_ps(fscal
,dy00
);
1226 tz
= _mm_mul_ps(fscal
,dz00
);
1228 /* Update vectorial force */
1229 fix0
= _mm_add_ps(fix0
,tx
);
1230 fiy0
= _mm_add_ps(fiy0
,ty
);
1231 fiz0
= _mm_add_ps(fiz0
,tz
);
1233 fjx0
= _mm_add_ps(fjx0
,tx
);
1234 fjy0
= _mm_add_ps(fjy0
,ty
);
1235 fjz0
= _mm_add_ps(fjz0
,tz
);
1237 /**************************
1238 * CALCULATE INTERACTIONS *
1239 **************************/
1241 /* REACTION-FIELD ELECTROSTATICS */
1242 felec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_mul_ps(rinv01
,rinvsq01
),krf2
));
1246 /* Calculate temporary vectorial force */
1247 tx
= _mm_mul_ps(fscal
,dx01
);
1248 ty
= _mm_mul_ps(fscal
,dy01
);
1249 tz
= _mm_mul_ps(fscal
,dz01
);
1251 /* Update vectorial force */
1252 fix0
= _mm_add_ps(fix0
,tx
);
1253 fiy0
= _mm_add_ps(fiy0
,ty
);
1254 fiz0
= _mm_add_ps(fiz0
,tz
);
1256 fjx1
= _mm_add_ps(fjx1
,tx
);
1257 fjy1
= _mm_add_ps(fjy1
,ty
);
1258 fjz1
= _mm_add_ps(fjz1
,tz
);
1260 /**************************
1261 * CALCULATE INTERACTIONS *
1262 **************************/
1264 /* REACTION-FIELD ELECTROSTATICS */
1265 felec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_mul_ps(rinv02
,rinvsq02
),krf2
));
1269 /* Calculate temporary vectorial force */
1270 tx
= _mm_mul_ps(fscal
,dx02
);
1271 ty
= _mm_mul_ps(fscal
,dy02
);
1272 tz
= _mm_mul_ps(fscal
,dz02
);
1274 /* Update vectorial force */
1275 fix0
= _mm_add_ps(fix0
,tx
);
1276 fiy0
= _mm_add_ps(fiy0
,ty
);
1277 fiz0
= _mm_add_ps(fiz0
,tz
);
1279 fjx2
= _mm_add_ps(fjx2
,tx
);
1280 fjy2
= _mm_add_ps(fjy2
,ty
);
1281 fjz2
= _mm_add_ps(fjz2
,tz
);
1283 /**************************
1284 * CALCULATE INTERACTIONS *
1285 **************************/
1287 /* REACTION-FIELD ELECTROSTATICS */
1288 felec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_mul_ps(rinv10
,rinvsq10
),krf2
));
1292 /* Calculate temporary vectorial force */
1293 tx
= _mm_mul_ps(fscal
,dx10
);
1294 ty
= _mm_mul_ps(fscal
,dy10
);
1295 tz
= _mm_mul_ps(fscal
,dz10
);
1297 /* Update vectorial force */
1298 fix1
= _mm_add_ps(fix1
,tx
);
1299 fiy1
= _mm_add_ps(fiy1
,ty
);
1300 fiz1
= _mm_add_ps(fiz1
,tz
);
1302 fjx0
= _mm_add_ps(fjx0
,tx
);
1303 fjy0
= _mm_add_ps(fjy0
,ty
);
1304 fjz0
= _mm_add_ps(fjz0
,tz
);
1306 /**************************
1307 * CALCULATE INTERACTIONS *
1308 **************************/
1310 /* REACTION-FIELD ELECTROSTATICS */
1311 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
1315 /* Calculate temporary vectorial force */
1316 tx
= _mm_mul_ps(fscal
,dx11
);
1317 ty
= _mm_mul_ps(fscal
,dy11
);
1318 tz
= _mm_mul_ps(fscal
,dz11
);
1320 /* Update vectorial force */
1321 fix1
= _mm_add_ps(fix1
,tx
);
1322 fiy1
= _mm_add_ps(fiy1
,ty
);
1323 fiz1
= _mm_add_ps(fiz1
,tz
);
1325 fjx1
= _mm_add_ps(fjx1
,tx
);
1326 fjy1
= _mm_add_ps(fjy1
,ty
);
1327 fjz1
= _mm_add_ps(fjz1
,tz
);
1329 /**************************
1330 * CALCULATE INTERACTIONS *
1331 **************************/
1333 /* REACTION-FIELD ELECTROSTATICS */
1334 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
1338 /* Calculate temporary vectorial force */
1339 tx
= _mm_mul_ps(fscal
,dx12
);
1340 ty
= _mm_mul_ps(fscal
,dy12
);
1341 tz
= _mm_mul_ps(fscal
,dz12
);
1343 /* Update vectorial force */
1344 fix1
= _mm_add_ps(fix1
,tx
);
1345 fiy1
= _mm_add_ps(fiy1
,ty
);
1346 fiz1
= _mm_add_ps(fiz1
,tz
);
1348 fjx2
= _mm_add_ps(fjx2
,tx
);
1349 fjy2
= _mm_add_ps(fjy2
,ty
);
1350 fjz2
= _mm_add_ps(fjz2
,tz
);
1352 /**************************
1353 * CALCULATE INTERACTIONS *
1354 **************************/
1356 /* REACTION-FIELD ELECTROSTATICS */
1357 felec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_mul_ps(rinv20
,rinvsq20
),krf2
));
1361 /* Calculate temporary vectorial force */
1362 tx
= _mm_mul_ps(fscal
,dx20
);
1363 ty
= _mm_mul_ps(fscal
,dy20
);
1364 tz
= _mm_mul_ps(fscal
,dz20
);
1366 /* Update vectorial force */
1367 fix2
= _mm_add_ps(fix2
,tx
);
1368 fiy2
= _mm_add_ps(fiy2
,ty
);
1369 fiz2
= _mm_add_ps(fiz2
,tz
);
1371 fjx0
= _mm_add_ps(fjx0
,tx
);
1372 fjy0
= _mm_add_ps(fjy0
,ty
);
1373 fjz0
= _mm_add_ps(fjz0
,tz
);
1375 /**************************
1376 * CALCULATE INTERACTIONS *
1377 **************************/
1379 /* REACTION-FIELD ELECTROSTATICS */
1380 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
1384 /* Calculate temporary vectorial force */
1385 tx
= _mm_mul_ps(fscal
,dx21
);
1386 ty
= _mm_mul_ps(fscal
,dy21
);
1387 tz
= _mm_mul_ps(fscal
,dz21
);
1389 /* Update vectorial force */
1390 fix2
= _mm_add_ps(fix2
,tx
);
1391 fiy2
= _mm_add_ps(fiy2
,ty
);
1392 fiz2
= _mm_add_ps(fiz2
,tz
);
1394 fjx1
= _mm_add_ps(fjx1
,tx
);
1395 fjy1
= _mm_add_ps(fjy1
,ty
);
1396 fjz1
= _mm_add_ps(fjz1
,tz
);
1398 /**************************
1399 * CALCULATE INTERACTIONS *
1400 **************************/
1402 /* REACTION-FIELD ELECTROSTATICS */
1403 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1407 /* Calculate temporary vectorial force */
1408 tx
= _mm_mul_ps(fscal
,dx22
);
1409 ty
= _mm_mul_ps(fscal
,dy22
);
1410 tz
= _mm_mul_ps(fscal
,dz22
);
1412 /* Update vectorial force */
1413 fix2
= _mm_add_ps(fix2
,tx
);
1414 fiy2
= _mm_add_ps(fiy2
,ty
);
1415 fiz2
= _mm_add_ps(fiz2
,tz
);
1417 fjx2
= _mm_add_ps(fjx2
,tx
);
1418 fjy2
= _mm_add_ps(fjy2
,ty
);
1419 fjz2
= _mm_add_ps(fjz2
,tz
);
1421 fjptrA
= f
+j_coord_offsetA
;
1422 fjptrB
= f
+j_coord_offsetB
;
1423 fjptrC
= f
+j_coord_offsetC
;
1424 fjptrD
= f
+j_coord_offsetD
;
1426 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1427 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1429 /* Inner loop uses 250 flops */
1432 if(jidx
<j_index_end
)
1435 /* Get j neighbor index, and coordinate index */
1436 jnrlistA
= jjnr
[jidx
];
1437 jnrlistB
= jjnr
[jidx
+1];
1438 jnrlistC
= jjnr
[jidx
+2];
1439 jnrlistD
= jjnr
[jidx
+3];
1440 /* Sign of each element will be negative for non-real atoms.
1441 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1442 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1444 dummy_mask
= gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i
*)(jjnr
+jidx
)),_mm_setzero_si128()));
1445 jnrA
= (jnrlistA
>=0) ? jnrlistA
: 0;
1446 jnrB
= (jnrlistB
>=0) ? jnrlistB
: 0;
1447 jnrC
= (jnrlistC
>=0) ? jnrlistC
: 0;
1448 jnrD
= (jnrlistD
>=0) ? jnrlistD
: 0;
1449 j_coord_offsetA
= DIM
*jnrA
;
1450 j_coord_offsetB
= DIM
*jnrB
;
1451 j_coord_offsetC
= DIM
*jnrC
;
1452 j_coord_offsetD
= DIM
*jnrD
;
1454 /* load j atom coordinates */
1455 gmx_mm_load_3rvec_4ptr_swizzle_ps(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1456 x
+j_coord_offsetC
,x
+j_coord_offsetD
,
1457 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1459 /* Calculate displacement vector */
1460 dx00
= _mm_sub_ps(ix0
,jx0
);
1461 dy00
= _mm_sub_ps(iy0
,jy0
);
1462 dz00
= _mm_sub_ps(iz0
,jz0
);
1463 dx01
= _mm_sub_ps(ix0
,jx1
);
1464 dy01
= _mm_sub_ps(iy0
,jy1
);
1465 dz01
= _mm_sub_ps(iz0
,jz1
);
1466 dx02
= _mm_sub_ps(ix0
,jx2
);
1467 dy02
= _mm_sub_ps(iy0
,jy2
);
1468 dz02
= _mm_sub_ps(iz0
,jz2
);
1469 dx10
= _mm_sub_ps(ix1
,jx0
);
1470 dy10
= _mm_sub_ps(iy1
,jy0
);
1471 dz10
= _mm_sub_ps(iz1
,jz0
);
1472 dx11
= _mm_sub_ps(ix1
,jx1
);
1473 dy11
= _mm_sub_ps(iy1
,jy1
);
1474 dz11
= _mm_sub_ps(iz1
,jz1
);
1475 dx12
= _mm_sub_ps(ix1
,jx2
);
1476 dy12
= _mm_sub_ps(iy1
,jy2
);
1477 dz12
= _mm_sub_ps(iz1
,jz2
);
1478 dx20
= _mm_sub_ps(ix2
,jx0
);
1479 dy20
= _mm_sub_ps(iy2
,jy0
);
1480 dz20
= _mm_sub_ps(iz2
,jz0
);
1481 dx21
= _mm_sub_ps(ix2
,jx1
);
1482 dy21
= _mm_sub_ps(iy2
,jy1
);
1483 dz21
= _mm_sub_ps(iz2
,jz1
);
1484 dx22
= _mm_sub_ps(ix2
,jx2
);
1485 dy22
= _mm_sub_ps(iy2
,jy2
);
1486 dz22
= _mm_sub_ps(iz2
,jz2
);
1488 /* Calculate squared distance and things based on it */
1489 rsq00
= gmx_mm_calc_rsq_ps(dx00
,dy00
,dz00
);
1490 rsq01
= gmx_mm_calc_rsq_ps(dx01
,dy01
,dz01
);
1491 rsq02
= gmx_mm_calc_rsq_ps(dx02
,dy02
,dz02
);
1492 rsq10
= gmx_mm_calc_rsq_ps(dx10
,dy10
,dz10
);
1493 rsq11
= gmx_mm_calc_rsq_ps(dx11
,dy11
,dz11
);
1494 rsq12
= gmx_mm_calc_rsq_ps(dx12
,dy12
,dz12
);
1495 rsq20
= gmx_mm_calc_rsq_ps(dx20
,dy20
,dz20
);
1496 rsq21
= gmx_mm_calc_rsq_ps(dx21
,dy21
,dz21
);
1497 rsq22
= gmx_mm_calc_rsq_ps(dx22
,dy22
,dz22
);
1499 rinv00
= sse41_invsqrt_f(rsq00
);
1500 rinv01
= sse41_invsqrt_f(rsq01
);
1501 rinv02
= sse41_invsqrt_f(rsq02
);
1502 rinv10
= sse41_invsqrt_f(rsq10
);
1503 rinv11
= sse41_invsqrt_f(rsq11
);
1504 rinv12
= sse41_invsqrt_f(rsq12
);
1505 rinv20
= sse41_invsqrt_f(rsq20
);
1506 rinv21
= sse41_invsqrt_f(rsq21
);
1507 rinv22
= sse41_invsqrt_f(rsq22
);
1509 rinvsq00
= _mm_mul_ps(rinv00
,rinv00
);
1510 rinvsq01
= _mm_mul_ps(rinv01
,rinv01
);
1511 rinvsq02
= _mm_mul_ps(rinv02
,rinv02
);
1512 rinvsq10
= _mm_mul_ps(rinv10
,rinv10
);
1513 rinvsq11
= _mm_mul_ps(rinv11
,rinv11
);
1514 rinvsq12
= _mm_mul_ps(rinv12
,rinv12
);
1515 rinvsq20
= _mm_mul_ps(rinv20
,rinv20
);
1516 rinvsq21
= _mm_mul_ps(rinv21
,rinv21
);
1517 rinvsq22
= _mm_mul_ps(rinv22
,rinv22
);
1519 fjx0
= _mm_setzero_ps();
1520 fjy0
= _mm_setzero_ps();
1521 fjz0
= _mm_setzero_ps();
1522 fjx1
= _mm_setzero_ps();
1523 fjy1
= _mm_setzero_ps();
1524 fjz1
= _mm_setzero_ps();
1525 fjx2
= _mm_setzero_ps();
1526 fjy2
= _mm_setzero_ps();
1527 fjz2
= _mm_setzero_ps();
1529 /**************************
1530 * CALCULATE INTERACTIONS *
1531 **************************/
1533 /* REACTION-FIELD ELECTROSTATICS */
1534 felec
= _mm_mul_ps(qq00
,_mm_sub_ps(_mm_mul_ps(rinv00
,rinvsq00
),krf2
));
1536 /* LENNARD-JONES DISPERSION/REPULSION */
1538 rinvsix
= _mm_mul_ps(_mm_mul_ps(rinvsq00
,rinvsq00
),rinvsq00
);
1539 fvdw
= _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00
,rinvsix
),c6_00
),_mm_mul_ps(rinvsix
,rinvsq00
));
1541 fscal
= _mm_add_ps(felec
,fvdw
);
1543 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1545 /* Calculate temporary vectorial force */
1546 tx
= _mm_mul_ps(fscal
,dx00
);
1547 ty
= _mm_mul_ps(fscal
,dy00
);
1548 tz
= _mm_mul_ps(fscal
,dz00
);
1550 /* Update vectorial force */
1551 fix0
= _mm_add_ps(fix0
,tx
);
1552 fiy0
= _mm_add_ps(fiy0
,ty
);
1553 fiz0
= _mm_add_ps(fiz0
,tz
);
1555 fjx0
= _mm_add_ps(fjx0
,tx
);
1556 fjy0
= _mm_add_ps(fjy0
,ty
);
1557 fjz0
= _mm_add_ps(fjz0
,tz
);
1559 /**************************
1560 * CALCULATE INTERACTIONS *
1561 **************************/
1563 /* REACTION-FIELD ELECTROSTATICS */
1564 felec
= _mm_mul_ps(qq01
,_mm_sub_ps(_mm_mul_ps(rinv01
,rinvsq01
),krf2
));
1568 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1570 /* Calculate temporary vectorial force */
1571 tx
= _mm_mul_ps(fscal
,dx01
);
1572 ty
= _mm_mul_ps(fscal
,dy01
);
1573 tz
= _mm_mul_ps(fscal
,dz01
);
1575 /* Update vectorial force */
1576 fix0
= _mm_add_ps(fix0
,tx
);
1577 fiy0
= _mm_add_ps(fiy0
,ty
);
1578 fiz0
= _mm_add_ps(fiz0
,tz
);
1580 fjx1
= _mm_add_ps(fjx1
,tx
);
1581 fjy1
= _mm_add_ps(fjy1
,ty
);
1582 fjz1
= _mm_add_ps(fjz1
,tz
);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 /* REACTION-FIELD ELECTROSTATICS */
1589 felec
= _mm_mul_ps(qq02
,_mm_sub_ps(_mm_mul_ps(rinv02
,rinvsq02
),krf2
));
1593 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1595 /* Calculate temporary vectorial force */
1596 tx
= _mm_mul_ps(fscal
,dx02
);
1597 ty
= _mm_mul_ps(fscal
,dy02
);
1598 tz
= _mm_mul_ps(fscal
,dz02
);
1600 /* Update vectorial force */
1601 fix0
= _mm_add_ps(fix0
,tx
);
1602 fiy0
= _mm_add_ps(fiy0
,ty
);
1603 fiz0
= _mm_add_ps(fiz0
,tz
);
1605 fjx2
= _mm_add_ps(fjx2
,tx
);
1606 fjy2
= _mm_add_ps(fjy2
,ty
);
1607 fjz2
= _mm_add_ps(fjz2
,tz
);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 /* REACTION-FIELD ELECTROSTATICS */
1614 felec
= _mm_mul_ps(qq10
,_mm_sub_ps(_mm_mul_ps(rinv10
,rinvsq10
),krf2
));
1618 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1620 /* Calculate temporary vectorial force */
1621 tx
= _mm_mul_ps(fscal
,dx10
);
1622 ty
= _mm_mul_ps(fscal
,dy10
);
1623 tz
= _mm_mul_ps(fscal
,dz10
);
1625 /* Update vectorial force */
1626 fix1
= _mm_add_ps(fix1
,tx
);
1627 fiy1
= _mm_add_ps(fiy1
,ty
);
1628 fiz1
= _mm_add_ps(fiz1
,tz
);
1630 fjx0
= _mm_add_ps(fjx0
,tx
);
1631 fjy0
= _mm_add_ps(fjy0
,ty
);
1632 fjz0
= _mm_add_ps(fjz0
,tz
);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 /* REACTION-FIELD ELECTROSTATICS */
1639 felec
= _mm_mul_ps(qq11
,_mm_sub_ps(_mm_mul_ps(rinv11
,rinvsq11
),krf2
));
1643 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1645 /* Calculate temporary vectorial force */
1646 tx
= _mm_mul_ps(fscal
,dx11
);
1647 ty
= _mm_mul_ps(fscal
,dy11
);
1648 tz
= _mm_mul_ps(fscal
,dz11
);
1650 /* Update vectorial force */
1651 fix1
= _mm_add_ps(fix1
,tx
);
1652 fiy1
= _mm_add_ps(fiy1
,ty
);
1653 fiz1
= _mm_add_ps(fiz1
,tz
);
1655 fjx1
= _mm_add_ps(fjx1
,tx
);
1656 fjy1
= _mm_add_ps(fjy1
,ty
);
1657 fjz1
= _mm_add_ps(fjz1
,tz
);
1659 /**************************
1660 * CALCULATE INTERACTIONS *
1661 **************************/
1663 /* REACTION-FIELD ELECTROSTATICS */
1664 felec
= _mm_mul_ps(qq12
,_mm_sub_ps(_mm_mul_ps(rinv12
,rinvsq12
),krf2
));
1668 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1670 /* Calculate temporary vectorial force */
1671 tx
= _mm_mul_ps(fscal
,dx12
);
1672 ty
= _mm_mul_ps(fscal
,dy12
);
1673 tz
= _mm_mul_ps(fscal
,dz12
);
1675 /* Update vectorial force */
1676 fix1
= _mm_add_ps(fix1
,tx
);
1677 fiy1
= _mm_add_ps(fiy1
,ty
);
1678 fiz1
= _mm_add_ps(fiz1
,tz
);
1680 fjx2
= _mm_add_ps(fjx2
,tx
);
1681 fjy2
= _mm_add_ps(fjy2
,ty
);
1682 fjz2
= _mm_add_ps(fjz2
,tz
);
1684 /**************************
1685 * CALCULATE INTERACTIONS *
1686 **************************/
1688 /* REACTION-FIELD ELECTROSTATICS */
1689 felec
= _mm_mul_ps(qq20
,_mm_sub_ps(_mm_mul_ps(rinv20
,rinvsq20
),krf2
));
1693 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1695 /* Calculate temporary vectorial force */
1696 tx
= _mm_mul_ps(fscal
,dx20
);
1697 ty
= _mm_mul_ps(fscal
,dy20
);
1698 tz
= _mm_mul_ps(fscal
,dz20
);
1700 /* Update vectorial force */
1701 fix2
= _mm_add_ps(fix2
,tx
);
1702 fiy2
= _mm_add_ps(fiy2
,ty
);
1703 fiz2
= _mm_add_ps(fiz2
,tz
);
1705 fjx0
= _mm_add_ps(fjx0
,tx
);
1706 fjy0
= _mm_add_ps(fjy0
,ty
);
1707 fjz0
= _mm_add_ps(fjz0
,tz
);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 /* REACTION-FIELD ELECTROSTATICS */
1714 felec
= _mm_mul_ps(qq21
,_mm_sub_ps(_mm_mul_ps(rinv21
,rinvsq21
),krf2
));
1718 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1720 /* Calculate temporary vectorial force */
1721 tx
= _mm_mul_ps(fscal
,dx21
);
1722 ty
= _mm_mul_ps(fscal
,dy21
);
1723 tz
= _mm_mul_ps(fscal
,dz21
);
1725 /* Update vectorial force */
1726 fix2
= _mm_add_ps(fix2
,tx
);
1727 fiy2
= _mm_add_ps(fiy2
,ty
);
1728 fiz2
= _mm_add_ps(fiz2
,tz
);
1730 fjx1
= _mm_add_ps(fjx1
,tx
);
1731 fjy1
= _mm_add_ps(fjy1
,ty
);
1732 fjz1
= _mm_add_ps(fjz1
,tz
);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 /* REACTION-FIELD ELECTROSTATICS */
1739 felec
= _mm_mul_ps(qq22
,_mm_sub_ps(_mm_mul_ps(rinv22
,rinvsq22
),krf2
));
1743 fscal
= _mm_andnot_ps(dummy_mask
,fscal
);
1745 /* Calculate temporary vectorial force */
1746 tx
= _mm_mul_ps(fscal
,dx22
);
1747 ty
= _mm_mul_ps(fscal
,dy22
);
1748 tz
= _mm_mul_ps(fscal
,dz22
);
1750 /* Update vectorial force */
1751 fix2
= _mm_add_ps(fix2
,tx
);
1752 fiy2
= _mm_add_ps(fiy2
,ty
);
1753 fiz2
= _mm_add_ps(fiz2
,tz
);
1755 fjx2
= _mm_add_ps(fjx2
,tx
);
1756 fjy2
= _mm_add_ps(fjy2
,ty
);
1757 fjz2
= _mm_add_ps(fjz2
,tz
);
1759 fjptrA
= (jnrlistA
>=0) ? f
+j_coord_offsetA
: scratch
;
1760 fjptrB
= (jnrlistB
>=0) ? f
+j_coord_offsetB
: scratch
;
1761 fjptrC
= (jnrlistC
>=0) ? f
+j_coord_offsetC
: scratch
;
1762 fjptrD
= (jnrlistD
>=0) ? f
+j_coord_offsetD
: scratch
;
1764 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA
,fjptrB
,fjptrC
,fjptrD
,
1765 fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1767 /* Inner loop uses 250 flops */
1770 /* End of innermost loop */
1772 gmx_mm_update_iforce_3atom_swizzle_ps(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1773 f
+i_coord_offset
,fshift
+i_shift_offset
);
1775 /* Increment number of inner iterations */
1776 inneriter
+= j_index_end
- j_index_start
;
1778 /* Outer loop uses 18 flops */
1781 /* Increment number of outer iterations */
1784 /* Update outer/inner flops */
1786 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*250);