2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_sse4_1_double.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_double
 * Electrostatics interaction: ReactionField
 * VdW interaction:            LennardJones
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
57 nb_kernel_ElecRF_VdwLJ_GeomW3W3_VF_sse4_1_double
58 (t_nblist
* gmx_restrict nlist
,
59 rvec
* gmx_restrict xx
,
60 rvec
* gmx_restrict ff
,
61 struct t_forcerec
* gmx_restrict fr
,
62 t_mdatoms
* gmx_restrict mdatoms
,
63 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
64 t_nrnb
* gmx_restrict nrnb
)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
72 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
74 int j_coord_offsetA
,j_coord_offsetB
;
75 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
77 real
*shiftvec
,*fshift
,*x
,*f
;
78 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
80 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
82 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
84 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
85 int vdwjidx0A
,vdwjidx0B
;
86 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
87 int vdwjidx1A
,vdwjidx1B
;
88 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
89 int vdwjidx2A
,vdwjidx2B
;
90 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
91 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
92 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
93 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
94 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
95 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
96 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
97 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
98 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
99 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
100 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
103 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
106 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
107 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
108 __m128d dummy_mask
,cutoff_mask
;
109 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
110 __m128d one
= _mm_set1_pd(1.0);
111 __m128d two
= _mm_set1_pd(2.0);
117 jindex
= nlist
->jindex
;
119 shiftidx
= nlist
->shift
;
121 shiftvec
= fr
->shift_vec
[0];
122 fshift
= fr
->fshift
[0];
123 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
124 charge
= mdatoms
->chargeA
;
125 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
126 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
127 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
128 nvdwtype
= fr
->ntype
;
130 vdwtype
= mdatoms
->typeA
;
132 /* Setup water-specific parameters */
133 inr
= nlist
->iinr
[0];
134 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
135 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
136 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
137 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
139 jq0
= _mm_set1_pd(charge
[inr
+0]);
140 jq1
= _mm_set1_pd(charge
[inr
+1]);
141 jq2
= _mm_set1_pd(charge
[inr
+2]);
142 vdwjidx0A
= 2*vdwtype
[inr
+0];
143 qq00
= _mm_mul_pd(iq0
,jq0
);
144 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
145 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
146 qq01
= _mm_mul_pd(iq0
,jq1
);
147 qq02
= _mm_mul_pd(iq0
,jq2
);
148 qq10
= _mm_mul_pd(iq1
,jq0
);
149 qq11
= _mm_mul_pd(iq1
,jq1
);
150 qq12
= _mm_mul_pd(iq1
,jq2
);
151 qq20
= _mm_mul_pd(iq2
,jq0
);
152 qq21
= _mm_mul_pd(iq2
,jq1
);
153 qq22
= _mm_mul_pd(iq2
,jq2
);
155 /* Avoid stupid compiler warnings */
163 /* Start outer loop over neighborlists */
164 for(iidx
=0; iidx
<nri
; iidx
++)
166 /* Load shift vector for this list */
167 i_shift_offset
= DIM
*shiftidx
[iidx
];
169 /* Load limits for loop over neighbors */
170 j_index_start
= jindex
[iidx
];
171 j_index_end
= jindex
[iidx
+1];
173 /* Get outer coordinate index */
175 i_coord_offset
= DIM
*inr
;
177 /* Load i particle coords and add shift vector */
178 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
179 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
181 fix0
= _mm_setzero_pd();
182 fiy0
= _mm_setzero_pd();
183 fiz0
= _mm_setzero_pd();
184 fix1
= _mm_setzero_pd();
185 fiy1
= _mm_setzero_pd();
186 fiz1
= _mm_setzero_pd();
187 fix2
= _mm_setzero_pd();
188 fiy2
= _mm_setzero_pd();
189 fiz2
= _mm_setzero_pd();
191 /* Reset potential sums */
192 velecsum
= _mm_setzero_pd();
193 vvdwsum
= _mm_setzero_pd();
195 /* Start inner kernel loop */
196 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
199 /* Get j neighbor index, and coordinate index */
202 j_coord_offsetA
= DIM
*jnrA
;
203 j_coord_offsetB
= DIM
*jnrB
;
205 /* load j atom coordinates */
206 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
207 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
209 /* Calculate displacement vector */
210 dx00
= _mm_sub_pd(ix0
,jx0
);
211 dy00
= _mm_sub_pd(iy0
,jy0
);
212 dz00
= _mm_sub_pd(iz0
,jz0
);
213 dx01
= _mm_sub_pd(ix0
,jx1
);
214 dy01
= _mm_sub_pd(iy0
,jy1
);
215 dz01
= _mm_sub_pd(iz0
,jz1
);
216 dx02
= _mm_sub_pd(ix0
,jx2
);
217 dy02
= _mm_sub_pd(iy0
,jy2
);
218 dz02
= _mm_sub_pd(iz0
,jz2
);
219 dx10
= _mm_sub_pd(ix1
,jx0
);
220 dy10
= _mm_sub_pd(iy1
,jy0
);
221 dz10
= _mm_sub_pd(iz1
,jz0
);
222 dx11
= _mm_sub_pd(ix1
,jx1
);
223 dy11
= _mm_sub_pd(iy1
,jy1
);
224 dz11
= _mm_sub_pd(iz1
,jz1
);
225 dx12
= _mm_sub_pd(ix1
,jx2
);
226 dy12
= _mm_sub_pd(iy1
,jy2
);
227 dz12
= _mm_sub_pd(iz1
,jz2
);
228 dx20
= _mm_sub_pd(ix2
,jx0
);
229 dy20
= _mm_sub_pd(iy2
,jy0
);
230 dz20
= _mm_sub_pd(iz2
,jz0
);
231 dx21
= _mm_sub_pd(ix2
,jx1
);
232 dy21
= _mm_sub_pd(iy2
,jy1
);
233 dz21
= _mm_sub_pd(iz2
,jz1
);
234 dx22
= _mm_sub_pd(ix2
,jx2
);
235 dy22
= _mm_sub_pd(iy2
,jy2
);
236 dz22
= _mm_sub_pd(iz2
,jz2
);
238 /* Calculate squared distance and things based on it */
239 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
240 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
241 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
242 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
243 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
244 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
245 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
246 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
247 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
249 rinv00
= sse41_invsqrt_d(rsq00
);
250 rinv01
= sse41_invsqrt_d(rsq01
);
251 rinv02
= sse41_invsqrt_d(rsq02
);
252 rinv10
= sse41_invsqrt_d(rsq10
);
253 rinv11
= sse41_invsqrt_d(rsq11
);
254 rinv12
= sse41_invsqrt_d(rsq12
);
255 rinv20
= sse41_invsqrt_d(rsq20
);
256 rinv21
= sse41_invsqrt_d(rsq21
);
257 rinv22
= sse41_invsqrt_d(rsq22
);
259 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
260 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
261 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
262 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
263 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
264 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
265 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
266 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
267 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
269 fjx0
= _mm_setzero_pd();
270 fjy0
= _mm_setzero_pd();
271 fjz0
= _mm_setzero_pd();
272 fjx1
= _mm_setzero_pd();
273 fjy1
= _mm_setzero_pd();
274 fjz1
= _mm_setzero_pd();
275 fjx2
= _mm_setzero_pd();
276 fjy2
= _mm_setzero_pd();
277 fjz2
= _mm_setzero_pd();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 /* REACTION-FIELD ELECTROSTATICS */
284 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
285 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
287 /* LENNARD-JONES DISPERSION/REPULSION */
289 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
290 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
291 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
292 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
293 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
295 /* Update potential sum for this i atom from the interaction with this j atom. */
296 velecsum
= _mm_add_pd(velecsum
,velec
);
297 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
299 fscal
= _mm_add_pd(felec
,fvdw
);
301 /* Calculate temporary vectorial force */
302 tx
= _mm_mul_pd(fscal
,dx00
);
303 ty
= _mm_mul_pd(fscal
,dy00
);
304 tz
= _mm_mul_pd(fscal
,dz00
);
306 /* Update vectorial force */
307 fix0
= _mm_add_pd(fix0
,tx
);
308 fiy0
= _mm_add_pd(fiy0
,ty
);
309 fiz0
= _mm_add_pd(fiz0
,tz
);
311 fjx0
= _mm_add_pd(fjx0
,tx
);
312 fjy0
= _mm_add_pd(fjy0
,ty
);
313 fjz0
= _mm_add_pd(fjz0
,tz
);
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 /* REACTION-FIELD ELECTROSTATICS */
320 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_add_pd(rinv01
,_mm_mul_pd(krf
,rsq01
)),crf
));
321 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
323 /* Update potential sum for this i atom from the interaction with this j atom. */
324 velecsum
= _mm_add_pd(velecsum
,velec
);
328 /* Calculate temporary vectorial force */
329 tx
= _mm_mul_pd(fscal
,dx01
);
330 ty
= _mm_mul_pd(fscal
,dy01
);
331 tz
= _mm_mul_pd(fscal
,dz01
);
333 /* Update vectorial force */
334 fix0
= _mm_add_pd(fix0
,tx
);
335 fiy0
= _mm_add_pd(fiy0
,ty
);
336 fiz0
= _mm_add_pd(fiz0
,tz
);
338 fjx1
= _mm_add_pd(fjx1
,tx
);
339 fjy1
= _mm_add_pd(fjy1
,ty
);
340 fjz1
= _mm_add_pd(fjz1
,tz
);
342 /**************************
343 * CALCULATE INTERACTIONS *
344 **************************/
346 /* REACTION-FIELD ELECTROSTATICS */
347 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_add_pd(rinv02
,_mm_mul_pd(krf
,rsq02
)),crf
));
348 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velecsum
= _mm_add_pd(velecsum
,velec
);
355 /* Calculate temporary vectorial force */
356 tx
= _mm_mul_pd(fscal
,dx02
);
357 ty
= _mm_mul_pd(fscal
,dy02
);
358 tz
= _mm_mul_pd(fscal
,dz02
);
360 /* Update vectorial force */
361 fix0
= _mm_add_pd(fix0
,tx
);
362 fiy0
= _mm_add_pd(fiy0
,ty
);
363 fiz0
= _mm_add_pd(fiz0
,tz
);
365 fjx2
= _mm_add_pd(fjx2
,tx
);
366 fjy2
= _mm_add_pd(fjy2
,ty
);
367 fjz2
= _mm_add_pd(fjz2
,tz
);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 /* REACTION-FIELD ELECTROSTATICS */
374 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
375 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velecsum
= _mm_add_pd(velecsum
,velec
);
382 /* Calculate temporary vectorial force */
383 tx
= _mm_mul_pd(fscal
,dx10
);
384 ty
= _mm_mul_pd(fscal
,dy10
);
385 tz
= _mm_mul_pd(fscal
,dz10
);
387 /* Update vectorial force */
388 fix1
= _mm_add_pd(fix1
,tx
);
389 fiy1
= _mm_add_pd(fiy1
,ty
);
390 fiz1
= _mm_add_pd(fiz1
,tz
);
392 fjx0
= _mm_add_pd(fjx0
,tx
);
393 fjy0
= _mm_add_pd(fjy0
,ty
);
394 fjz0
= _mm_add_pd(fjz0
,tz
);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 /* REACTION-FIELD ELECTROSTATICS */
401 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
402 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
404 /* Update potential sum for this i atom from the interaction with this j atom. */
405 velecsum
= _mm_add_pd(velecsum
,velec
);
409 /* Calculate temporary vectorial force */
410 tx
= _mm_mul_pd(fscal
,dx11
);
411 ty
= _mm_mul_pd(fscal
,dy11
);
412 tz
= _mm_mul_pd(fscal
,dz11
);
414 /* Update vectorial force */
415 fix1
= _mm_add_pd(fix1
,tx
);
416 fiy1
= _mm_add_pd(fiy1
,ty
);
417 fiz1
= _mm_add_pd(fiz1
,tz
);
419 fjx1
= _mm_add_pd(fjx1
,tx
);
420 fjy1
= _mm_add_pd(fjy1
,ty
);
421 fjz1
= _mm_add_pd(fjz1
,tz
);
423 /**************************
424 * CALCULATE INTERACTIONS *
425 **************************/
427 /* REACTION-FIELD ELECTROSTATICS */
428 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
429 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
431 /* Update potential sum for this i atom from the interaction with this j atom. */
432 velecsum
= _mm_add_pd(velecsum
,velec
);
436 /* Calculate temporary vectorial force */
437 tx
= _mm_mul_pd(fscal
,dx12
);
438 ty
= _mm_mul_pd(fscal
,dy12
);
439 tz
= _mm_mul_pd(fscal
,dz12
);
441 /* Update vectorial force */
442 fix1
= _mm_add_pd(fix1
,tx
);
443 fiy1
= _mm_add_pd(fiy1
,ty
);
444 fiz1
= _mm_add_pd(fiz1
,tz
);
446 fjx2
= _mm_add_pd(fjx2
,tx
);
447 fjy2
= _mm_add_pd(fjy2
,ty
);
448 fjz2
= _mm_add_pd(fjz2
,tz
);
450 /**************************
451 * CALCULATE INTERACTIONS *
452 **************************/
454 /* REACTION-FIELD ELECTROSTATICS */
455 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
456 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum
= _mm_add_pd(velecsum
,velec
);
463 /* Calculate temporary vectorial force */
464 tx
= _mm_mul_pd(fscal
,dx20
);
465 ty
= _mm_mul_pd(fscal
,dy20
);
466 tz
= _mm_mul_pd(fscal
,dz20
);
468 /* Update vectorial force */
469 fix2
= _mm_add_pd(fix2
,tx
);
470 fiy2
= _mm_add_pd(fiy2
,ty
);
471 fiz2
= _mm_add_pd(fiz2
,tz
);
473 fjx0
= _mm_add_pd(fjx0
,tx
);
474 fjy0
= _mm_add_pd(fjy0
,ty
);
475 fjz0
= _mm_add_pd(fjz0
,tz
);
477 /**************************
478 * CALCULATE INTERACTIONS *
479 **************************/
481 /* REACTION-FIELD ELECTROSTATICS */
482 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
483 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum
= _mm_add_pd(velecsum
,velec
);
490 /* Calculate temporary vectorial force */
491 tx
= _mm_mul_pd(fscal
,dx21
);
492 ty
= _mm_mul_pd(fscal
,dy21
);
493 tz
= _mm_mul_pd(fscal
,dz21
);
495 /* Update vectorial force */
496 fix2
= _mm_add_pd(fix2
,tx
);
497 fiy2
= _mm_add_pd(fiy2
,ty
);
498 fiz2
= _mm_add_pd(fiz2
,tz
);
500 fjx1
= _mm_add_pd(fjx1
,tx
);
501 fjy1
= _mm_add_pd(fjy1
,ty
);
502 fjz1
= _mm_add_pd(fjz1
,tz
);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 /* REACTION-FIELD ELECTROSTATICS */
509 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
510 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velecsum
= _mm_add_pd(velecsum
,velec
);
517 /* Calculate temporary vectorial force */
518 tx
= _mm_mul_pd(fscal
,dx22
);
519 ty
= _mm_mul_pd(fscal
,dy22
);
520 tz
= _mm_mul_pd(fscal
,dz22
);
522 /* Update vectorial force */
523 fix2
= _mm_add_pd(fix2
,tx
);
524 fiy2
= _mm_add_pd(fiy2
,ty
);
525 fiz2
= _mm_add_pd(fiz2
,tz
);
527 fjx2
= _mm_add_pd(fjx2
,tx
);
528 fjy2
= _mm_add_pd(fjy2
,ty
);
529 fjz2
= _mm_add_pd(fjz2
,tz
);
531 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
533 /* Inner loop uses 300 flops */
540 j_coord_offsetA
= DIM
*jnrA
;
542 /* load j atom coordinates */
543 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
544 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
546 /* Calculate displacement vector */
547 dx00
= _mm_sub_pd(ix0
,jx0
);
548 dy00
= _mm_sub_pd(iy0
,jy0
);
549 dz00
= _mm_sub_pd(iz0
,jz0
);
550 dx01
= _mm_sub_pd(ix0
,jx1
);
551 dy01
= _mm_sub_pd(iy0
,jy1
);
552 dz01
= _mm_sub_pd(iz0
,jz1
);
553 dx02
= _mm_sub_pd(ix0
,jx2
);
554 dy02
= _mm_sub_pd(iy0
,jy2
);
555 dz02
= _mm_sub_pd(iz0
,jz2
);
556 dx10
= _mm_sub_pd(ix1
,jx0
);
557 dy10
= _mm_sub_pd(iy1
,jy0
);
558 dz10
= _mm_sub_pd(iz1
,jz0
);
559 dx11
= _mm_sub_pd(ix1
,jx1
);
560 dy11
= _mm_sub_pd(iy1
,jy1
);
561 dz11
= _mm_sub_pd(iz1
,jz1
);
562 dx12
= _mm_sub_pd(ix1
,jx2
);
563 dy12
= _mm_sub_pd(iy1
,jy2
);
564 dz12
= _mm_sub_pd(iz1
,jz2
);
565 dx20
= _mm_sub_pd(ix2
,jx0
);
566 dy20
= _mm_sub_pd(iy2
,jy0
);
567 dz20
= _mm_sub_pd(iz2
,jz0
);
568 dx21
= _mm_sub_pd(ix2
,jx1
);
569 dy21
= _mm_sub_pd(iy2
,jy1
);
570 dz21
= _mm_sub_pd(iz2
,jz1
);
571 dx22
= _mm_sub_pd(ix2
,jx2
);
572 dy22
= _mm_sub_pd(iy2
,jy2
);
573 dz22
= _mm_sub_pd(iz2
,jz2
);
575 /* Calculate squared distance and things based on it */
576 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
577 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
578 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
579 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
580 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
581 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
582 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
583 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
584 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
586 rinv00
= sse41_invsqrt_d(rsq00
);
587 rinv01
= sse41_invsqrt_d(rsq01
);
588 rinv02
= sse41_invsqrt_d(rsq02
);
589 rinv10
= sse41_invsqrt_d(rsq10
);
590 rinv11
= sse41_invsqrt_d(rsq11
);
591 rinv12
= sse41_invsqrt_d(rsq12
);
592 rinv20
= sse41_invsqrt_d(rsq20
);
593 rinv21
= sse41_invsqrt_d(rsq21
);
594 rinv22
= sse41_invsqrt_d(rsq22
);
596 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
597 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
598 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
599 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
600 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
601 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
602 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
603 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
604 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
606 fjx0
= _mm_setzero_pd();
607 fjy0
= _mm_setzero_pd();
608 fjz0
= _mm_setzero_pd();
609 fjx1
= _mm_setzero_pd();
610 fjy1
= _mm_setzero_pd();
611 fjz1
= _mm_setzero_pd();
612 fjx2
= _mm_setzero_pd();
613 fjy2
= _mm_setzero_pd();
614 fjz2
= _mm_setzero_pd();
616 /**************************
617 * CALCULATE INTERACTIONS *
618 **************************/
620 /* REACTION-FIELD ELECTROSTATICS */
621 velec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_add_pd(rinv00
,_mm_mul_pd(krf
,rsq00
)),crf
));
622 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
624 /* LENNARD-JONES DISPERSION/REPULSION */
626 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
627 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
628 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
629 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
630 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
634 velecsum
= _mm_add_pd(velecsum
,velec
);
635 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
636 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
638 fscal
= _mm_add_pd(felec
,fvdw
);
640 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
642 /* Calculate temporary vectorial force */
643 tx
= _mm_mul_pd(fscal
,dx00
);
644 ty
= _mm_mul_pd(fscal
,dy00
);
645 tz
= _mm_mul_pd(fscal
,dz00
);
647 /* Update vectorial force */
648 fix0
= _mm_add_pd(fix0
,tx
);
649 fiy0
= _mm_add_pd(fiy0
,ty
);
650 fiz0
= _mm_add_pd(fiz0
,tz
);
652 fjx0
= _mm_add_pd(fjx0
,tx
);
653 fjy0
= _mm_add_pd(fjy0
,ty
);
654 fjz0
= _mm_add_pd(fjz0
,tz
);
656 /**************************
657 * CALCULATE INTERACTIONS *
658 **************************/
660 /* REACTION-FIELD ELECTROSTATICS */
661 velec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_add_pd(rinv01
,_mm_mul_pd(krf
,rsq01
)),crf
));
662 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
664 /* Update potential sum for this i atom from the interaction with this j atom. */
665 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
666 velecsum
= _mm_add_pd(velecsum
,velec
);
670 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
672 /* Calculate temporary vectorial force */
673 tx
= _mm_mul_pd(fscal
,dx01
);
674 ty
= _mm_mul_pd(fscal
,dy01
);
675 tz
= _mm_mul_pd(fscal
,dz01
);
677 /* Update vectorial force */
678 fix0
= _mm_add_pd(fix0
,tx
);
679 fiy0
= _mm_add_pd(fiy0
,ty
);
680 fiz0
= _mm_add_pd(fiz0
,tz
);
682 fjx1
= _mm_add_pd(fjx1
,tx
);
683 fjy1
= _mm_add_pd(fjy1
,ty
);
684 fjz1
= _mm_add_pd(fjz1
,tz
);
686 /**************************
687 * CALCULATE INTERACTIONS *
688 **************************/
690 /* REACTION-FIELD ELECTROSTATICS */
691 velec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_add_pd(rinv02
,_mm_mul_pd(krf
,rsq02
)),crf
));
692 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
696 velecsum
= _mm_add_pd(velecsum
,velec
);
700 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
702 /* Calculate temporary vectorial force */
703 tx
= _mm_mul_pd(fscal
,dx02
);
704 ty
= _mm_mul_pd(fscal
,dy02
);
705 tz
= _mm_mul_pd(fscal
,dz02
);
707 /* Update vectorial force */
708 fix0
= _mm_add_pd(fix0
,tx
);
709 fiy0
= _mm_add_pd(fiy0
,ty
);
710 fiz0
= _mm_add_pd(fiz0
,tz
);
712 fjx2
= _mm_add_pd(fjx2
,tx
);
713 fjy2
= _mm_add_pd(fjy2
,ty
);
714 fjz2
= _mm_add_pd(fjz2
,tz
);
716 /**************************
717 * CALCULATE INTERACTIONS *
718 **************************/
720 /* REACTION-FIELD ELECTROSTATICS */
721 velec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_add_pd(rinv10
,_mm_mul_pd(krf
,rsq10
)),crf
));
722 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
724 /* Update potential sum for this i atom from the interaction with this j atom. */
725 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
726 velecsum
= _mm_add_pd(velecsum
,velec
);
730 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
732 /* Calculate temporary vectorial force */
733 tx
= _mm_mul_pd(fscal
,dx10
);
734 ty
= _mm_mul_pd(fscal
,dy10
);
735 tz
= _mm_mul_pd(fscal
,dz10
);
737 /* Update vectorial force */
738 fix1
= _mm_add_pd(fix1
,tx
);
739 fiy1
= _mm_add_pd(fiy1
,ty
);
740 fiz1
= _mm_add_pd(fiz1
,tz
);
742 fjx0
= _mm_add_pd(fjx0
,tx
);
743 fjy0
= _mm_add_pd(fjy0
,ty
);
744 fjz0
= _mm_add_pd(fjz0
,tz
);
746 /**************************
747 * CALCULATE INTERACTIONS *
748 **************************/
750 /* REACTION-FIELD ELECTROSTATICS */
751 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_add_pd(rinv11
,_mm_mul_pd(krf
,rsq11
)),crf
));
752 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
754 /* Update potential sum for this i atom from the interaction with this j atom. */
755 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
756 velecsum
= _mm_add_pd(velecsum
,velec
);
760 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
762 /* Calculate temporary vectorial force */
763 tx
= _mm_mul_pd(fscal
,dx11
);
764 ty
= _mm_mul_pd(fscal
,dy11
);
765 tz
= _mm_mul_pd(fscal
,dz11
);
767 /* Update vectorial force */
768 fix1
= _mm_add_pd(fix1
,tx
);
769 fiy1
= _mm_add_pd(fiy1
,ty
);
770 fiz1
= _mm_add_pd(fiz1
,tz
);
772 fjx1
= _mm_add_pd(fjx1
,tx
);
773 fjy1
= _mm_add_pd(fjy1
,ty
);
774 fjz1
= _mm_add_pd(fjz1
,tz
);
776 /**************************
777 * CALCULATE INTERACTIONS *
778 **************************/
780 /* REACTION-FIELD ELECTROSTATICS */
781 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_add_pd(rinv12
,_mm_mul_pd(krf
,rsq12
)),crf
));
782 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
784 /* Update potential sum for this i atom from the interaction with this j atom. */
785 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
786 velecsum
= _mm_add_pd(velecsum
,velec
);
790 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
792 /* Calculate temporary vectorial force */
793 tx
= _mm_mul_pd(fscal
,dx12
);
794 ty
= _mm_mul_pd(fscal
,dy12
);
795 tz
= _mm_mul_pd(fscal
,dz12
);
797 /* Update vectorial force */
798 fix1
= _mm_add_pd(fix1
,tx
);
799 fiy1
= _mm_add_pd(fiy1
,ty
);
800 fiz1
= _mm_add_pd(fiz1
,tz
);
802 fjx2
= _mm_add_pd(fjx2
,tx
);
803 fjy2
= _mm_add_pd(fjy2
,ty
);
804 fjz2
= _mm_add_pd(fjz2
,tz
);
806 /**************************
807 * CALCULATE INTERACTIONS *
808 **************************/
810 /* REACTION-FIELD ELECTROSTATICS */
811 velec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_add_pd(rinv20
,_mm_mul_pd(krf
,rsq20
)),crf
));
812 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
814 /* Update potential sum for this i atom from the interaction with this j atom. */
815 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
816 velecsum
= _mm_add_pd(velecsum
,velec
);
820 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
822 /* Calculate temporary vectorial force */
823 tx
= _mm_mul_pd(fscal
,dx20
);
824 ty
= _mm_mul_pd(fscal
,dy20
);
825 tz
= _mm_mul_pd(fscal
,dz20
);
827 /* Update vectorial force */
828 fix2
= _mm_add_pd(fix2
,tx
);
829 fiy2
= _mm_add_pd(fiy2
,ty
);
830 fiz2
= _mm_add_pd(fiz2
,tz
);
832 fjx0
= _mm_add_pd(fjx0
,tx
);
833 fjy0
= _mm_add_pd(fjy0
,ty
);
834 fjz0
= _mm_add_pd(fjz0
,tz
);
836 /**************************
837 * CALCULATE INTERACTIONS *
838 **************************/
840 /* REACTION-FIELD ELECTROSTATICS */
841 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_add_pd(rinv21
,_mm_mul_pd(krf
,rsq21
)),crf
));
842 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
844 /* Update potential sum for this i atom from the interaction with this j atom. */
845 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
846 velecsum
= _mm_add_pd(velecsum
,velec
);
850 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
852 /* Calculate temporary vectorial force */
853 tx
= _mm_mul_pd(fscal
,dx21
);
854 ty
= _mm_mul_pd(fscal
,dy21
);
855 tz
= _mm_mul_pd(fscal
,dz21
);
857 /* Update vectorial force */
858 fix2
= _mm_add_pd(fix2
,tx
);
859 fiy2
= _mm_add_pd(fiy2
,ty
);
860 fiz2
= _mm_add_pd(fiz2
,tz
);
862 fjx1
= _mm_add_pd(fjx1
,tx
);
863 fjy1
= _mm_add_pd(fjy1
,ty
);
864 fjz1
= _mm_add_pd(fjz1
,tz
);
866 /**************************
867 * CALCULATE INTERACTIONS *
868 **************************/
870 /* REACTION-FIELD ELECTROSTATICS */
871 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_add_pd(rinv22
,_mm_mul_pd(krf
,rsq22
)),crf
));
872 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
874 /* Update potential sum for this i atom from the interaction with this j atom. */
875 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
876 velecsum
= _mm_add_pd(velecsum
,velec
);
880 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
882 /* Calculate temporary vectorial force */
883 tx
= _mm_mul_pd(fscal
,dx22
);
884 ty
= _mm_mul_pd(fscal
,dy22
);
885 tz
= _mm_mul_pd(fscal
,dz22
);
887 /* Update vectorial force */
888 fix2
= _mm_add_pd(fix2
,tx
);
889 fiy2
= _mm_add_pd(fiy2
,ty
);
890 fiz2
= _mm_add_pd(fiz2
,tz
);
892 fjx2
= _mm_add_pd(fjx2
,tx
);
893 fjy2
= _mm_add_pd(fjy2
,ty
);
894 fjz2
= _mm_add_pd(fjz2
,tz
);
896 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
898 /* Inner loop uses 300 flops */
901 /* End of innermost loop */
903 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
904 f
+i_coord_offset
,fshift
+i_shift_offset
);
907 /* Update potential energies */
908 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
909 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
911 /* Increment number of inner iterations */
912 inneriter
+= j_index_end
- j_index_start
;
914 /* Outer loop uses 20 flops */
917 /* Increment number of outer iterations */
920 /* Update outer/inner flops */
922 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*300);
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_double
 * Electrostatics interaction: ReactionField
 * VdW interaction:            LennardJones
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        Force
 */
932 nb_kernel_ElecRF_VdwLJ_GeomW3W3_F_sse4_1_double
933 (t_nblist
* gmx_restrict nlist
,
934 rvec
* gmx_restrict xx
,
935 rvec
* gmx_restrict ff
,
936 struct t_forcerec
* gmx_restrict fr
,
937 t_mdatoms
* gmx_restrict mdatoms
,
938 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
939 t_nrnb
* gmx_restrict nrnb
)
941 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
942 * just 0 for non-waters.
943 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
944 * jnr indices corresponding to data put in the four positions in the SIMD register.
946 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
947 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
949 int j_coord_offsetA
,j_coord_offsetB
;
950 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
952 real
*shiftvec
,*fshift
,*x
,*f
;
953 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
955 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
957 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
959 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
960 int vdwjidx0A
,vdwjidx0B
;
961 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
962 int vdwjidx1A
,vdwjidx1B
;
963 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
964 int vdwjidx2A
,vdwjidx2B
;
965 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
966 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
967 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
968 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
969 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
970 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
971 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
972 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
973 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
974 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
975 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
978 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
981 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
982 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
983 __m128d dummy_mask
,cutoff_mask
;
984 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
985 __m128d one
= _mm_set1_pd(1.0);
986 __m128d two
= _mm_set1_pd(2.0);
992 jindex
= nlist
->jindex
;
994 shiftidx
= nlist
->shift
;
996 shiftvec
= fr
->shift_vec
[0];
997 fshift
= fr
->fshift
[0];
998 facel
= _mm_set1_pd(fr
->ic
->epsfac
);
999 charge
= mdatoms
->chargeA
;
1000 krf
= _mm_set1_pd(fr
->ic
->k_rf
);
1001 krf2
= _mm_set1_pd(fr
->ic
->k_rf
*2.0);
1002 crf
= _mm_set1_pd(fr
->ic
->c_rf
);
1003 nvdwtype
= fr
->ntype
;
1004 vdwparam
= fr
->nbfp
;
1005 vdwtype
= mdatoms
->typeA
;
1007 /* Set up water-specific parameters */
1008 inr
= nlist
->iinr
[0];
1009 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1010 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1011 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1012 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1014 jq0
= _mm_set1_pd(charge
[inr
+0]);
1015 jq1
= _mm_set1_pd(charge
[inr
+1]);
1016 jq2
= _mm_set1_pd(charge
[inr
+2]);
1017 vdwjidx0A
= 2*vdwtype
[inr
+0];
1018 qq00
= _mm_mul_pd(iq0
,jq0
);
1019 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1020 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1021 qq01
= _mm_mul_pd(iq0
,jq1
);
1022 qq02
= _mm_mul_pd(iq0
,jq2
);
1023 qq10
= _mm_mul_pd(iq1
,jq0
);
1024 qq11
= _mm_mul_pd(iq1
,jq1
);
1025 qq12
= _mm_mul_pd(iq1
,jq2
);
1026 qq20
= _mm_mul_pd(iq2
,jq0
);
1027 qq21
= _mm_mul_pd(iq2
,jq1
);
1028 qq22
= _mm_mul_pd(iq2
,jq2
);
1030 /* Pre-set the j coordinate offsets to zero to avoid compiler warnings */
1032 j_coord_offsetA
= 0;
1033 j_coord_offsetB
= 0;
1038 /* Start outer loop over neighborlists */
1039 for(iidx
=0; iidx
<nri
; iidx
++)
1041 /* Load shift vector for this list */
1042 i_shift_offset
= DIM
*shiftidx
[iidx
];
1044 /* Load limits for loop over neighbors */
1045 j_index_start
= jindex
[iidx
];
1046 j_index_end
= jindex
[iidx
+1];
1048 /* Get outer coordinate index */
1050 i_coord_offset
= DIM
*inr
;
1052 /* Load i particle coords and add shift vector */
1053 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1054 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1056 fix0
= _mm_setzero_pd();
1057 fiy0
= _mm_setzero_pd();
1058 fiz0
= _mm_setzero_pd();
1059 fix1
= _mm_setzero_pd();
1060 fiy1
= _mm_setzero_pd();
1061 fiz1
= _mm_setzero_pd();
1062 fix2
= _mm_setzero_pd();
1063 fiy2
= _mm_setzero_pd();
1064 fiz2
= _mm_setzero_pd();
1066 /* Start inner kernel loop */
1067 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1070 /* Get j neighbor index, and coordinate index */
1072 jnrB
= jjnr
[jidx
+1];
1073 j_coord_offsetA
= DIM
*jnrA
;
1074 j_coord_offsetB
= DIM
*jnrB
;
1076 /* load j atom coordinates */
1077 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1078 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1080 /* Calculate displacement vector */
1081 dx00
= _mm_sub_pd(ix0
,jx0
);
1082 dy00
= _mm_sub_pd(iy0
,jy0
);
1083 dz00
= _mm_sub_pd(iz0
,jz0
);
1084 dx01
= _mm_sub_pd(ix0
,jx1
);
1085 dy01
= _mm_sub_pd(iy0
,jy1
);
1086 dz01
= _mm_sub_pd(iz0
,jz1
);
1087 dx02
= _mm_sub_pd(ix0
,jx2
);
1088 dy02
= _mm_sub_pd(iy0
,jy2
);
1089 dz02
= _mm_sub_pd(iz0
,jz2
);
1090 dx10
= _mm_sub_pd(ix1
,jx0
);
1091 dy10
= _mm_sub_pd(iy1
,jy0
);
1092 dz10
= _mm_sub_pd(iz1
,jz0
);
1093 dx11
= _mm_sub_pd(ix1
,jx1
);
1094 dy11
= _mm_sub_pd(iy1
,jy1
);
1095 dz11
= _mm_sub_pd(iz1
,jz1
);
1096 dx12
= _mm_sub_pd(ix1
,jx2
);
1097 dy12
= _mm_sub_pd(iy1
,jy2
);
1098 dz12
= _mm_sub_pd(iz1
,jz2
);
1099 dx20
= _mm_sub_pd(ix2
,jx0
);
1100 dy20
= _mm_sub_pd(iy2
,jy0
);
1101 dz20
= _mm_sub_pd(iz2
,jz0
);
1102 dx21
= _mm_sub_pd(ix2
,jx1
);
1103 dy21
= _mm_sub_pd(iy2
,jy1
);
1104 dz21
= _mm_sub_pd(iz2
,jz1
);
1105 dx22
= _mm_sub_pd(ix2
,jx2
);
1106 dy22
= _mm_sub_pd(iy2
,jy2
);
1107 dz22
= _mm_sub_pd(iz2
,jz2
);
1109 /* Calculate squared distance and things based on it */
1110 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1111 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1112 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1113 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1114 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1115 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1116 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1117 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1118 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1120 rinv00
= sse41_invsqrt_d(rsq00
);
1121 rinv01
= sse41_invsqrt_d(rsq01
);
1122 rinv02
= sse41_invsqrt_d(rsq02
);
1123 rinv10
= sse41_invsqrt_d(rsq10
);
1124 rinv11
= sse41_invsqrt_d(rsq11
);
1125 rinv12
= sse41_invsqrt_d(rsq12
);
1126 rinv20
= sse41_invsqrt_d(rsq20
);
1127 rinv21
= sse41_invsqrt_d(rsq21
);
1128 rinv22
= sse41_invsqrt_d(rsq22
);
1130 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1131 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1132 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1133 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1134 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1135 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1136 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1137 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1138 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1140 fjx0
= _mm_setzero_pd();
1141 fjy0
= _mm_setzero_pd();
1142 fjz0
= _mm_setzero_pd();
1143 fjx1
= _mm_setzero_pd();
1144 fjy1
= _mm_setzero_pd();
1145 fjz1
= _mm_setzero_pd();
1146 fjx2
= _mm_setzero_pd();
1147 fjy2
= _mm_setzero_pd();
1148 fjz2
= _mm_setzero_pd();
1150 /**************************
1151 * CALCULATE INTERACTIONS *
1152 **************************/
1154 /* REACTION-FIELD ELECTROSTATICS */
1155 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
1157 /* LENNARD-JONES DISPERSION/REPULSION */
1159 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1160 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1162 fscal
= _mm_add_pd(felec
,fvdw
);
1164 /* Calculate temporary vectorial force */
1165 tx
= _mm_mul_pd(fscal
,dx00
);
1166 ty
= _mm_mul_pd(fscal
,dy00
);
1167 tz
= _mm_mul_pd(fscal
,dz00
);
1169 /* Update vectorial force */
1170 fix0
= _mm_add_pd(fix0
,tx
);
1171 fiy0
= _mm_add_pd(fiy0
,ty
);
1172 fiz0
= _mm_add_pd(fiz0
,tz
);
1174 fjx0
= _mm_add_pd(fjx0
,tx
);
1175 fjy0
= _mm_add_pd(fjy0
,ty
);
1176 fjz0
= _mm_add_pd(fjz0
,tz
);
1178 /**************************
1179 * CALCULATE INTERACTIONS *
1180 **************************/
1182 /* REACTION-FIELD ELECTROSTATICS */
1183 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
1187 /* Calculate temporary vectorial force */
1188 tx
= _mm_mul_pd(fscal
,dx01
);
1189 ty
= _mm_mul_pd(fscal
,dy01
);
1190 tz
= _mm_mul_pd(fscal
,dz01
);
1192 /* Update vectorial force */
1193 fix0
= _mm_add_pd(fix0
,tx
);
1194 fiy0
= _mm_add_pd(fiy0
,ty
);
1195 fiz0
= _mm_add_pd(fiz0
,tz
);
1197 fjx1
= _mm_add_pd(fjx1
,tx
);
1198 fjy1
= _mm_add_pd(fjy1
,ty
);
1199 fjz1
= _mm_add_pd(fjz1
,tz
);
1201 /**************************
1202 * CALCULATE INTERACTIONS *
1203 **************************/
1205 /* REACTION-FIELD ELECTROSTATICS */
1206 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
1210 /* Calculate temporary vectorial force */
1211 tx
= _mm_mul_pd(fscal
,dx02
);
1212 ty
= _mm_mul_pd(fscal
,dy02
);
1213 tz
= _mm_mul_pd(fscal
,dz02
);
1215 /* Update vectorial force */
1216 fix0
= _mm_add_pd(fix0
,tx
);
1217 fiy0
= _mm_add_pd(fiy0
,ty
);
1218 fiz0
= _mm_add_pd(fiz0
,tz
);
1220 fjx2
= _mm_add_pd(fjx2
,tx
);
1221 fjy2
= _mm_add_pd(fjy2
,ty
);
1222 fjz2
= _mm_add_pd(fjz2
,tz
);
1224 /**************************
1225 * CALCULATE INTERACTIONS *
1226 **************************/
1228 /* REACTION-FIELD ELECTROSTATICS */
1229 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
1233 /* Calculate temporary vectorial force */
1234 tx
= _mm_mul_pd(fscal
,dx10
);
1235 ty
= _mm_mul_pd(fscal
,dy10
);
1236 tz
= _mm_mul_pd(fscal
,dz10
);
1238 /* Update vectorial force */
1239 fix1
= _mm_add_pd(fix1
,tx
);
1240 fiy1
= _mm_add_pd(fiy1
,ty
);
1241 fiz1
= _mm_add_pd(fiz1
,tz
);
1243 fjx0
= _mm_add_pd(fjx0
,tx
);
1244 fjy0
= _mm_add_pd(fjy0
,ty
);
1245 fjz0
= _mm_add_pd(fjz0
,tz
);
1247 /**************************
1248 * CALCULATE INTERACTIONS *
1249 **************************/
1251 /* REACTION-FIELD ELECTROSTATICS */
1252 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1256 /* Calculate temporary vectorial force */
1257 tx
= _mm_mul_pd(fscal
,dx11
);
1258 ty
= _mm_mul_pd(fscal
,dy11
);
1259 tz
= _mm_mul_pd(fscal
,dz11
);
1261 /* Update vectorial force */
1262 fix1
= _mm_add_pd(fix1
,tx
);
1263 fiy1
= _mm_add_pd(fiy1
,ty
);
1264 fiz1
= _mm_add_pd(fiz1
,tz
);
1266 fjx1
= _mm_add_pd(fjx1
,tx
);
1267 fjy1
= _mm_add_pd(fjy1
,ty
);
1268 fjz1
= _mm_add_pd(fjz1
,tz
);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 /* REACTION-FIELD ELECTROSTATICS */
1275 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1279 /* Calculate temporary vectorial force */
1280 tx
= _mm_mul_pd(fscal
,dx12
);
1281 ty
= _mm_mul_pd(fscal
,dy12
);
1282 tz
= _mm_mul_pd(fscal
,dz12
);
1284 /* Update vectorial force */
1285 fix1
= _mm_add_pd(fix1
,tx
);
1286 fiy1
= _mm_add_pd(fiy1
,ty
);
1287 fiz1
= _mm_add_pd(fiz1
,tz
);
1289 fjx2
= _mm_add_pd(fjx2
,tx
);
1290 fjy2
= _mm_add_pd(fjy2
,ty
);
1291 fjz2
= _mm_add_pd(fjz2
,tz
);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* REACTION-FIELD ELECTROSTATICS */
1298 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
1302 /* Calculate temporary vectorial force */
1303 tx
= _mm_mul_pd(fscal
,dx20
);
1304 ty
= _mm_mul_pd(fscal
,dy20
);
1305 tz
= _mm_mul_pd(fscal
,dz20
);
1307 /* Update vectorial force */
1308 fix2
= _mm_add_pd(fix2
,tx
);
1309 fiy2
= _mm_add_pd(fiy2
,ty
);
1310 fiz2
= _mm_add_pd(fiz2
,tz
);
1312 fjx0
= _mm_add_pd(fjx0
,tx
);
1313 fjy0
= _mm_add_pd(fjy0
,ty
);
1314 fjz0
= _mm_add_pd(fjz0
,tz
);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 /* REACTION-FIELD ELECTROSTATICS */
1321 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
1325 /* Calculate temporary vectorial force */
1326 tx
= _mm_mul_pd(fscal
,dx21
);
1327 ty
= _mm_mul_pd(fscal
,dy21
);
1328 tz
= _mm_mul_pd(fscal
,dz21
);
1330 /* Update vectorial force */
1331 fix2
= _mm_add_pd(fix2
,tx
);
1332 fiy2
= _mm_add_pd(fiy2
,ty
);
1333 fiz2
= _mm_add_pd(fiz2
,tz
);
1335 fjx1
= _mm_add_pd(fjx1
,tx
);
1336 fjy1
= _mm_add_pd(fjy1
,ty
);
1337 fjz1
= _mm_add_pd(fjz1
,tz
);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* REACTION-FIELD ELECTROSTATICS */
1344 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1348 /* Calculate temporary vectorial force */
1349 tx
= _mm_mul_pd(fscal
,dx22
);
1350 ty
= _mm_mul_pd(fscal
,dy22
);
1351 tz
= _mm_mul_pd(fscal
,dz22
);
1353 /* Update vectorial force */
1354 fix2
= _mm_add_pd(fix2
,tx
);
1355 fiy2
= _mm_add_pd(fiy2
,ty
);
1356 fiz2
= _mm_add_pd(fiz2
,tz
);
1358 fjx2
= _mm_add_pd(fjx2
,tx
);
1359 fjy2
= _mm_add_pd(fjy2
,ty
);
1360 fjz2
= _mm_add_pd(fjz2
,tz
);
1362 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1364 /* Inner loop uses 250 flops */
1367 if(jidx
<j_index_end
)
1371 j_coord_offsetA
= DIM
*jnrA
;
1373 /* load j atom coordinates */
1374 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1375 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1377 /* Calculate displacement vector */
1378 dx00
= _mm_sub_pd(ix0
,jx0
);
1379 dy00
= _mm_sub_pd(iy0
,jy0
);
1380 dz00
= _mm_sub_pd(iz0
,jz0
);
1381 dx01
= _mm_sub_pd(ix0
,jx1
);
1382 dy01
= _mm_sub_pd(iy0
,jy1
);
1383 dz01
= _mm_sub_pd(iz0
,jz1
);
1384 dx02
= _mm_sub_pd(ix0
,jx2
);
1385 dy02
= _mm_sub_pd(iy0
,jy2
);
1386 dz02
= _mm_sub_pd(iz0
,jz2
);
1387 dx10
= _mm_sub_pd(ix1
,jx0
);
1388 dy10
= _mm_sub_pd(iy1
,jy0
);
1389 dz10
= _mm_sub_pd(iz1
,jz0
);
1390 dx11
= _mm_sub_pd(ix1
,jx1
);
1391 dy11
= _mm_sub_pd(iy1
,jy1
);
1392 dz11
= _mm_sub_pd(iz1
,jz1
);
1393 dx12
= _mm_sub_pd(ix1
,jx2
);
1394 dy12
= _mm_sub_pd(iy1
,jy2
);
1395 dz12
= _mm_sub_pd(iz1
,jz2
);
1396 dx20
= _mm_sub_pd(ix2
,jx0
);
1397 dy20
= _mm_sub_pd(iy2
,jy0
);
1398 dz20
= _mm_sub_pd(iz2
,jz0
);
1399 dx21
= _mm_sub_pd(ix2
,jx1
);
1400 dy21
= _mm_sub_pd(iy2
,jy1
);
1401 dz21
= _mm_sub_pd(iz2
,jz1
);
1402 dx22
= _mm_sub_pd(ix2
,jx2
);
1403 dy22
= _mm_sub_pd(iy2
,jy2
);
1404 dz22
= _mm_sub_pd(iz2
,jz2
);
1406 /* Calculate squared distance and things based on it */
1407 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1408 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1409 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1410 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1411 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1412 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1413 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1414 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1415 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1417 rinv00
= sse41_invsqrt_d(rsq00
);
1418 rinv01
= sse41_invsqrt_d(rsq01
);
1419 rinv02
= sse41_invsqrt_d(rsq02
);
1420 rinv10
= sse41_invsqrt_d(rsq10
);
1421 rinv11
= sse41_invsqrt_d(rsq11
);
1422 rinv12
= sse41_invsqrt_d(rsq12
);
1423 rinv20
= sse41_invsqrt_d(rsq20
);
1424 rinv21
= sse41_invsqrt_d(rsq21
);
1425 rinv22
= sse41_invsqrt_d(rsq22
);
1427 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1428 rinvsq01
= _mm_mul_pd(rinv01
,rinv01
);
1429 rinvsq02
= _mm_mul_pd(rinv02
,rinv02
);
1430 rinvsq10
= _mm_mul_pd(rinv10
,rinv10
);
1431 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1432 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1433 rinvsq20
= _mm_mul_pd(rinv20
,rinv20
);
1434 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1435 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1437 fjx0
= _mm_setzero_pd();
1438 fjy0
= _mm_setzero_pd();
1439 fjz0
= _mm_setzero_pd();
1440 fjx1
= _mm_setzero_pd();
1441 fjy1
= _mm_setzero_pd();
1442 fjz1
= _mm_setzero_pd();
1443 fjx2
= _mm_setzero_pd();
1444 fjy2
= _mm_setzero_pd();
1445 fjz2
= _mm_setzero_pd();
1447 /**************************
1448 * CALCULATE INTERACTIONS *
1449 **************************/
1451 /* REACTION-FIELD ELECTROSTATICS */
1452 felec
= _mm_mul_pd(qq00
,_mm_sub_pd(_mm_mul_pd(rinv00
,rinvsq00
),krf2
));
1454 /* LENNARD-JONES DISPERSION/REPULSION */
1456 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1457 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1459 fscal
= _mm_add_pd(felec
,fvdw
);
1461 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1463 /* Calculate temporary vectorial force */
1464 tx
= _mm_mul_pd(fscal
,dx00
);
1465 ty
= _mm_mul_pd(fscal
,dy00
);
1466 tz
= _mm_mul_pd(fscal
,dz00
);
1468 /* Update vectorial force */
1469 fix0
= _mm_add_pd(fix0
,tx
);
1470 fiy0
= _mm_add_pd(fiy0
,ty
);
1471 fiz0
= _mm_add_pd(fiz0
,tz
);
1473 fjx0
= _mm_add_pd(fjx0
,tx
);
1474 fjy0
= _mm_add_pd(fjy0
,ty
);
1475 fjz0
= _mm_add_pd(fjz0
,tz
);
1477 /**************************
1478 * CALCULATE INTERACTIONS *
1479 **************************/
1481 /* REACTION-FIELD ELECTROSTATICS */
1482 felec
= _mm_mul_pd(qq01
,_mm_sub_pd(_mm_mul_pd(rinv01
,rinvsq01
),krf2
));
1486 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1488 /* Calculate temporary vectorial force */
1489 tx
= _mm_mul_pd(fscal
,dx01
);
1490 ty
= _mm_mul_pd(fscal
,dy01
);
1491 tz
= _mm_mul_pd(fscal
,dz01
);
1493 /* Update vectorial force */
1494 fix0
= _mm_add_pd(fix0
,tx
);
1495 fiy0
= _mm_add_pd(fiy0
,ty
);
1496 fiz0
= _mm_add_pd(fiz0
,tz
);
1498 fjx1
= _mm_add_pd(fjx1
,tx
);
1499 fjy1
= _mm_add_pd(fjy1
,ty
);
1500 fjz1
= _mm_add_pd(fjz1
,tz
);
1502 /**************************
1503 * CALCULATE INTERACTIONS *
1504 **************************/
1506 /* REACTION-FIELD ELECTROSTATICS */
1507 felec
= _mm_mul_pd(qq02
,_mm_sub_pd(_mm_mul_pd(rinv02
,rinvsq02
),krf2
));
1511 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1513 /* Calculate temporary vectorial force */
1514 tx
= _mm_mul_pd(fscal
,dx02
);
1515 ty
= _mm_mul_pd(fscal
,dy02
);
1516 tz
= _mm_mul_pd(fscal
,dz02
);
1518 /* Update vectorial force */
1519 fix0
= _mm_add_pd(fix0
,tx
);
1520 fiy0
= _mm_add_pd(fiy0
,ty
);
1521 fiz0
= _mm_add_pd(fiz0
,tz
);
1523 fjx2
= _mm_add_pd(fjx2
,tx
);
1524 fjy2
= _mm_add_pd(fjy2
,ty
);
1525 fjz2
= _mm_add_pd(fjz2
,tz
);
1527 /**************************
1528 * CALCULATE INTERACTIONS *
1529 **************************/
1531 /* REACTION-FIELD ELECTROSTATICS */
1532 felec
= _mm_mul_pd(qq10
,_mm_sub_pd(_mm_mul_pd(rinv10
,rinvsq10
),krf2
));
1536 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1538 /* Calculate temporary vectorial force */
1539 tx
= _mm_mul_pd(fscal
,dx10
);
1540 ty
= _mm_mul_pd(fscal
,dy10
);
1541 tz
= _mm_mul_pd(fscal
,dz10
);
1543 /* Update vectorial force */
1544 fix1
= _mm_add_pd(fix1
,tx
);
1545 fiy1
= _mm_add_pd(fiy1
,ty
);
1546 fiz1
= _mm_add_pd(fiz1
,tz
);
1548 fjx0
= _mm_add_pd(fjx0
,tx
);
1549 fjy0
= _mm_add_pd(fjy0
,ty
);
1550 fjz0
= _mm_add_pd(fjz0
,tz
);
1552 /**************************
1553 * CALCULATE INTERACTIONS *
1554 **************************/
1556 /* REACTION-FIELD ELECTROSTATICS */
1557 felec
= _mm_mul_pd(qq11
,_mm_sub_pd(_mm_mul_pd(rinv11
,rinvsq11
),krf2
));
1561 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1563 /* Calculate temporary vectorial force */
1564 tx
= _mm_mul_pd(fscal
,dx11
);
1565 ty
= _mm_mul_pd(fscal
,dy11
);
1566 tz
= _mm_mul_pd(fscal
,dz11
);
1568 /* Update vectorial force */
1569 fix1
= _mm_add_pd(fix1
,tx
);
1570 fiy1
= _mm_add_pd(fiy1
,ty
);
1571 fiz1
= _mm_add_pd(fiz1
,tz
);
1573 fjx1
= _mm_add_pd(fjx1
,tx
);
1574 fjy1
= _mm_add_pd(fjy1
,ty
);
1575 fjz1
= _mm_add_pd(fjz1
,tz
);
1577 /**************************
1578 * CALCULATE INTERACTIONS *
1579 **************************/
1581 /* REACTION-FIELD ELECTROSTATICS */
1582 felec
= _mm_mul_pd(qq12
,_mm_sub_pd(_mm_mul_pd(rinv12
,rinvsq12
),krf2
));
1586 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1588 /* Calculate temporary vectorial force */
1589 tx
= _mm_mul_pd(fscal
,dx12
);
1590 ty
= _mm_mul_pd(fscal
,dy12
);
1591 tz
= _mm_mul_pd(fscal
,dz12
);
1593 /* Update vectorial force */
1594 fix1
= _mm_add_pd(fix1
,tx
);
1595 fiy1
= _mm_add_pd(fiy1
,ty
);
1596 fiz1
= _mm_add_pd(fiz1
,tz
);
1598 fjx2
= _mm_add_pd(fjx2
,tx
);
1599 fjy2
= _mm_add_pd(fjy2
,ty
);
1600 fjz2
= _mm_add_pd(fjz2
,tz
);
1602 /**************************
1603 * CALCULATE INTERACTIONS *
1604 **************************/
1606 /* REACTION-FIELD ELECTROSTATICS */
1607 felec
= _mm_mul_pd(qq20
,_mm_sub_pd(_mm_mul_pd(rinv20
,rinvsq20
),krf2
));
1611 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1613 /* Calculate temporary vectorial force */
1614 tx
= _mm_mul_pd(fscal
,dx20
);
1615 ty
= _mm_mul_pd(fscal
,dy20
);
1616 tz
= _mm_mul_pd(fscal
,dz20
);
1618 /* Update vectorial force */
1619 fix2
= _mm_add_pd(fix2
,tx
);
1620 fiy2
= _mm_add_pd(fiy2
,ty
);
1621 fiz2
= _mm_add_pd(fiz2
,tz
);
1623 fjx0
= _mm_add_pd(fjx0
,tx
);
1624 fjy0
= _mm_add_pd(fjy0
,ty
);
1625 fjz0
= _mm_add_pd(fjz0
,tz
);
1627 /**************************
1628 * CALCULATE INTERACTIONS *
1629 **************************/
1631 /* REACTION-FIELD ELECTROSTATICS */
1632 felec
= _mm_mul_pd(qq21
,_mm_sub_pd(_mm_mul_pd(rinv21
,rinvsq21
),krf2
));
1636 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1638 /* Calculate temporary vectorial force */
1639 tx
= _mm_mul_pd(fscal
,dx21
);
1640 ty
= _mm_mul_pd(fscal
,dy21
);
1641 tz
= _mm_mul_pd(fscal
,dz21
);
1643 /* Update vectorial force */
1644 fix2
= _mm_add_pd(fix2
,tx
);
1645 fiy2
= _mm_add_pd(fiy2
,ty
);
1646 fiz2
= _mm_add_pd(fiz2
,tz
);
1648 fjx1
= _mm_add_pd(fjx1
,tx
);
1649 fjy1
= _mm_add_pd(fjy1
,ty
);
1650 fjz1
= _mm_add_pd(fjz1
,tz
);
1652 /**************************
1653 * CALCULATE INTERACTIONS *
1654 **************************/
1656 /* REACTION-FIELD ELECTROSTATICS */
1657 felec
= _mm_mul_pd(qq22
,_mm_sub_pd(_mm_mul_pd(rinv22
,rinvsq22
),krf2
));
1661 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1663 /* Calculate temporary vectorial force */
1664 tx
= _mm_mul_pd(fscal
,dx22
);
1665 ty
= _mm_mul_pd(fscal
,dy22
);
1666 tz
= _mm_mul_pd(fscal
,dz22
);
1668 /* Update vectorial force */
1669 fix2
= _mm_add_pd(fix2
,tx
);
1670 fiy2
= _mm_add_pd(fiy2
,ty
);
1671 fiz2
= _mm_add_pd(fiz2
,tz
);
1673 fjx2
= _mm_add_pd(fjx2
,tx
);
1674 fjy2
= _mm_add_pd(fjy2
,ty
);
1675 fjz2
= _mm_add_pd(fjz2
,tz
);
1677 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1679 /* Inner loop uses 250 flops */
1682 /* End of innermost loop */
1684 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1685 f
+i_coord_offset
,fshift
+i_shift_offset
);
1687 /* Increment number of inner iterations */
1688 inneriter
+= j_index_end
- j_index_start
;
1690 /* Outer loop uses 18 flops */
1693 /* Increment number of outer iterations */
1696 /* Update outer/inner flops */
1698 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*250);