2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
53 * Electrostatics interaction: Coulomb
54 * VdW interaction: LennardJones
55 * Geometry: Water4-Water4
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the four positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
88 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
89 int vdwjidx0A
,vdwjidx0B
;
90 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
91 int vdwjidx1A
,vdwjidx1B
;
92 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
93 int vdwjidx2A
,vdwjidx2B
;
94 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
95 int vdwjidx3A
,vdwjidx3B
;
96 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
97 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
98 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
99 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
100 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
101 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
102 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
103 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
104 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
105 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
106 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
107 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
110 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
113 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
114 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
115 __m128d dummy_mask
,cutoff_mask
;
116 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117 __m128d one
= _mm_set1_pd(1.0);
118 __m128d two
= _mm_set1_pd(2.0);
124 jindex
= nlist
->jindex
;
126 shiftidx
= nlist
->shift
;
128 shiftvec
= fr
->shift_vec
[0];
129 fshift
= fr
->fshift
[0];
130 facel
= _mm_set1_pd(fr
->epsfac
);
131 charge
= mdatoms
->chargeA
;
132 nvdwtype
= fr
->ntype
;
134 vdwtype
= mdatoms
->typeA
;
136 /* Setup water-specific parameters */
137 inr
= nlist
->iinr
[0];
138 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
139 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
140 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
141 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
143 jq1
= _mm_set1_pd(charge
[inr
+1]);
144 jq2
= _mm_set1_pd(charge
[inr
+2]);
145 jq3
= _mm_set1_pd(charge
[inr
+3]);
146 vdwjidx0A
= 2*vdwtype
[inr
+0];
147 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
148 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
149 qq11
= _mm_mul_pd(iq1
,jq1
);
150 qq12
= _mm_mul_pd(iq1
,jq2
);
151 qq13
= _mm_mul_pd(iq1
,jq3
);
152 qq21
= _mm_mul_pd(iq2
,jq1
);
153 qq22
= _mm_mul_pd(iq2
,jq2
);
154 qq23
= _mm_mul_pd(iq2
,jq3
);
155 qq31
= _mm_mul_pd(iq3
,jq1
);
156 qq32
= _mm_mul_pd(iq3
,jq2
);
157 qq33
= _mm_mul_pd(iq3
,jq3
);
159 /* Avoid stupid compiler warnings */
167 /* Start outer loop over neighborlists */
168 for(iidx
=0; iidx
<nri
; iidx
++)
170 /* Load shift vector for this list */
171 i_shift_offset
= DIM
*shiftidx
[iidx
];
173 /* Load limits for loop over neighbors */
174 j_index_start
= jindex
[iidx
];
175 j_index_end
= jindex
[iidx
+1];
177 /* Get outer coordinate index */
179 i_coord_offset
= DIM
*inr
;
181 /* Load i particle coords and add shift vector */
182 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
183 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
185 fix0
= _mm_setzero_pd();
186 fiy0
= _mm_setzero_pd();
187 fiz0
= _mm_setzero_pd();
188 fix1
= _mm_setzero_pd();
189 fiy1
= _mm_setzero_pd();
190 fiz1
= _mm_setzero_pd();
191 fix2
= _mm_setzero_pd();
192 fiy2
= _mm_setzero_pd();
193 fiz2
= _mm_setzero_pd();
194 fix3
= _mm_setzero_pd();
195 fiy3
= _mm_setzero_pd();
196 fiz3
= _mm_setzero_pd();
198 /* Reset potential sums */
199 velecsum
= _mm_setzero_pd();
200 vvdwsum
= _mm_setzero_pd();
202 /* Start inner kernel loop */
203 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
206 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA
= DIM
*jnrA
;
210 j_coord_offsetB
= DIM
*jnrB
;
212 /* load j atom coordinates */
213 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
214 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
215 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
217 /* Calculate displacement vector */
218 dx00
= _mm_sub_pd(ix0
,jx0
);
219 dy00
= _mm_sub_pd(iy0
,jy0
);
220 dz00
= _mm_sub_pd(iz0
,jz0
);
221 dx11
= _mm_sub_pd(ix1
,jx1
);
222 dy11
= _mm_sub_pd(iy1
,jy1
);
223 dz11
= _mm_sub_pd(iz1
,jz1
);
224 dx12
= _mm_sub_pd(ix1
,jx2
);
225 dy12
= _mm_sub_pd(iy1
,jy2
);
226 dz12
= _mm_sub_pd(iz1
,jz2
);
227 dx13
= _mm_sub_pd(ix1
,jx3
);
228 dy13
= _mm_sub_pd(iy1
,jy3
);
229 dz13
= _mm_sub_pd(iz1
,jz3
);
230 dx21
= _mm_sub_pd(ix2
,jx1
);
231 dy21
= _mm_sub_pd(iy2
,jy1
);
232 dz21
= _mm_sub_pd(iz2
,jz1
);
233 dx22
= _mm_sub_pd(ix2
,jx2
);
234 dy22
= _mm_sub_pd(iy2
,jy2
);
235 dz22
= _mm_sub_pd(iz2
,jz2
);
236 dx23
= _mm_sub_pd(ix2
,jx3
);
237 dy23
= _mm_sub_pd(iy2
,jy3
);
238 dz23
= _mm_sub_pd(iz2
,jz3
);
239 dx31
= _mm_sub_pd(ix3
,jx1
);
240 dy31
= _mm_sub_pd(iy3
,jy1
);
241 dz31
= _mm_sub_pd(iz3
,jz1
);
242 dx32
= _mm_sub_pd(ix3
,jx2
);
243 dy32
= _mm_sub_pd(iy3
,jy2
);
244 dz32
= _mm_sub_pd(iz3
,jz2
);
245 dx33
= _mm_sub_pd(ix3
,jx3
);
246 dy33
= _mm_sub_pd(iy3
,jy3
);
247 dz33
= _mm_sub_pd(iz3
,jz3
);
249 /* Calculate squared distance and things based on it */
250 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
251 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
252 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
253 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
254 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
255 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
256 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
257 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
258 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
259 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
261 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
262 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
263 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
264 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
265 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
266 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
267 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
268 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
269 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
271 rinvsq00
= gmx_mm_inv_pd(rsq00
);
272 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
273 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
274 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
275 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
276 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
277 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
278 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
279 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
280 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
282 fjx0
= _mm_setzero_pd();
283 fjy0
= _mm_setzero_pd();
284 fjz0
= _mm_setzero_pd();
285 fjx1
= _mm_setzero_pd();
286 fjy1
= _mm_setzero_pd();
287 fjz1
= _mm_setzero_pd();
288 fjx2
= _mm_setzero_pd();
289 fjy2
= _mm_setzero_pd();
290 fjz2
= _mm_setzero_pd();
291 fjx3
= _mm_setzero_pd();
292 fjy3
= _mm_setzero_pd();
293 fjz3
= _mm_setzero_pd();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 /* LENNARD-JONES DISPERSION/REPULSION */
301 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
302 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
303 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
304 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
305 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
307 /* Update potential sum for this i atom from the interaction with this j atom. */
308 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
312 /* Calculate temporary vectorial force */
313 tx
= _mm_mul_pd(fscal
,dx00
);
314 ty
= _mm_mul_pd(fscal
,dy00
);
315 tz
= _mm_mul_pd(fscal
,dz00
);
317 /* Update vectorial force */
318 fix0
= _mm_add_pd(fix0
,tx
);
319 fiy0
= _mm_add_pd(fiy0
,ty
);
320 fiz0
= _mm_add_pd(fiz0
,tz
);
322 fjx0
= _mm_add_pd(fjx0
,tx
);
323 fjy0
= _mm_add_pd(fjy0
,ty
);
324 fjz0
= _mm_add_pd(fjz0
,tz
);
326 /**************************
327 * CALCULATE INTERACTIONS *
328 **************************/
330 /* COULOMB ELECTROSTATICS */
331 velec
= _mm_mul_pd(qq11
,rinv11
);
332 felec
= _mm_mul_pd(velec
,rinvsq11
);
334 /* Update potential sum for this i atom from the interaction with this j atom. */
335 velecsum
= _mm_add_pd(velecsum
,velec
);
339 /* Calculate temporary vectorial force */
340 tx
= _mm_mul_pd(fscal
,dx11
);
341 ty
= _mm_mul_pd(fscal
,dy11
);
342 tz
= _mm_mul_pd(fscal
,dz11
);
344 /* Update vectorial force */
345 fix1
= _mm_add_pd(fix1
,tx
);
346 fiy1
= _mm_add_pd(fiy1
,ty
);
347 fiz1
= _mm_add_pd(fiz1
,tz
);
349 fjx1
= _mm_add_pd(fjx1
,tx
);
350 fjy1
= _mm_add_pd(fjy1
,ty
);
351 fjz1
= _mm_add_pd(fjz1
,tz
);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 /* COULOMB ELECTROSTATICS */
358 velec
= _mm_mul_pd(qq12
,rinv12
);
359 felec
= _mm_mul_pd(velec
,rinvsq12
);
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velecsum
= _mm_add_pd(velecsum
,velec
);
366 /* Calculate temporary vectorial force */
367 tx
= _mm_mul_pd(fscal
,dx12
);
368 ty
= _mm_mul_pd(fscal
,dy12
);
369 tz
= _mm_mul_pd(fscal
,dz12
);
371 /* Update vectorial force */
372 fix1
= _mm_add_pd(fix1
,tx
);
373 fiy1
= _mm_add_pd(fiy1
,ty
);
374 fiz1
= _mm_add_pd(fiz1
,tz
);
376 fjx2
= _mm_add_pd(fjx2
,tx
);
377 fjy2
= _mm_add_pd(fjy2
,ty
);
378 fjz2
= _mm_add_pd(fjz2
,tz
);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 /* COULOMB ELECTROSTATICS */
385 velec
= _mm_mul_pd(qq13
,rinv13
);
386 felec
= _mm_mul_pd(velec
,rinvsq13
);
388 /* Update potential sum for this i atom from the interaction with this j atom. */
389 velecsum
= _mm_add_pd(velecsum
,velec
);
393 /* Calculate temporary vectorial force */
394 tx
= _mm_mul_pd(fscal
,dx13
);
395 ty
= _mm_mul_pd(fscal
,dy13
);
396 tz
= _mm_mul_pd(fscal
,dz13
);
398 /* Update vectorial force */
399 fix1
= _mm_add_pd(fix1
,tx
);
400 fiy1
= _mm_add_pd(fiy1
,ty
);
401 fiz1
= _mm_add_pd(fiz1
,tz
);
403 fjx3
= _mm_add_pd(fjx3
,tx
);
404 fjy3
= _mm_add_pd(fjy3
,ty
);
405 fjz3
= _mm_add_pd(fjz3
,tz
);
407 /**************************
408 * CALCULATE INTERACTIONS *
409 **************************/
411 /* COULOMB ELECTROSTATICS */
412 velec
= _mm_mul_pd(qq21
,rinv21
);
413 felec
= _mm_mul_pd(velec
,rinvsq21
);
415 /* Update potential sum for this i atom from the interaction with this j atom. */
416 velecsum
= _mm_add_pd(velecsum
,velec
);
420 /* Calculate temporary vectorial force */
421 tx
= _mm_mul_pd(fscal
,dx21
);
422 ty
= _mm_mul_pd(fscal
,dy21
);
423 tz
= _mm_mul_pd(fscal
,dz21
);
425 /* Update vectorial force */
426 fix2
= _mm_add_pd(fix2
,tx
);
427 fiy2
= _mm_add_pd(fiy2
,ty
);
428 fiz2
= _mm_add_pd(fiz2
,tz
);
430 fjx1
= _mm_add_pd(fjx1
,tx
);
431 fjy1
= _mm_add_pd(fjy1
,ty
);
432 fjz1
= _mm_add_pd(fjz1
,tz
);
434 /**************************
435 * CALCULATE INTERACTIONS *
436 **************************/
438 /* COULOMB ELECTROSTATICS */
439 velec
= _mm_mul_pd(qq22
,rinv22
);
440 felec
= _mm_mul_pd(velec
,rinvsq22
);
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velecsum
= _mm_add_pd(velecsum
,velec
);
447 /* Calculate temporary vectorial force */
448 tx
= _mm_mul_pd(fscal
,dx22
);
449 ty
= _mm_mul_pd(fscal
,dy22
);
450 tz
= _mm_mul_pd(fscal
,dz22
);
452 /* Update vectorial force */
453 fix2
= _mm_add_pd(fix2
,tx
);
454 fiy2
= _mm_add_pd(fiy2
,ty
);
455 fiz2
= _mm_add_pd(fiz2
,tz
);
457 fjx2
= _mm_add_pd(fjx2
,tx
);
458 fjy2
= _mm_add_pd(fjy2
,ty
);
459 fjz2
= _mm_add_pd(fjz2
,tz
);
461 /**************************
462 * CALCULATE INTERACTIONS *
463 **************************/
465 /* COULOMB ELECTROSTATICS */
466 velec
= _mm_mul_pd(qq23
,rinv23
);
467 felec
= _mm_mul_pd(velec
,rinvsq23
);
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velecsum
= _mm_add_pd(velecsum
,velec
);
474 /* Calculate temporary vectorial force */
475 tx
= _mm_mul_pd(fscal
,dx23
);
476 ty
= _mm_mul_pd(fscal
,dy23
);
477 tz
= _mm_mul_pd(fscal
,dz23
);
479 /* Update vectorial force */
480 fix2
= _mm_add_pd(fix2
,tx
);
481 fiy2
= _mm_add_pd(fiy2
,ty
);
482 fiz2
= _mm_add_pd(fiz2
,tz
);
484 fjx3
= _mm_add_pd(fjx3
,tx
);
485 fjy3
= _mm_add_pd(fjy3
,ty
);
486 fjz3
= _mm_add_pd(fjz3
,tz
);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 /* COULOMB ELECTROSTATICS */
493 velec
= _mm_mul_pd(qq31
,rinv31
);
494 felec
= _mm_mul_pd(velec
,rinvsq31
);
496 /* Update potential sum for this i atom from the interaction with this j atom. */
497 velecsum
= _mm_add_pd(velecsum
,velec
);
501 /* Calculate temporary vectorial force */
502 tx
= _mm_mul_pd(fscal
,dx31
);
503 ty
= _mm_mul_pd(fscal
,dy31
);
504 tz
= _mm_mul_pd(fscal
,dz31
);
506 /* Update vectorial force */
507 fix3
= _mm_add_pd(fix3
,tx
);
508 fiy3
= _mm_add_pd(fiy3
,ty
);
509 fiz3
= _mm_add_pd(fiz3
,tz
);
511 fjx1
= _mm_add_pd(fjx1
,tx
);
512 fjy1
= _mm_add_pd(fjy1
,ty
);
513 fjz1
= _mm_add_pd(fjz1
,tz
);
515 /**************************
516 * CALCULATE INTERACTIONS *
517 **************************/
519 /* COULOMB ELECTROSTATICS */
520 velec
= _mm_mul_pd(qq32
,rinv32
);
521 felec
= _mm_mul_pd(velec
,rinvsq32
);
523 /* Update potential sum for this i atom from the interaction with this j atom. */
524 velecsum
= _mm_add_pd(velecsum
,velec
);
528 /* Calculate temporary vectorial force */
529 tx
= _mm_mul_pd(fscal
,dx32
);
530 ty
= _mm_mul_pd(fscal
,dy32
);
531 tz
= _mm_mul_pd(fscal
,dz32
);
533 /* Update vectorial force */
534 fix3
= _mm_add_pd(fix3
,tx
);
535 fiy3
= _mm_add_pd(fiy3
,ty
);
536 fiz3
= _mm_add_pd(fiz3
,tz
);
538 fjx2
= _mm_add_pd(fjx2
,tx
);
539 fjy2
= _mm_add_pd(fjy2
,ty
);
540 fjz2
= _mm_add_pd(fjz2
,tz
);
542 /**************************
543 * CALCULATE INTERACTIONS *
544 **************************/
546 /* COULOMB ELECTROSTATICS */
547 velec
= _mm_mul_pd(qq33
,rinv33
);
548 felec
= _mm_mul_pd(velec
,rinvsq33
);
550 /* Update potential sum for this i atom from the interaction with this j atom. */
551 velecsum
= _mm_add_pd(velecsum
,velec
);
555 /* Calculate temporary vectorial force */
556 tx
= _mm_mul_pd(fscal
,dx33
);
557 ty
= _mm_mul_pd(fscal
,dy33
);
558 tz
= _mm_mul_pd(fscal
,dz33
);
560 /* Update vectorial force */
561 fix3
= _mm_add_pd(fix3
,tx
);
562 fiy3
= _mm_add_pd(fiy3
,ty
);
563 fiz3
= _mm_add_pd(fiz3
,tz
);
565 fjx3
= _mm_add_pd(fjx3
,tx
);
566 fjy3
= _mm_add_pd(fjy3
,ty
);
567 fjz3
= _mm_add_pd(fjz3
,tz
);
569 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
571 /* Inner loop uses 287 flops */
578 j_coord_offsetA
= DIM
*jnrA
;
580 /* load j atom coordinates */
581 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
582 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
583 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
585 /* Calculate displacement vector */
586 dx00
= _mm_sub_pd(ix0
,jx0
);
587 dy00
= _mm_sub_pd(iy0
,jy0
);
588 dz00
= _mm_sub_pd(iz0
,jz0
);
589 dx11
= _mm_sub_pd(ix1
,jx1
);
590 dy11
= _mm_sub_pd(iy1
,jy1
);
591 dz11
= _mm_sub_pd(iz1
,jz1
);
592 dx12
= _mm_sub_pd(ix1
,jx2
);
593 dy12
= _mm_sub_pd(iy1
,jy2
);
594 dz12
= _mm_sub_pd(iz1
,jz2
);
595 dx13
= _mm_sub_pd(ix1
,jx3
);
596 dy13
= _mm_sub_pd(iy1
,jy3
);
597 dz13
= _mm_sub_pd(iz1
,jz3
);
598 dx21
= _mm_sub_pd(ix2
,jx1
);
599 dy21
= _mm_sub_pd(iy2
,jy1
);
600 dz21
= _mm_sub_pd(iz2
,jz1
);
601 dx22
= _mm_sub_pd(ix2
,jx2
);
602 dy22
= _mm_sub_pd(iy2
,jy2
);
603 dz22
= _mm_sub_pd(iz2
,jz2
);
604 dx23
= _mm_sub_pd(ix2
,jx3
);
605 dy23
= _mm_sub_pd(iy2
,jy3
);
606 dz23
= _mm_sub_pd(iz2
,jz3
);
607 dx31
= _mm_sub_pd(ix3
,jx1
);
608 dy31
= _mm_sub_pd(iy3
,jy1
);
609 dz31
= _mm_sub_pd(iz3
,jz1
);
610 dx32
= _mm_sub_pd(ix3
,jx2
);
611 dy32
= _mm_sub_pd(iy3
,jy2
);
612 dz32
= _mm_sub_pd(iz3
,jz2
);
613 dx33
= _mm_sub_pd(ix3
,jx3
);
614 dy33
= _mm_sub_pd(iy3
,jy3
);
615 dz33
= _mm_sub_pd(iz3
,jz3
);
617 /* Calculate squared distance and things based on it */
618 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
619 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
620 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
621 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
622 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
623 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
624 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
625 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
626 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
627 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
629 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
630 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
631 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
632 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
633 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
634 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
635 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
636 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
637 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
639 rinvsq00
= gmx_mm_inv_pd(rsq00
);
640 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
641 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
642 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
643 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
644 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
645 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
646 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
647 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
648 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
650 fjx0
= _mm_setzero_pd();
651 fjy0
= _mm_setzero_pd();
652 fjz0
= _mm_setzero_pd();
653 fjx1
= _mm_setzero_pd();
654 fjy1
= _mm_setzero_pd();
655 fjz1
= _mm_setzero_pd();
656 fjx2
= _mm_setzero_pd();
657 fjy2
= _mm_setzero_pd();
658 fjz2
= _mm_setzero_pd();
659 fjx3
= _mm_setzero_pd();
660 fjy3
= _mm_setzero_pd();
661 fjz3
= _mm_setzero_pd();
663 /**************************
664 * CALCULATE INTERACTIONS *
665 **************************/
667 /* LENNARD-JONES DISPERSION/REPULSION */
669 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
670 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
671 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
672 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
673 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
677 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
681 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
683 /* Calculate temporary vectorial force */
684 tx
= _mm_mul_pd(fscal
,dx00
);
685 ty
= _mm_mul_pd(fscal
,dy00
);
686 tz
= _mm_mul_pd(fscal
,dz00
);
688 /* Update vectorial force */
689 fix0
= _mm_add_pd(fix0
,tx
);
690 fiy0
= _mm_add_pd(fiy0
,ty
);
691 fiz0
= _mm_add_pd(fiz0
,tz
);
693 fjx0
= _mm_add_pd(fjx0
,tx
);
694 fjy0
= _mm_add_pd(fjy0
,ty
);
695 fjz0
= _mm_add_pd(fjz0
,tz
);
697 /**************************
698 * CALCULATE INTERACTIONS *
699 **************************/
701 /* COULOMB ELECTROSTATICS */
702 velec
= _mm_mul_pd(qq11
,rinv11
);
703 felec
= _mm_mul_pd(velec
,rinvsq11
);
705 /* Update potential sum for this i atom from the interaction with this j atom. */
706 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
707 velecsum
= _mm_add_pd(velecsum
,velec
);
711 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
713 /* Calculate temporary vectorial force */
714 tx
= _mm_mul_pd(fscal
,dx11
);
715 ty
= _mm_mul_pd(fscal
,dy11
);
716 tz
= _mm_mul_pd(fscal
,dz11
);
718 /* Update vectorial force */
719 fix1
= _mm_add_pd(fix1
,tx
);
720 fiy1
= _mm_add_pd(fiy1
,ty
);
721 fiz1
= _mm_add_pd(fiz1
,tz
);
723 fjx1
= _mm_add_pd(fjx1
,tx
);
724 fjy1
= _mm_add_pd(fjy1
,ty
);
725 fjz1
= _mm_add_pd(fjz1
,tz
);
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
731 /* COULOMB ELECTROSTATICS */
732 velec
= _mm_mul_pd(qq12
,rinv12
);
733 felec
= _mm_mul_pd(velec
,rinvsq12
);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
737 velecsum
= _mm_add_pd(velecsum
,velec
);
741 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
743 /* Calculate temporary vectorial force */
744 tx
= _mm_mul_pd(fscal
,dx12
);
745 ty
= _mm_mul_pd(fscal
,dy12
);
746 tz
= _mm_mul_pd(fscal
,dz12
);
748 /* Update vectorial force */
749 fix1
= _mm_add_pd(fix1
,tx
);
750 fiy1
= _mm_add_pd(fiy1
,ty
);
751 fiz1
= _mm_add_pd(fiz1
,tz
);
753 fjx2
= _mm_add_pd(fjx2
,tx
);
754 fjy2
= _mm_add_pd(fjy2
,ty
);
755 fjz2
= _mm_add_pd(fjz2
,tz
);
757 /**************************
758 * CALCULATE INTERACTIONS *
759 **************************/
761 /* COULOMB ELECTROSTATICS */
762 velec
= _mm_mul_pd(qq13
,rinv13
);
763 felec
= _mm_mul_pd(velec
,rinvsq13
);
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
767 velecsum
= _mm_add_pd(velecsum
,velec
);
771 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
773 /* Calculate temporary vectorial force */
774 tx
= _mm_mul_pd(fscal
,dx13
);
775 ty
= _mm_mul_pd(fscal
,dy13
);
776 tz
= _mm_mul_pd(fscal
,dz13
);
778 /* Update vectorial force */
779 fix1
= _mm_add_pd(fix1
,tx
);
780 fiy1
= _mm_add_pd(fiy1
,ty
);
781 fiz1
= _mm_add_pd(fiz1
,tz
);
783 fjx3
= _mm_add_pd(fjx3
,tx
);
784 fjy3
= _mm_add_pd(fjy3
,ty
);
785 fjz3
= _mm_add_pd(fjz3
,tz
);
787 /**************************
788 * CALCULATE INTERACTIONS *
789 **************************/
791 /* COULOMB ELECTROSTATICS */
792 velec
= _mm_mul_pd(qq21
,rinv21
);
793 felec
= _mm_mul_pd(velec
,rinvsq21
);
795 /* Update potential sum for this i atom from the interaction with this j atom. */
796 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
797 velecsum
= _mm_add_pd(velecsum
,velec
);
801 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
803 /* Calculate temporary vectorial force */
804 tx
= _mm_mul_pd(fscal
,dx21
);
805 ty
= _mm_mul_pd(fscal
,dy21
);
806 tz
= _mm_mul_pd(fscal
,dz21
);
808 /* Update vectorial force */
809 fix2
= _mm_add_pd(fix2
,tx
);
810 fiy2
= _mm_add_pd(fiy2
,ty
);
811 fiz2
= _mm_add_pd(fiz2
,tz
);
813 fjx1
= _mm_add_pd(fjx1
,tx
);
814 fjy1
= _mm_add_pd(fjy1
,ty
);
815 fjz1
= _mm_add_pd(fjz1
,tz
);
817 /**************************
818 * CALCULATE INTERACTIONS *
819 **************************/
821 /* COULOMB ELECTROSTATICS */
822 velec
= _mm_mul_pd(qq22
,rinv22
);
823 felec
= _mm_mul_pd(velec
,rinvsq22
);
825 /* Update potential sum for this i atom from the interaction with this j atom. */
826 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
827 velecsum
= _mm_add_pd(velecsum
,velec
);
831 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
833 /* Calculate temporary vectorial force */
834 tx
= _mm_mul_pd(fscal
,dx22
);
835 ty
= _mm_mul_pd(fscal
,dy22
);
836 tz
= _mm_mul_pd(fscal
,dz22
);
838 /* Update vectorial force */
839 fix2
= _mm_add_pd(fix2
,tx
);
840 fiy2
= _mm_add_pd(fiy2
,ty
);
841 fiz2
= _mm_add_pd(fiz2
,tz
);
843 fjx2
= _mm_add_pd(fjx2
,tx
);
844 fjy2
= _mm_add_pd(fjy2
,ty
);
845 fjz2
= _mm_add_pd(fjz2
,tz
);
847 /**************************
848 * CALCULATE INTERACTIONS *
849 **************************/
851 /* COULOMB ELECTROSTATICS */
852 velec
= _mm_mul_pd(qq23
,rinv23
);
853 felec
= _mm_mul_pd(velec
,rinvsq23
);
855 /* Update potential sum for this i atom from the interaction with this j atom. */
856 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
857 velecsum
= _mm_add_pd(velecsum
,velec
);
861 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
863 /* Calculate temporary vectorial force */
864 tx
= _mm_mul_pd(fscal
,dx23
);
865 ty
= _mm_mul_pd(fscal
,dy23
);
866 tz
= _mm_mul_pd(fscal
,dz23
);
868 /* Update vectorial force */
869 fix2
= _mm_add_pd(fix2
,tx
);
870 fiy2
= _mm_add_pd(fiy2
,ty
);
871 fiz2
= _mm_add_pd(fiz2
,tz
);
873 fjx3
= _mm_add_pd(fjx3
,tx
);
874 fjy3
= _mm_add_pd(fjy3
,ty
);
875 fjz3
= _mm_add_pd(fjz3
,tz
);
877 /**************************
878 * CALCULATE INTERACTIONS *
879 **************************/
881 /* COULOMB ELECTROSTATICS */
882 velec
= _mm_mul_pd(qq31
,rinv31
);
883 felec
= _mm_mul_pd(velec
,rinvsq31
);
885 /* Update potential sum for this i atom from the interaction with this j atom. */
886 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
887 velecsum
= _mm_add_pd(velecsum
,velec
);
891 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
893 /* Calculate temporary vectorial force */
894 tx
= _mm_mul_pd(fscal
,dx31
);
895 ty
= _mm_mul_pd(fscal
,dy31
);
896 tz
= _mm_mul_pd(fscal
,dz31
);
898 /* Update vectorial force */
899 fix3
= _mm_add_pd(fix3
,tx
);
900 fiy3
= _mm_add_pd(fiy3
,ty
);
901 fiz3
= _mm_add_pd(fiz3
,tz
);
903 fjx1
= _mm_add_pd(fjx1
,tx
);
904 fjy1
= _mm_add_pd(fjy1
,ty
);
905 fjz1
= _mm_add_pd(fjz1
,tz
);
907 /**************************
908 * CALCULATE INTERACTIONS *
909 **************************/
911 /* COULOMB ELECTROSTATICS */
912 velec
= _mm_mul_pd(qq32
,rinv32
);
913 felec
= _mm_mul_pd(velec
,rinvsq32
);
915 /* Update potential sum for this i atom from the interaction with this j atom. */
916 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
917 velecsum
= _mm_add_pd(velecsum
,velec
);
921 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
923 /* Calculate temporary vectorial force */
924 tx
= _mm_mul_pd(fscal
,dx32
);
925 ty
= _mm_mul_pd(fscal
,dy32
);
926 tz
= _mm_mul_pd(fscal
,dz32
);
928 /* Update vectorial force */
929 fix3
= _mm_add_pd(fix3
,tx
);
930 fiy3
= _mm_add_pd(fiy3
,ty
);
931 fiz3
= _mm_add_pd(fiz3
,tz
);
933 fjx2
= _mm_add_pd(fjx2
,tx
);
934 fjy2
= _mm_add_pd(fjy2
,ty
);
935 fjz2
= _mm_add_pd(fjz2
,tz
);
937 /**************************
938 * CALCULATE INTERACTIONS *
939 **************************/
941 /* COULOMB ELECTROSTATICS */
942 velec
= _mm_mul_pd(qq33
,rinv33
);
943 felec
= _mm_mul_pd(velec
,rinvsq33
);
945 /* Update potential sum for this i atom from the interaction with this j atom. */
946 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
947 velecsum
= _mm_add_pd(velecsum
,velec
);
951 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
953 /* Calculate temporary vectorial force */
954 tx
= _mm_mul_pd(fscal
,dx33
);
955 ty
= _mm_mul_pd(fscal
,dy33
);
956 tz
= _mm_mul_pd(fscal
,dz33
);
958 /* Update vectorial force */
959 fix3
= _mm_add_pd(fix3
,tx
);
960 fiy3
= _mm_add_pd(fiy3
,ty
);
961 fiz3
= _mm_add_pd(fiz3
,tz
);
963 fjx3
= _mm_add_pd(fjx3
,tx
);
964 fjy3
= _mm_add_pd(fjy3
,ty
);
965 fjz3
= _mm_add_pd(fjz3
,tz
);
967 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
969 /* Inner loop uses 287 flops */
972 /* End of innermost loop */
974 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
975 f
+i_coord_offset
,fshift
+i_shift_offset
);
978 /* Update potential energies */
979 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
980 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
982 /* Increment number of inner iterations */
983 inneriter
+= j_index_end
- j_index_start
;
985 /* Outer loop uses 26 flops */
988 /* Increment number of outer iterations */
991 /* Update outer/inner flops */
993 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*287);
996 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
997 * Electrostatics interaction: Coulomb
998 * VdW interaction: LennardJones
999 * Geometry: Water4-Water4
1000 * Calculate force/pot: Force
1003 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
1004 (t_nblist
* gmx_restrict nlist
,
1005 rvec
* gmx_restrict xx
,
1006 rvec
* gmx_restrict ff
,
1007 t_forcerec
* gmx_restrict fr
,
1008 t_mdatoms
* gmx_restrict mdatoms
,
1009 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1010 t_nrnb
* gmx_restrict nrnb
)
1012 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1013 * just 0 for non-waters.
1014 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1015 * jnr indices corresponding to data put in the four positions in the SIMD register.
1017 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1018 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1020 int j_coord_offsetA
,j_coord_offsetB
;
1021 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1022 real rcutoff_scalar
;
1023 real
*shiftvec
,*fshift
,*x
,*f
;
1024 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1026 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1028 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1030 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1032 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1033 int vdwjidx0A
,vdwjidx0B
;
1034 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1035 int vdwjidx1A
,vdwjidx1B
;
1036 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1037 int vdwjidx2A
,vdwjidx2B
;
1038 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1039 int vdwjidx3A
,vdwjidx3B
;
1040 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1041 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1042 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1043 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1044 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1045 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1046 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1047 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1048 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1049 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1050 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1051 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1054 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1057 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1058 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1059 __m128d dummy_mask
,cutoff_mask
;
1060 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1061 __m128d one
= _mm_set1_pd(1.0);
1062 __m128d two
= _mm_set1_pd(2.0);
1068 jindex
= nlist
->jindex
;
1070 shiftidx
= nlist
->shift
;
1072 shiftvec
= fr
->shift_vec
[0];
1073 fshift
= fr
->fshift
[0];
1074 facel
= _mm_set1_pd(fr
->epsfac
);
1075 charge
= mdatoms
->chargeA
;
1076 nvdwtype
= fr
->ntype
;
1077 vdwparam
= fr
->nbfp
;
1078 vdwtype
= mdatoms
->typeA
;
1080 /* Setup water-specific parameters */
1081 inr
= nlist
->iinr
[0];
1082 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1083 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1084 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1085 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1087 jq1
= _mm_set1_pd(charge
[inr
+1]);
1088 jq2
= _mm_set1_pd(charge
[inr
+2]);
1089 jq3
= _mm_set1_pd(charge
[inr
+3]);
1090 vdwjidx0A
= 2*vdwtype
[inr
+0];
1091 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1092 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1093 qq11
= _mm_mul_pd(iq1
,jq1
);
1094 qq12
= _mm_mul_pd(iq1
,jq2
);
1095 qq13
= _mm_mul_pd(iq1
,jq3
);
1096 qq21
= _mm_mul_pd(iq2
,jq1
);
1097 qq22
= _mm_mul_pd(iq2
,jq2
);
1098 qq23
= _mm_mul_pd(iq2
,jq3
);
1099 qq31
= _mm_mul_pd(iq3
,jq1
);
1100 qq32
= _mm_mul_pd(iq3
,jq2
);
1101 qq33
= _mm_mul_pd(iq3
,jq3
);
1103 /* Avoid stupid compiler warnings */
1105 j_coord_offsetA
= 0;
1106 j_coord_offsetB
= 0;
1111 /* Start outer loop over neighborlists */
1112 for(iidx
=0; iidx
<nri
; iidx
++)
1114 /* Load shift vector for this list */
1115 i_shift_offset
= DIM
*shiftidx
[iidx
];
1117 /* Load limits for loop over neighbors */
1118 j_index_start
= jindex
[iidx
];
1119 j_index_end
= jindex
[iidx
+1];
1121 /* Get outer coordinate index */
1123 i_coord_offset
= DIM
*inr
;
1125 /* Load i particle coords and add shift vector */
1126 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1127 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1129 fix0
= _mm_setzero_pd();
1130 fiy0
= _mm_setzero_pd();
1131 fiz0
= _mm_setzero_pd();
1132 fix1
= _mm_setzero_pd();
1133 fiy1
= _mm_setzero_pd();
1134 fiz1
= _mm_setzero_pd();
1135 fix2
= _mm_setzero_pd();
1136 fiy2
= _mm_setzero_pd();
1137 fiz2
= _mm_setzero_pd();
1138 fix3
= _mm_setzero_pd();
1139 fiy3
= _mm_setzero_pd();
1140 fiz3
= _mm_setzero_pd();
1142 /* Start inner kernel loop */
1143 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1146 /* Get j neighbor index, and coordinate index */
1148 jnrB
= jjnr
[jidx
+1];
1149 j_coord_offsetA
= DIM
*jnrA
;
1150 j_coord_offsetB
= DIM
*jnrB
;
1152 /* load j atom coordinates */
1153 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1154 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1155 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1157 /* Calculate displacement vector */
1158 dx00
= _mm_sub_pd(ix0
,jx0
);
1159 dy00
= _mm_sub_pd(iy0
,jy0
);
1160 dz00
= _mm_sub_pd(iz0
,jz0
);
1161 dx11
= _mm_sub_pd(ix1
,jx1
);
1162 dy11
= _mm_sub_pd(iy1
,jy1
);
1163 dz11
= _mm_sub_pd(iz1
,jz1
);
1164 dx12
= _mm_sub_pd(ix1
,jx2
);
1165 dy12
= _mm_sub_pd(iy1
,jy2
);
1166 dz12
= _mm_sub_pd(iz1
,jz2
);
1167 dx13
= _mm_sub_pd(ix1
,jx3
);
1168 dy13
= _mm_sub_pd(iy1
,jy3
);
1169 dz13
= _mm_sub_pd(iz1
,jz3
);
1170 dx21
= _mm_sub_pd(ix2
,jx1
);
1171 dy21
= _mm_sub_pd(iy2
,jy1
);
1172 dz21
= _mm_sub_pd(iz2
,jz1
);
1173 dx22
= _mm_sub_pd(ix2
,jx2
);
1174 dy22
= _mm_sub_pd(iy2
,jy2
);
1175 dz22
= _mm_sub_pd(iz2
,jz2
);
1176 dx23
= _mm_sub_pd(ix2
,jx3
);
1177 dy23
= _mm_sub_pd(iy2
,jy3
);
1178 dz23
= _mm_sub_pd(iz2
,jz3
);
1179 dx31
= _mm_sub_pd(ix3
,jx1
);
1180 dy31
= _mm_sub_pd(iy3
,jy1
);
1181 dz31
= _mm_sub_pd(iz3
,jz1
);
1182 dx32
= _mm_sub_pd(ix3
,jx2
);
1183 dy32
= _mm_sub_pd(iy3
,jy2
);
1184 dz32
= _mm_sub_pd(iz3
,jz2
);
1185 dx33
= _mm_sub_pd(ix3
,jx3
);
1186 dy33
= _mm_sub_pd(iy3
,jy3
);
1187 dz33
= _mm_sub_pd(iz3
,jz3
);
1189 /* Calculate squared distance and things based on it */
1190 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1191 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1192 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1193 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1194 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1195 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1196 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1197 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1198 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1199 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1201 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1202 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1203 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1204 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1205 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1206 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1207 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1208 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1209 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1211 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1212 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1213 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1214 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1215 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1216 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1217 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1218 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1219 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1220 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1222 fjx0
= _mm_setzero_pd();
1223 fjy0
= _mm_setzero_pd();
1224 fjz0
= _mm_setzero_pd();
1225 fjx1
= _mm_setzero_pd();
1226 fjy1
= _mm_setzero_pd();
1227 fjz1
= _mm_setzero_pd();
1228 fjx2
= _mm_setzero_pd();
1229 fjy2
= _mm_setzero_pd();
1230 fjz2
= _mm_setzero_pd();
1231 fjx3
= _mm_setzero_pd();
1232 fjy3
= _mm_setzero_pd();
1233 fjz3
= _mm_setzero_pd();
1235 /**************************
1236 * CALCULATE INTERACTIONS *
1237 **************************/
1239 /* LENNARD-JONES DISPERSION/REPULSION */
1241 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1242 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1246 /* Calculate temporary vectorial force */
1247 tx
= _mm_mul_pd(fscal
,dx00
);
1248 ty
= _mm_mul_pd(fscal
,dy00
);
1249 tz
= _mm_mul_pd(fscal
,dz00
);
1251 /* Update vectorial force */
1252 fix0
= _mm_add_pd(fix0
,tx
);
1253 fiy0
= _mm_add_pd(fiy0
,ty
);
1254 fiz0
= _mm_add_pd(fiz0
,tz
);
1256 fjx0
= _mm_add_pd(fjx0
,tx
);
1257 fjy0
= _mm_add_pd(fjy0
,ty
);
1258 fjz0
= _mm_add_pd(fjz0
,tz
);
1260 /**************************
1261 * CALCULATE INTERACTIONS *
1262 **************************/
1264 /* COULOMB ELECTROSTATICS */
1265 velec
= _mm_mul_pd(qq11
,rinv11
);
1266 felec
= _mm_mul_pd(velec
,rinvsq11
);
1270 /* Calculate temporary vectorial force */
1271 tx
= _mm_mul_pd(fscal
,dx11
);
1272 ty
= _mm_mul_pd(fscal
,dy11
);
1273 tz
= _mm_mul_pd(fscal
,dz11
);
1275 /* Update vectorial force */
1276 fix1
= _mm_add_pd(fix1
,tx
);
1277 fiy1
= _mm_add_pd(fiy1
,ty
);
1278 fiz1
= _mm_add_pd(fiz1
,tz
);
1280 fjx1
= _mm_add_pd(fjx1
,tx
);
1281 fjy1
= _mm_add_pd(fjy1
,ty
);
1282 fjz1
= _mm_add_pd(fjz1
,tz
);
1284 /**************************
1285 * CALCULATE INTERACTIONS *
1286 **************************/
1288 /* COULOMB ELECTROSTATICS */
1289 velec
= _mm_mul_pd(qq12
,rinv12
);
1290 felec
= _mm_mul_pd(velec
,rinvsq12
);
1294 /* Calculate temporary vectorial force */
1295 tx
= _mm_mul_pd(fscal
,dx12
);
1296 ty
= _mm_mul_pd(fscal
,dy12
);
1297 tz
= _mm_mul_pd(fscal
,dz12
);
1299 /* Update vectorial force */
1300 fix1
= _mm_add_pd(fix1
,tx
);
1301 fiy1
= _mm_add_pd(fiy1
,ty
);
1302 fiz1
= _mm_add_pd(fiz1
,tz
);
1304 fjx2
= _mm_add_pd(fjx2
,tx
);
1305 fjy2
= _mm_add_pd(fjy2
,ty
);
1306 fjz2
= _mm_add_pd(fjz2
,tz
);
1308 /**************************
1309 * CALCULATE INTERACTIONS *
1310 **************************/
1312 /* COULOMB ELECTROSTATICS */
1313 velec
= _mm_mul_pd(qq13
,rinv13
);
1314 felec
= _mm_mul_pd(velec
,rinvsq13
);
1318 /* Calculate temporary vectorial force */
1319 tx
= _mm_mul_pd(fscal
,dx13
);
1320 ty
= _mm_mul_pd(fscal
,dy13
);
1321 tz
= _mm_mul_pd(fscal
,dz13
);
1323 /* Update vectorial force */
1324 fix1
= _mm_add_pd(fix1
,tx
);
1325 fiy1
= _mm_add_pd(fiy1
,ty
);
1326 fiz1
= _mm_add_pd(fiz1
,tz
);
1328 fjx3
= _mm_add_pd(fjx3
,tx
);
1329 fjy3
= _mm_add_pd(fjy3
,ty
);
1330 fjz3
= _mm_add_pd(fjz3
,tz
);
1332 /**************************
1333 * CALCULATE INTERACTIONS *
1334 **************************/
1336 /* COULOMB ELECTROSTATICS */
1337 velec
= _mm_mul_pd(qq21
,rinv21
);
1338 felec
= _mm_mul_pd(velec
,rinvsq21
);
1342 /* Calculate temporary vectorial force */
1343 tx
= _mm_mul_pd(fscal
,dx21
);
1344 ty
= _mm_mul_pd(fscal
,dy21
);
1345 tz
= _mm_mul_pd(fscal
,dz21
);
1347 /* Update vectorial force */
1348 fix2
= _mm_add_pd(fix2
,tx
);
1349 fiy2
= _mm_add_pd(fiy2
,ty
);
1350 fiz2
= _mm_add_pd(fiz2
,tz
);
1352 fjx1
= _mm_add_pd(fjx1
,tx
);
1353 fjy1
= _mm_add_pd(fjy1
,ty
);
1354 fjz1
= _mm_add_pd(fjz1
,tz
);
1356 /**************************
1357 * CALCULATE INTERACTIONS *
1358 **************************/
1360 /* COULOMB ELECTROSTATICS */
1361 velec
= _mm_mul_pd(qq22
,rinv22
);
1362 felec
= _mm_mul_pd(velec
,rinvsq22
);
1366 /* Calculate temporary vectorial force */
1367 tx
= _mm_mul_pd(fscal
,dx22
);
1368 ty
= _mm_mul_pd(fscal
,dy22
);
1369 tz
= _mm_mul_pd(fscal
,dz22
);
1371 /* Update vectorial force */
1372 fix2
= _mm_add_pd(fix2
,tx
);
1373 fiy2
= _mm_add_pd(fiy2
,ty
);
1374 fiz2
= _mm_add_pd(fiz2
,tz
);
1376 fjx2
= _mm_add_pd(fjx2
,tx
);
1377 fjy2
= _mm_add_pd(fjy2
,ty
);
1378 fjz2
= _mm_add_pd(fjz2
,tz
);
1380 /**************************
1381 * CALCULATE INTERACTIONS *
1382 **************************/
1384 /* COULOMB ELECTROSTATICS */
1385 velec
= _mm_mul_pd(qq23
,rinv23
);
1386 felec
= _mm_mul_pd(velec
,rinvsq23
);
1390 /* Calculate temporary vectorial force */
1391 tx
= _mm_mul_pd(fscal
,dx23
);
1392 ty
= _mm_mul_pd(fscal
,dy23
);
1393 tz
= _mm_mul_pd(fscal
,dz23
);
1395 /* Update vectorial force */
1396 fix2
= _mm_add_pd(fix2
,tx
);
1397 fiy2
= _mm_add_pd(fiy2
,ty
);
1398 fiz2
= _mm_add_pd(fiz2
,tz
);
1400 fjx3
= _mm_add_pd(fjx3
,tx
);
1401 fjy3
= _mm_add_pd(fjy3
,ty
);
1402 fjz3
= _mm_add_pd(fjz3
,tz
);
1404 /**************************
1405 * CALCULATE INTERACTIONS *
1406 **************************/
1408 /* COULOMB ELECTROSTATICS */
1409 velec
= _mm_mul_pd(qq31
,rinv31
);
1410 felec
= _mm_mul_pd(velec
,rinvsq31
);
1414 /* Calculate temporary vectorial force */
1415 tx
= _mm_mul_pd(fscal
,dx31
);
1416 ty
= _mm_mul_pd(fscal
,dy31
);
1417 tz
= _mm_mul_pd(fscal
,dz31
);
1419 /* Update vectorial force */
1420 fix3
= _mm_add_pd(fix3
,tx
);
1421 fiy3
= _mm_add_pd(fiy3
,ty
);
1422 fiz3
= _mm_add_pd(fiz3
,tz
);
1424 fjx1
= _mm_add_pd(fjx1
,tx
);
1425 fjy1
= _mm_add_pd(fjy1
,ty
);
1426 fjz1
= _mm_add_pd(fjz1
,tz
);
1428 /**************************
1429 * CALCULATE INTERACTIONS *
1430 **************************/
1432 /* COULOMB ELECTROSTATICS */
1433 velec
= _mm_mul_pd(qq32
,rinv32
);
1434 felec
= _mm_mul_pd(velec
,rinvsq32
);
1438 /* Calculate temporary vectorial force */
1439 tx
= _mm_mul_pd(fscal
,dx32
);
1440 ty
= _mm_mul_pd(fscal
,dy32
);
1441 tz
= _mm_mul_pd(fscal
,dz32
);
1443 /* Update vectorial force */
1444 fix3
= _mm_add_pd(fix3
,tx
);
1445 fiy3
= _mm_add_pd(fiy3
,ty
);
1446 fiz3
= _mm_add_pd(fiz3
,tz
);
1448 fjx2
= _mm_add_pd(fjx2
,tx
);
1449 fjy2
= _mm_add_pd(fjy2
,ty
);
1450 fjz2
= _mm_add_pd(fjz2
,tz
);
1452 /**************************
1453 * CALCULATE INTERACTIONS *
1454 **************************/
1456 /* COULOMB ELECTROSTATICS */
1457 velec
= _mm_mul_pd(qq33
,rinv33
);
1458 felec
= _mm_mul_pd(velec
,rinvsq33
);
1462 /* Calculate temporary vectorial force */
1463 tx
= _mm_mul_pd(fscal
,dx33
);
1464 ty
= _mm_mul_pd(fscal
,dy33
);
1465 tz
= _mm_mul_pd(fscal
,dz33
);
1467 /* Update vectorial force */
1468 fix3
= _mm_add_pd(fix3
,tx
);
1469 fiy3
= _mm_add_pd(fiy3
,ty
);
1470 fiz3
= _mm_add_pd(fiz3
,tz
);
1472 fjx3
= _mm_add_pd(fjx3
,tx
);
1473 fjy3
= _mm_add_pd(fjy3
,ty
);
1474 fjz3
= _mm_add_pd(fjz3
,tz
);
1476 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1478 /* Inner loop uses 273 flops */
1481 if(jidx
<j_index_end
)
1485 j_coord_offsetA
= DIM
*jnrA
;
1487 /* load j atom coordinates */
1488 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1489 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1490 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1492 /* Calculate displacement vector */
1493 dx00
= _mm_sub_pd(ix0
,jx0
);
1494 dy00
= _mm_sub_pd(iy0
,jy0
);
1495 dz00
= _mm_sub_pd(iz0
,jz0
);
1496 dx11
= _mm_sub_pd(ix1
,jx1
);
1497 dy11
= _mm_sub_pd(iy1
,jy1
);
1498 dz11
= _mm_sub_pd(iz1
,jz1
);
1499 dx12
= _mm_sub_pd(ix1
,jx2
);
1500 dy12
= _mm_sub_pd(iy1
,jy2
);
1501 dz12
= _mm_sub_pd(iz1
,jz2
);
1502 dx13
= _mm_sub_pd(ix1
,jx3
);
1503 dy13
= _mm_sub_pd(iy1
,jy3
);
1504 dz13
= _mm_sub_pd(iz1
,jz3
);
1505 dx21
= _mm_sub_pd(ix2
,jx1
);
1506 dy21
= _mm_sub_pd(iy2
,jy1
);
1507 dz21
= _mm_sub_pd(iz2
,jz1
);
1508 dx22
= _mm_sub_pd(ix2
,jx2
);
1509 dy22
= _mm_sub_pd(iy2
,jy2
);
1510 dz22
= _mm_sub_pd(iz2
,jz2
);
1511 dx23
= _mm_sub_pd(ix2
,jx3
);
1512 dy23
= _mm_sub_pd(iy2
,jy3
);
1513 dz23
= _mm_sub_pd(iz2
,jz3
);
1514 dx31
= _mm_sub_pd(ix3
,jx1
);
1515 dy31
= _mm_sub_pd(iy3
,jy1
);
1516 dz31
= _mm_sub_pd(iz3
,jz1
);
1517 dx32
= _mm_sub_pd(ix3
,jx2
);
1518 dy32
= _mm_sub_pd(iy3
,jy2
);
1519 dz32
= _mm_sub_pd(iz3
,jz2
);
1520 dx33
= _mm_sub_pd(ix3
,jx3
);
1521 dy33
= _mm_sub_pd(iy3
,jy3
);
1522 dz33
= _mm_sub_pd(iz3
,jz3
);
1524 /* Calculate squared distance and things based on it */
1525 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1526 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1527 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1528 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1529 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1530 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1531 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1532 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1533 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1534 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1536 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1537 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1538 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1539 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1540 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1541 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1542 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1543 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1544 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1546 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1547 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1548 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1549 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1550 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1551 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1552 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1553 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1554 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1555 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1557 fjx0
= _mm_setzero_pd();
1558 fjy0
= _mm_setzero_pd();
1559 fjz0
= _mm_setzero_pd();
1560 fjx1
= _mm_setzero_pd();
1561 fjy1
= _mm_setzero_pd();
1562 fjz1
= _mm_setzero_pd();
1563 fjx2
= _mm_setzero_pd();
1564 fjy2
= _mm_setzero_pd();
1565 fjz2
= _mm_setzero_pd();
1566 fjx3
= _mm_setzero_pd();
1567 fjy3
= _mm_setzero_pd();
1568 fjz3
= _mm_setzero_pd();
1570 /**************************
1571 * CALCULATE INTERACTIONS *
1572 **************************/
1574 /* LENNARD-JONES DISPERSION/REPULSION */
1576 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1577 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1581 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1583 /* Calculate temporary vectorial force */
1584 tx
= _mm_mul_pd(fscal
,dx00
);
1585 ty
= _mm_mul_pd(fscal
,dy00
);
1586 tz
= _mm_mul_pd(fscal
,dz00
);
1588 /* Update vectorial force */
1589 fix0
= _mm_add_pd(fix0
,tx
);
1590 fiy0
= _mm_add_pd(fiy0
,ty
);
1591 fiz0
= _mm_add_pd(fiz0
,tz
);
1593 fjx0
= _mm_add_pd(fjx0
,tx
);
1594 fjy0
= _mm_add_pd(fjy0
,ty
);
1595 fjz0
= _mm_add_pd(fjz0
,tz
);
1597 /**************************
1598 * CALCULATE INTERACTIONS *
1599 **************************/
1601 /* COULOMB ELECTROSTATICS */
1602 velec
= _mm_mul_pd(qq11
,rinv11
);
1603 felec
= _mm_mul_pd(velec
,rinvsq11
);
1607 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1609 /* Calculate temporary vectorial force */
1610 tx
= _mm_mul_pd(fscal
,dx11
);
1611 ty
= _mm_mul_pd(fscal
,dy11
);
1612 tz
= _mm_mul_pd(fscal
,dz11
);
1614 /* Update vectorial force */
1615 fix1
= _mm_add_pd(fix1
,tx
);
1616 fiy1
= _mm_add_pd(fiy1
,ty
);
1617 fiz1
= _mm_add_pd(fiz1
,tz
);
1619 fjx1
= _mm_add_pd(fjx1
,tx
);
1620 fjy1
= _mm_add_pd(fjy1
,ty
);
1621 fjz1
= _mm_add_pd(fjz1
,tz
);
1623 /**************************
1624 * CALCULATE INTERACTIONS *
1625 **************************/
1627 /* COULOMB ELECTROSTATICS */
1628 velec
= _mm_mul_pd(qq12
,rinv12
);
1629 felec
= _mm_mul_pd(velec
,rinvsq12
);
1633 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1635 /* Calculate temporary vectorial force */
1636 tx
= _mm_mul_pd(fscal
,dx12
);
1637 ty
= _mm_mul_pd(fscal
,dy12
);
1638 tz
= _mm_mul_pd(fscal
,dz12
);
1640 /* Update vectorial force */
1641 fix1
= _mm_add_pd(fix1
,tx
);
1642 fiy1
= _mm_add_pd(fiy1
,ty
);
1643 fiz1
= _mm_add_pd(fiz1
,tz
);
1645 fjx2
= _mm_add_pd(fjx2
,tx
);
1646 fjy2
= _mm_add_pd(fjy2
,ty
);
1647 fjz2
= _mm_add_pd(fjz2
,tz
);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 /* COULOMB ELECTROSTATICS */
1654 velec
= _mm_mul_pd(qq13
,rinv13
);
1655 felec
= _mm_mul_pd(velec
,rinvsq13
);
1659 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1661 /* Calculate temporary vectorial force */
1662 tx
= _mm_mul_pd(fscal
,dx13
);
1663 ty
= _mm_mul_pd(fscal
,dy13
);
1664 tz
= _mm_mul_pd(fscal
,dz13
);
1666 /* Update vectorial force */
1667 fix1
= _mm_add_pd(fix1
,tx
);
1668 fiy1
= _mm_add_pd(fiy1
,ty
);
1669 fiz1
= _mm_add_pd(fiz1
,tz
);
1671 fjx3
= _mm_add_pd(fjx3
,tx
);
1672 fjy3
= _mm_add_pd(fjy3
,ty
);
1673 fjz3
= _mm_add_pd(fjz3
,tz
);
1675 /**************************
1676 * CALCULATE INTERACTIONS *
1677 **************************/
1679 /* COULOMB ELECTROSTATICS */
1680 velec
= _mm_mul_pd(qq21
,rinv21
);
1681 felec
= _mm_mul_pd(velec
,rinvsq21
);
1685 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1687 /* Calculate temporary vectorial force */
1688 tx
= _mm_mul_pd(fscal
,dx21
);
1689 ty
= _mm_mul_pd(fscal
,dy21
);
1690 tz
= _mm_mul_pd(fscal
,dz21
);
1692 /* Update vectorial force */
1693 fix2
= _mm_add_pd(fix2
,tx
);
1694 fiy2
= _mm_add_pd(fiy2
,ty
);
1695 fiz2
= _mm_add_pd(fiz2
,tz
);
1697 fjx1
= _mm_add_pd(fjx1
,tx
);
1698 fjy1
= _mm_add_pd(fjy1
,ty
);
1699 fjz1
= _mm_add_pd(fjz1
,tz
);
1701 /**************************
1702 * CALCULATE INTERACTIONS *
1703 **************************/
1705 /* COULOMB ELECTROSTATICS */
1706 velec
= _mm_mul_pd(qq22
,rinv22
);
1707 felec
= _mm_mul_pd(velec
,rinvsq22
);
1711 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1713 /* Calculate temporary vectorial force */
1714 tx
= _mm_mul_pd(fscal
,dx22
);
1715 ty
= _mm_mul_pd(fscal
,dy22
);
1716 tz
= _mm_mul_pd(fscal
,dz22
);
1718 /* Update vectorial force */
1719 fix2
= _mm_add_pd(fix2
,tx
);
1720 fiy2
= _mm_add_pd(fiy2
,ty
);
1721 fiz2
= _mm_add_pd(fiz2
,tz
);
1723 fjx2
= _mm_add_pd(fjx2
,tx
);
1724 fjy2
= _mm_add_pd(fjy2
,ty
);
1725 fjz2
= _mm_add_pd(fjz2
,tz
);
1727 /**************************
1728 * CALCULATE INTERACTIONS *
1729 **************************/
1731 /* COULOMB ELECTROSTATICS */
1732 velec
= _mm_mul_pd(qq23
,rinv23
);
1733 felec
= _mm_mul_pd(velec
,rinvsq23
);
1737 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1739 /* Calculate temporary vectorial force */
1740 tx
= _mm_mul_pd(fscal
,dx23
);
1741 ty
= _mm_mul_pd(fscal
,dy23
);
1742 tz
= _mm_mul_pd(fscal
,dz23
);
1744 /* Update vectorial force */
1745 fix2
= _mm_add_pd(fix2
,tx
);
1746 fiy2
= _mm_add_pd(fiy2
,ty
);
1747 fiz2
= _mm_add_pd(fiz2
,tz
);
1749 fjx3
= _mm_add_pd(fjx3
,tx
);
1750 fjy3
= _mm_add_pd(fjy3
,ty
);
1751 fjz3
= _mm_add_pd(fjz3
,tz
);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 /* COULOMB ELECTROSTATICS */
1758 velec
= _mm_mul_pd(qq31
,rinv31
);
1759 felec
= _mm_mul_pd(velec
,rinvsq31
);
1763 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1765 /* Calculate temporary vectorial force */
1766 tx
= _mm_mul_pd(fscal
,dx31
);
1767 ty
= _mm_mul_pd(fscal
,dy31
);
1768 tz
= _mm_mul_pd(fscal
,dz31
);
1770 /* Update vectorial force */
1771 fix3
= _mm_add_pd(fix3
,tx
);
1772 fiy3
= _mm_add_pd(fiy3
,ty
);
1773 fiz3
= _mm_add_pd(fiz3
,tz
);
1775 fjx1
= _mm_add_pd(fjx1
,tx
);
1776 fjy1
= _mm_add_pd(fjy1
,ty
);
1777 fjz1
= _mm_add_pd(fjz1
,tz
);
1779 /**************************
1780 * CALCULATE INTERACTIONS *
1781 **************************/
1783 /* COULOMB ELECTROSTATICS */
1784 velec
= _mm_mul_pd(qq32
,rinv32
);
1785 felec
= _mm_mul_pd(velec
,rinvsq32
);
1789 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1791 /* Calculate temporary vectorial force */
1792 tx
= _mm_mul_pd(fscal
,dx32
);
1793 ty
= _mm_mul_pd(fscal
,dy32
);
1794 tz
= _mm_mul_pd(fscal
,dz32
);
1796 /* Update vectorial force */
1797 fix3
= _mm_add_pd(fix3
,tx
);
1798 fiy3
= _mm_add_pd(fiy3
,ty
);
1799 fiz3
= _mm_add_pd(fiz3
,tz
);
1801 fjx2
= _mm_add_pd(fjx2
,tx
);
1802 fjy2
= _mm_add_pd(fjy2
,ty
);
1803 fjz2
= _mm_add_pd(fjz2
,tz
);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 /* COULOMB ELECTROSTATICS */
1810 velec
= _mm_mul_pd(qq33
,rinv33
);
1811 felec
= _mm_mul_pd(velec
,rinvsq33
);
1815 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1817 /* Calculate temporary vectorial force */
1818 tx
= _mm_mul_pd(fscal
,dx33
);
1819 ty
= _mm_mul_pd(fscal
,dy33
);
1820 tz
= _mm_mul_pd(fscal
,dz33
);
1822 /* Update vectorial force */
1823 fix3
= _mm_add_pd(fix3
,tx
);
1824 fiy3
= _mm_add_pd(fiy3
,ty
);
1825 fiz3
= _mm_add_pd(fiz3
,tz
);
1827 fjx3
= _mm_add_pd(fjx3
,tx
);
1828 fjy3
= _mm_add_pd(fjy3
,ty
);
1829 fjz3
= _mm_add_pd(fjz3
,tz
);
1831 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1833 /* Inner loop uses 273 flops */
1836 /* End of innermost loop */
1838 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1839 f
+i_coord_offset
,fshift
+i_shift_offset
);
1841 /* Increment number of inner iterations */
1842 inneriter
+= j_index_end
- j_index_start
;
1844 /* Outer loop uses 24 flops */
1847 /* Increment number of outer iterations */
1850 /* Update outer/inner flops */
1852 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*273);