2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
56 * jnr indices corresponding to data put in the two positions of the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
73 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
74 int vdwjidx0A
,vdwjidx0B
;
75 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
76 int vdwjidx1A
,vdwjidx1B
;
77 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
78 int vdwjidx2A
,vdwjidx2B
;
79 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
80 int vdwjidx3A
,vdwjidx3B
;
81 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
82 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
83 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
84 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
85 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
86 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
87 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
88 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
89 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
90 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
91 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
100 __m128d dummy_mask
,cutoff_mask
;
101 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
102 __m128d one
= _mm_set1_pd(1.0);
103 __m128d two
= _mm_set1_pd(2.0);
109 jindex
= nlist
->jindex
;
111 shiftidx
= nlist
->shift
;
113 shiftvec
= fr
->shift_vec
[0];
114 fshift
= fr
->fshift
[0];
115 facel
= _mm_set1_pd(fr
->epsfac
);
116 charge
= mdatoms
->chargeA
;
117 nvdwtype
= fr
->ntype
;
119 vdwtype
= mdatoms
->typeA
;
121 /* Setup water-specific parameters */
122 inr
= nlist
->iinr
[0];
123 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
124 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
125 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
126 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
128 jq1
= _mm_set1_pd(charge
[inr
+1]);
129 jq2
= _mm_set1_pd(charge
[inr
+2]);
130 jq3
= _mm_set1_pd(charge
[inr
+3]);
131 vdwjidx0A
= 2*vdwtype
[inr
+0];
132 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
133 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
134 qq11
= _mm_mul_pd(iq1
,jq1
);
135 qq12
= _mm_mul_pd(iq1
,jq2
);
136 qq13
= _mm_mul_pd(iq1
,jq3
);
137 qq21
= _mm_mul_pd(iq2
,jq1
);
138 qq22
= _mm_mul_pd(iq2
,jq2
);
139 qq23
= _mm_mul_pd(iq2
,jq3
);
140 qq31
= _mm_mul_pd(iq3
,jq1
);
141 qq32
= _mm_mul_pd(iq3
,jq2
);
142 qq33
= _mm_mul_pd(iq3
,jq3
);
144 /* Avoid stupid compiler warnings */
152 /* Start outer loop over neighborlists */
153 for(iidx
=0; iidx
<nri
; iidx
++)
155 /* Load shift vector for this list */
156 i_shift_offset
= DIM
*shiftidx
[iidx
];
158 /* Load limits for loop over neighbors */
159 j_index_start
= jindex
[iidx
];
160 j_index_end
= jindex
[iidx
+1];
162 /* Get outer coordinate index */
164 i_coord_offset
= DIM
*inr
;
166 /* Load i particle coords and add shift vector */
167 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
168 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
170 fix0
= _mm_setzero_pd();
171 fiy0
= _mm_setzero_pd();
172 fiz0
= _mm_setzero_pd();
173 fix1
= _mm_setzero_pd();
174 fiy1
= _mm_setzero_pd();
175 fiz1
= _mm_setzero_pd();
176 fix2
= _mm_setzero_pd();
177 fiy2
= _mm_setzero_pd();
178 fiz2
= _mm_setzero_pd();
179 fix3
= _mm_setzero_pd();
180 fiy3
= _mm_setzero_pd();
181 fiz3
= _mm_setzero_pd();
183 /* Reset potential sums */
184 velecsum
= _mm_setzero_pd();
185 vvdwsum
= _mm_setzero_pd();
187 /* Start inner kernel loop */
188 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
191 /* Get j neighbor index, and coordinate index */
194 j_coord_offsetA
= DIM
*jnrA
;
195 j_coord_offsetB
= DIM
*jnrB
;
197 /* load j atom coordinates */
198 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
199 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
200 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
202 /* Calculate displacement vector */
203 dx00
= _mm_sub_pd(ix0
,jx0
);
204 dy00
= _mm_sub_pd(iy0
,jy0
);
205 dz00
= _mm_sub_pd(iz0
,jz0
);
206 dx11
= _mm_sub_pd(ix1
,jx1
);
207 dy11
= _mm_sub_pd(iy1
,jy1
);
208 dz11
= _mm_sub_pd(iz1
,jz1
);
209 dx12
= _mm_sub_pd(ix1
,jx2
);
210 dy12
= _mm_sub_pd(iy1
,jy2
);
211 dz12
= _mm_sub_pd(iz1
,jz2
);
212 dx13
= _mm_sub_pd(ix1
,jx3
);
213 dy13
= _mm_sub_pd(iy1
,jy3
);
214 dz13
= _mm_sub_pd(iz1
,jz3
);
215 dx21
= _mm_sub_pd(ix2
,jx1
);
216 dy21
= _mm_sub_pd(iy2
,jy1
);
217 dz21
= _mm_sub_pd(iz2
,jz1
);
218 dx22
= _mm_sub_pd(ix2
,jx2
);
219 dy22
= _mm_sub_pd(iy2
,jy2
);
220 dz22
= _mm_sub_pd(iz2
,jz2
);
221 dx23
= _mm_sub_pd(ix2
,jx3
);
222 dy23
= _mm_sub_pd(iy2
,jy3
);
223 dz23
= _mm_sub_pd(iz2
,jz3
);
224 dx31
= _mm_sub_pd(ix3
,jx1
);
225 dy31
= _mm_sub_pd(iy3
,jy1
);
226 dz31
= _mm_sub_pd(iz3
,jz1
);
227 dx32
= _mm_sub_pd(ix3
,jx2
);
228 dy32
= _mm_sub_pd(iy3
,jy2
);
229 dz32
= _mm_sub_pd(iz3
,jz2
);
230 dx33
= _mm_sub_pd(ix3
,jx3
);
231 dy33
= _mm_sub_pd(iy3
,jy3
);
232 dz33
= _mm_sub_pd(iz3
,jz3
);
234 /* Calculate squared distance and things based on it */
235 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
236 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
237 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
238 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
239 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
240 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
241 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
242 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
243 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
244 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
246 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
247 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
248 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
249 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
250 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
251 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
252 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
253 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
254 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
256 rinvsq00
= gmx_mm_inv_pd(rsq00
);
257 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
258 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
259 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
260 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
261 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
262 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
263 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
264 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
265 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
267 fjx0
= _mm_setzero_pd();
268 fjy0
= _mm_setzero_pd();
269 fjz0
= _mm_setzero_pd();
270 fjx1
= _mm_setzero_pd();
271 fjy1
= _mm_setzero_pd();
272 fjz1
= _mm_setzero_pd();
273 fjx2
= _mm_setzero_pd();
274 fjy2
= _mm_setzero_pd();
275 fjz2
= _mm_setzero_pd();
276 fjx3
= _mm_setzero_pd();
277 fjy3
= _mm_setzero_pd();
278 fjz3
= _mm_setzero_pd();
280 /**************************
281 * CALCULATE INTERACTIONS *
282 **************************/
284 /* LENNARD-JONES DISPERSION/REPULSION */
286 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
287 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
288 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
289 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
290 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
292 /* Update potential sum for this i atom from the interaction with this j atom. */
293 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
297 /* Calculate temporary vectorial force */
298 tx
= _mm_mul_pd(fscal
,dx00
);
299 ty
= _mm_mul_pd(fscal
,dy00
);
300 tz
= _mm_mul_pd(fscal
,dz00
);
302 /* Update vectorial force */
303 fix0
= _mm_add_pd(fix0
,tx
);
304 fiy0
= _mm_add_pd(fiy0
,ty
);
305 fiz0
= _mm_add_pd(fiz0
,tz
);
307 fjx0
= _mm_add_pd(fjx0
,tx
);
308 fjy0
= _mm_add_pd(fjy0
,ty
);
309 fjz0
= _mm_add_pd(fjz0
,tz
);
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 /* COULOMB ELECTROSTATICS */
316 velec
= _mm_mul_pd(qq11
,rinv11
);
317 felec
= _mm_mul_pd(velec
,rinvsq11
);
319 /* Update potential sum for this i atom from the interaction with this j atom. */
320 velecsum
= _mm_add_pd(velecsum
,velec
);
324 /* Calculate temporary vectorial force */
325 tx
= _mm_mul_pd(fscal
,dx11
);
326 ty
= _mm_mul_pd(fscal
,dy11
);
327 tz
= _mm_mul_pd(fscal
,dz11
);
329 /* Update vectorial force */
330 fix1
= _mm_add_pd(fix1
,tx
);
331 fiy1
= _mm_add_pd(fiy1
,ty
);
332 fiz1
= _mm_add_pd(fiz1
,tz
);
334 fjx1
= _mm_add_pd(fjx1
,tx
);
335 fjy1
= _mm_add_pd(fjy1
,ty
);
336 fjz1
= _mm_add_pd(fjz1
,tz
);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 /* COULOMB ELECTROSTATICS */
343 velec
= _mm_mul_pd(qq12
,rinv12
);
344 felec
= _mm_mul_pd(velec
,rinvsq12
);
346 /* Update potential sum for this i atom from the interaction with this j atom. */
347 velecsum
= _mm_add_pd(velecsum
,velec
);
351 /* Calculate temporary vectorial force */
352 tx
= _mm_mul_pd(fscal
,dx12
);
353 ty
= _mm_mul_pd(fscal
,dy12
);
354 tz
= _mm_mul_pd(fscal
,dz12
);
356 /* Update vectorial force */
357 fix1
= _mm_add_pd(fix1
,tx
);
358 fiy1
= _mm_add_pd(fiy1
,ty
);
359 fiz1
= _mm_add_pd(fiz1
,tz
);
361 fjx2
= _mm_add_pd(fjx2
,tx
);
362 fjy2
= _mm_add_pd(fjy2
,ty
);
363 fjz2
= _mm_add_pd(fjz2
,tz
);
365 /**************************
366 * CALCULATE INTERACTIONS *
367 **************************/
369 /* COULOMB ELECTROSTATICS */
370 velec
= _mm_mul_pd(qq13
,rinv13
);
371 felec
= _mm_mul_pd(velec
,rinvsq13
);
373 /* Update potential sum for this i atom from the interaction with this j atom. */
374 velecsum
= _mm_add_pd(velecsum
,velec
);
378 /* Calculate temporary vectorial force */
379 tx
= _mm_mul_pd(fscal
,dx13
);
380 ty
= _mm_mul_pd(fscal
,dy13
);
381 tz
= _mm_mul_pd(fscal
,dz13
);
383 /* Update vectorial force */
384 fix1
= _mm_add_pd(fix1
,tx
);
385 fiy1
= _mm_add_pd(fiy1
,ty
);
386 fiz1
= _mm_add_pd(fiz1
,tz
);
388 fjx3
= _mm_add_pd(fjx3
,tx
);
389 fjy3
= _mm_add_pd(fjy3
,ty
);
390 fjz3
= _mm_add_pd(fjz3
,tz
);
392 /**************************
393 * CALCULATE INTERACTIONS *
394 **************************/
396 /* COULOMB ELECTROSTATICS */
397 velec
= _mm_mul_pd(qq21
,rinv21
);
398 felec
= _mm_mul_pd(velec
,rinvsq21
);
400 /* Update potential sum for this i atom from the interaction with this j atom. */
401 velecsum
= _mm_add_pd(velecsum
,velec
);
405 /* Calculate temporary vectorial force */
406 tx
= _mm_mul_pd(fscal
,dx21
);
407 ty
= _mm_mul_pd(fscal
,dy21
);
408 tz
= _mm_mul_pd(fscal
,dz21
);
410 /* Update vectorial force */
411 fix2
= _mm_add_pd(fix2
,tx
);
412 fiy2
= _mm_add_pd(fiy2
,ty
);
413 fiz2
= _mm_add_pd(fiz2
,tz
);
415 fjx1
= _mm_add_pd(fjx1
,tx
);
416 fjy1
= _mm_add_pd(fjy1
,ty
);
417 fjz1
= _mm_add_pd(fjz1
,tz
);
419 /**************************
420 * CALCULATE INTERACTIONS *
421 **************************/
423 /* COULOMB ELECTROSTATICS */
424 velec
= _mm_mul_pd(qq22
,rinv22
);
425 felec
= _mm_mul_pd(velec
,rinvsq22
);
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum
= _mm_add_pd(velecsum
,velec
);
432 /* Calculate temporary vectorial force */
433 tx
= _mm_mul_pd(fscal
,dx22
);
434 ty
= _mm_mul_pd(fscal
,dy22
);
435 tz
= _mm_mul_pd(fscal
,dz22
);
437 /* Update vectorial force */
438 fix2
= _mm_add_pd(fix2
,tx
);
439 fiy2
= _mm_add_pd(fiy2
,ty
);
440 fiz2
= _mm_add_pd(fiz2
,tz
);
442 fjx2
= _mm_add_pd(fjx2
,tx
);
443 fjy2
= _mm_add_pd(fjy2
,ty
);
444 fjz2
= _mm_add_pd(fjz2
,tz
);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 /* COULOMB ELECTROSTATICS */
451 velec
= _mm_mul_pd(qq23
,rinv23
);
452 felec
= _mm_mul_pd(velec
,rinvsq23
);
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velecsum
= _mm_add_pd(velecsum
,velec
);
459 /* Calculate temporary vectorial force */
460 tx
= _mm_mul_pd(fscal
,dx23
);
461 ty
= _mm_mul_pd(fscal
,dy23
);
462 tz
= _mm_mul_pd(fscal
,dz23
);
464 /* Update vectorial force */
465 fix2
= _mm_add_pd(fix2
,tx
);
466 fiy2
= _mm_add_pd(fiy2
,ty
);
467 fiz2
= _mm_add_pd(fiz2
,tz
);
469 fjx3
= _mm_add_pd(fjx3
,tx
);
470 fjy3
= _mm_add_pd(fjy3
,ty
);
471 fjz3
= _mm_add_pd(fjz3
,tz
);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 /* COULOMB ELECTROSTATICS */
478 velec
= _mm_mul_pd(qq31
,rinv31
);
479 felec
= _mm_mul_pd(velec
,rinvsq31
);
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum
= _mm_add_pd(velecsum
,velec
);
486 /* Calculate temporary vectorial force */
487 tx
= _mm_mul_pd(fscal
,dx31
);
488 ty
= _mm_mul_pd(fscal
,dy31
);
489 tz
= _mm_mul_pd(fscal
,dz31
);
491 /* Update vectorial force */
492 fix3
= _mm_add_pd(fix3
,tx
);
493 fiy3
= _mm_add_pd(fiy3
,ty
);
494 fiz3
= _mm_add_pd(fiz3
,tz
);
496 fjx1
= _mm_add_pd(fjx1
,tx
);
497 fjy1
= _mm_add_pd(fjy1
,ty
);
498 fjz1
= _mm_add_pd(fjz1
,tz
);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 /* COULOMB ELECTROSTATICS */
505 velec
= _mm_mul_pd(qq32
,rinv32
);
506 felec
= _mm_mul_pd(velec
,rinvsq32
);
508 /* Update potential sum for this i atom from the interaction with this j atom. */
509 velecsum
= _mm_add_pd(velecsum
,velec
);
513 /* Calculate temporary vectorial force */
514 tx
= _mm_mul_pd(fscal
,dx32
);
515 ty
= _mm_mul_pd(fscal
,dy32
);
516 tz
= _mm_mul_pd(fscal
,dz32
);
518 /* Update vectorial force */
519 fix3
= _mm_add_pd(fix3
,tx
);
520 fiy3
= _mm_add_pd(fiy3
,ty
);
521 fiz3
= _mm_add_pd(fiz3
,tz
);
523 fjx2
= _mm_add_pd(fjx2
,tx
);
524 fjy2
= _mm_add_pd(fjy2
,ty
);
525 fjz2
= _mm_add_pd(fjz2
,tz
);
527 /**************************
528 * CALCULATE INTERACTIONS *
529 **************************/
531 /* COULOMB ELECTROSTATICS */
532 velec
= _mm_mul_pd(qq33
,rinv33
);
533 felec
= _mm_mul_pd(velec
,rinvsq33
);
535 /* Update potential sum for this i atom from the interaction with this j atom. */
536 velecsum
= _mm_add_pd(velecsum
,velec
);
540 /* Calculate temporary vectorial force */
541 tx
= _mm_mul_pd(fscal
,dx33
);
542 ty
= _mm_mul_pd(fscal
,dy33
);
543 tz
= _mm_mul_pd(fscal
,dz33
);
545 /* Update vectorial force */
546 fix3
= _mm_add_pd(fix3
,tx
);
547 fiy3
= _mm_add_pd(fiy3
,ty
);
548 fiz3
= _mm_add_pd(fiz3
,tz
);
550 fjx3
= _mm_add_pd(fjx3
,tx
);
551 fjy3
= _mm_add_pd(fjy3
,ty
);
552 fjz3
= _mm_add_pd(fjz3
,tz
);
554 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
556 /* Inner loop uses 287 flops */
563 j_coord_offsetA
= DIM
*jnrA
;
565 /* load j atom coordinates */
566 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
567 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
568 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
570 /* Calculate displacement vector */
571 dx00
= _mm_sub_pd(ix0
,jx0
);
572 dy00
= _mm_sub_pd(iy0
,jy0
);
573 dz00
= _mm_sub_pd(iz0
,jz0
);
574 dx11
= _mm_sub_pd(ix1
,jx1
);
575 dy11
= _mm_sub_pd(iy1
,jy1
);
576 dz11
= _mm_sub_pd(iz1
,jz1
);
577 dx12
= _mm_sub_pd(ix1
,jx2
);
578 dy12
= _mm_sub_pd(iy1
,jy2
);
579 dz12
= _mm_sub_pd(iz1
,jz2
);
580 dx13
= _mm_sub_pd(ix1
,jx3
);
581 dy13
= _mm_sub_pd(iy1
,jy3
);
582 dz13
= _mm_sub_pd(iz1
,jz3
);
583 dx21
= _mm_sub_pd(ix2
,jx1
);
584 dy21
= _mm_sub_pd(iy2
,jy1
);
585 dz21
= _mm_sub_pd(iz2
,jz1
);
586 dx22
= _mm_sub_pd(ix2
,jx2
);
587 dy22
= _mm_sub_pd(iy2
,jy2
);
588 dz22
= _mm_sub_pd(iz2
,jz2
);
589 dx23
= _mm_sub_pd(ix2
,jx3
);
590 dy23
= _mm_sub_pd(iy2
,jy3
);
591 dz23
= _mm_sub_pd(iz2
,jz3
);
592 dx31
= _mm_sub_pd(ix3
,jx1
);
593 dy31
= _mm_sub_pd(iy3
,jy1
);
594 dz31
= _mm_sub_pd(iz3
,jz1
);
595 dx32
= _mm_sub_pd(ix3
,jx2
);
596 dy32
= _mm_sub_pd(iy3
,jy2
);
597 dz32
= _mm_sub_pd(iz3
,jz2
);
598 dx33
= _mm_sub_pd(ix3
,jx3
);
599 dy33
= _mm_sub_pd(iy3
,jy3
);
600 dz33
= _mm_sub_pd(iz3
,jz3
);
602 /* Calculate squared distance and things based on it */
603 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
604 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
605 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
606 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
607 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
608 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
609 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
610 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
611 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
612 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
614 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
615 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
616 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
617 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
618 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
619 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
620 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
621 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
622 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
624 rinvsq00
= gmx_mm_inv_pd(rsq00
);
625 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
626 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
627 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
628 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
629 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
630 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
631 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
632 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
633 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
635 fjx0
= _mm_setzero_pd();
636 fjy0
= _mm_setzero_pd();
637 fjz0
= _mm_setzero_pd();
638 fjx1
= _mm_setzero_pd();
639 fjy1
= _mm_setzero_pd();
640 fjz1
= _mm_setzero_pd();
641 fjx2
= _mm_setzero_pd();
642 fjy2
= _mm_setzero_pd();
643 fjz2
= _mm_setzero_pd();
644 fjx3
= _mm_setzero_pd();
645 fjy3
= _mm_setzero_pd();
646 fjz3
= _mm_setzero_pd();
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 /* LENNARD-JONES DISPERSION/REPULSION */
654 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
655 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
656 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
657 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
658 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
662 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
666 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
668 /* Calculate temporary vectorial force */
669 tx
= _mm_mul_pd(fscal
,dx00
);
670 ty
= _mm_mul_pd(fscal
,dy00
);
671 tz
= _mm_mul_pd(fscal
,dz00
);
673 /* Update vectorial force */
674 fix0
= _mm_add_pd(fix0
,tx
);
675 fiy0
= _mm_add_pd(fiy0
,ty
);
676 fiz0
= _mm_add_pd(fiz0
,tz
);
678 fjx0
= _mm_add_pd(fjx0
,tx
);
679 fjy0
= _mm_add_pd(fjy0
,ty
);
680 fjz0
= _mm_add_pd(fjz0
,tz
);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 /* COULOMB ELECTROSTATICS */
687 velec
= _mm_mul_pd(qq11
,rinv11
);
688 felec
= _mm_mul_pd(velec
,rinvsq11
);
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
692 velecsum
= _mm_add_pd(velecsum
,velec
);
696 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
698 /* Calculate temporary vectorial force */
699 tx
= _mm_mul_pd(fscal
,dx11
);
700 ty
= _mm_mul_pd(fscal
,dy11
);
701 tz
= _mm_mul_pd(fscal
,dz11
);
703 /* Update vectorial force */
704 fix1
= _mm_add_pd(fix1
,tx
);
705 fiy1
= _mm_add_pd(fiy1
,ty
);
706 fiz1
= _mm_add_pd(fiz1
,tz
);
708 fjx1
= _mm_add_pd(fjx1
,tx
);
709 fjy1
= _mm_add_pd(fjy1
,ty
);
710 fjz1
= _mm_add_pd(fjz1
,tz
);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 /* COULOMB ELECTROSTATICS */
717 velec
= _mm_mul_pd(qq12
,rinv12
);
718 felec
= _mm_mul_pd(velec
,rinvsq12
);
720 /* Update potential sum for this i atom from the interaction with this j atom. */
721 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
722 velecsum
= _mm_add_pd(velecsum
,velec
);
726 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
728 /* Calculate temporary vectorial force */
729 tx
= _mm_mul_pd(fscal
,dx12
);
730 ty
= _mm_mul_pd(fscal
,dy12
);
731 tz
= _mm_mul_pd(fscal
,dz12
);
733 /* Update vectorial force */
734 fix1
= _mm_add_pd(fix1
,tx
);
735 fiy1
= _mm_add_pd(fiy1
,ty
);
736 fiz1
= _mm_add_pd(fiz1
,tz
);
738 fjx2
= _mm_add_pd(fjx2
,tx
);
739 fjy2
= _mm_add_pd(fjy2
,ty
);
740 fjz2
= _mm_add_pd(fjz2
,tz
);
742 /**************************
743 * CALCULATE INTERACTIONS *
744 **************************/
746 /* COULOMB ELECTROSTATICS */
747 velec
= _mm_mul_pd(qq13
,rinv13
);
748 felec
= _mm_mul_pd(velec
,rinvsq13
);
750 /* Update potential sum for this i atom from the interaction with this j atom. */
751 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
752 velecsum
= _mm_add_pd(velecsum
,velec
);
756 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
758 /* Calculate temporary vectorial force */
759 tx
= _mm_mul_pd(fscal
,dx13
);
760 ty
= _mm_mul_pd(fscal
,dy13
);
761 tz
= _mm_mul_pd(fscal
,dz13
);
763 /* Update vectorial force */
764 fix1
= _mm_add_pd(fix1
,tx
);
765 fiy1
= _mm_add_pd(fiy1
,ty
);
766 fiz1
= _mm_add_pd(fiz1
,tz
);
768 fjx3
= _mm_add_pd(fjx3
,tx
);
769 fjy3
= _mm_add_pd(fjy3
,ty
);
770 fjz3
= _mm_add_pd(fjz3
,tz
);
772 /**************************
773 * CALCULATE INTERACTIONS *
774 **************************/
776 /* COULOMB ELECTROSTATICS */
777 velec
= _mm_mul_pd(qq21
,rinv21
);
778 felec
= _mm_mul_pd(velec
,rinvsq21
);
780 /* Update potential sum for this i atom from the interaction with this j atom. */
781 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
782 velecsum
= _mm_add_pd(velecsum
,velec
);
786 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
788 /* Calculate temporary vectorial force */
789 tx
= _mm_mul_pd(fscal
,dx21
);
790 ty
= _mm_mul_pd(fscal
,dy21
);
791 tz
= _mm_mul_pd(fscal
,dz21
);
793 /* Update vectorial force */
794 fix2
= _mm_add_pd(fix2
,tx
);
795 fiy2
= _mm_add_pd(fiy2
,ty
);
796 fiz2
= _mm_add_pd(fiz2
,tz
);
798 fjx1
= _mm_add_pd(fjx1
,tx
);
799 fjy1
= _mm_add_pd(fjy1
,ty
);
800 fjz1
= _mm_add_pd(fjz1
,tz
);
802 /**************************
803 * CALCULATE INTERACTIONS *
804 **************************/
806 /* COULOMB ELECTROSTATICS */
807 velec
= _mm_mul_pd(qq22
,rinv22
);
808 felec
= _mm_mul_pd(velec
,rinvsq22
);
810 /* Update potential sum for this i atom from the interaction with this j atom. */
811 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
812 velecsum
= _mm_add_pd(velecsum
,velec
);
816 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
818 /* Calculate temporary vectorial force */
819 tx
= _mm_mul_pd(fscal
,dx22
);
820 ty
= _mm_mul_pd(fscal
,dy22
);
821 tz
= _mm_mul_pd(fscal
,dz22
);
823 /* Update vectorial force */
824 fix2
= _mm_add_pd(fix2
,tx
);
825 fiy2
= _mm_add_pd(fiy2
,ty
);
826 fiz2
= _mm_add_pd(fiz2
,tz
);
828 fjx2
= _mm_add_pd(fjx2
,tx
);
829 fjy2
= _mm_add_pd(fjy2
,ty
);
830 fjz2
= _mm_add_pd(fjz2
,tz
);
832 /**************************
833 * CALCULATE INTERACTIONS *
834 **************************/
836 /* COULOMB ELECTROSTATICS */
837 velec
= _mm_mul_pd(qq23
,rinv23
);
838 felec
= _mm_mul_pd(velec
,rinvsq23
);
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
842 velecsum
= _mm_add_pd(velecsum
,velec
);
846 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
848 /* Calculate temporary vectorial force */
849 tx
= _mm_mul_pd(fscal
,dx23
);
850 ty
= _mm_mul_pd(fscal
,dy23
);
851 tz
= _mm_mul_pd(fscal
,dz23
);
853 /* Update vectorial force */
854 fix2
= _mm_add_pd(fix2
,tx
);
855 fiy2
= _mm_add_pd(fiy2
,ty
);
856 fiz2
= _mm_add_pd(fiz2
,tz
);
858 fjx3
= _mm_add_pd(fjx3
,tx
);
859 fjy3
= _mm_add_pd(fjy3
,ty
);
860 fjz3
= _mm_add_pd(fjz3
,tz
);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 /* COULOMB ELECTROSTATICS */
867 velec
= _mm_mul_pd(qq31
,rinv31
);
868 felec
= _mm_mul_pd(velec
,rinvsq31
);
870 /* Update potential sum for this i atom from the interaction with this j atom. */
871 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
872 velecsum
= _mm_add_pd(velecsum
,velec
);
876 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
878 /* Calculate temporary vectorial force */
879 tx
= _mm_mul_pd(fscal
,dx31
);
880 ty
= _mm_mul_pd(fscal
,dy31
);
881 tz
= _mm_mul_pd(fscal
,dz31
);
883 /* Update vectorial force */
884 fix3
= _mm_add_pd(fix3
,tx
);
885 fiy3
= _mm_add_pd(fiy3
,ty
);
886 fiz3
= _mm_add_pd(fiz3
,tz
);
888 fjx1
= _mm_add_pd(fjx1
,tx
);
889 fjy1
= _mm_add_pd(fjy1
,ty
);
890 fjz1
= _mm_add_pd(fjz1
,tz
);
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 /* COULOMB ELECTROSTATICS */
897 velec
= _mm_mul_pd(qq32
,rinv32
);
898 felec
= _mm_mul_pd(velec
,rinvsq32
);
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
902 velecsum
= _mm_add_pd(velecsum
,velec
);
906 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
908 /* Calculate temporary vectorial force */
909 tx
= _mm_mul_pd(fscal
,dx32
);
910 ty
= _mm_mul_pd(fscal
,dy32
);
911 tz
= _mm_mul_pd(fscal
,dz32
);
913 /* Update vectorial force */
914 fix3
= _mm_add_pd(fix3
,tx
);
915 fiy3
= _mm_add_pd(fiy3
,ty
);
916 fiz3
= _mm_add_pd(fiz3
,tz
);
918 fjx2
= _mm_add_pd(fjx2
,tx
);
919 fjy2
= _mm_add_pd(fjy2
,ty
);
920 fjz2
= _mm_add_pd(fjz2
,tz
);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 /* COULOMB ELECTROSTATICS */
927 velec
= _mm_mul_pd(qq33
,rinv33
);
928 felec
= _mm_mul_pd(velec
,rinvsq33
);
930 /* Update potential sum for this i atom from the interaction with this j atom. */
931 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
932 velecsum
= _mm_add_pd(velecsum
,velec
);
936 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
938 /* Calculate temporary vectorial force */
939 tx
= _mm_mul_pd(fscal
,dx33
);
940 ty
= _mm_mul_pd(fscal
,dy33
);
941 tz
= _mm_mul_pd(fscal
,dz33
);
943 /* Update vectorial force */
944 fix3
= _mm_add_pd(fix3
,tx
);
945 fiy3
= _mm_add_pd(fiy3
,ty
);
946 fiz3
= _mm_add_pd(fiz3
,tz
);
948 fjx3
= _mm_add_pd(fjx3
,tx
);
949 fjy3
= _mm_add_pd(fjy3
,ty
);
950 fjz3
= _mm_add_pd(fjz3
,tz
);
952 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
954 /* Inner loop uses 287 flops */
957 /* End of innermost loop */
959 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
960 f
+i_coord_offset
,fshift
+i_shift_offset
);
963 /* Update potential energies */
964 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
965 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
967 /* Increment number of inner iterations */
968 inneriter
+= j_index_end
- j_index_start
;
970 /* Outer loop uses 26 flops */
973 /* Increment number of outer iterations */
976 /* Update outer/inner flops */
978 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*287);
981 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
982 * Electrostatics interaction: Coulomb
983 * VdW interaction: LennardJones
984 * Geometry: Water4-Water4
985 * Calculate force/pot: Force
988 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_sse2_double
989 (t_nblist
* gmx_restrict nlist
,
990 rvec
* gmx_restrict xx
,
991 rvec
* gmx_restrict ff
,
992 t_forcerec
* gmx_restrict fr
,
993 t_mdatoms
* gmx_restrict mdatoms
,
994 nb_kernel_data_t
* gmx_restrict kernel_data
,
995 t_nrnb
* gmx_restrict nrnb
)
997 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
998 * just 0 for non-waters.
999 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. the two different
1000 * jnr indices corresponding to data put in the two positions of the SIMD register.
1002 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1003 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1005 int j_coord_offsetA
,j_coord_offsetB
;
1006 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1007 real rcutoff_scalar
;
1008 real
*shiftvec
,*fshift
,*x
,*f
;
1009 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1011 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1013 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1015 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1017 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1018 int vdwjidx0A
,vdwjidx0B
;
1019 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1020 int vdwjidx1A
,vdwjidx1B
;
1021 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1022 int vdwjidx2A
,vdwjidx2B
;
1023 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1024 int vdwjidx3A
,vdwjidx3B
;
1025 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1026 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1027 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1028 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1029 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1030 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1031 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1032 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1033 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1034 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1035 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1036 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1039 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1042 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1043 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1044 __m128d dummy_mask
,cutoff_mask
;
1045 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1046 __m128d one
= _mm_set1_pd(1.0);
1047 __m128d two
= _mm_set1_pd(2.0);
1053 jindex
= nlist
->jindex
;
1055 shiftidx
= nlist
->shift
;
1057 shiftvec
= fr
->shift_vec
[0];
1058 fshift
= fr
->fshift
[0];
1059 facel
= _mm_set1_pd(fr
->epsfac
);
1060 charge
= mdatoms
->chargeA
;
1061 nvdwtype
= fr
->ntype
;
1062 vdwparam
= fr
->nbfp
;
1063 vdwtype
= mdatoms
->typeA
;
1065 /* Setup water-specific parameters */
1066 inr
= nlist
->iinr
[0];
1067 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1068 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1069 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1070 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1072 jq1
= _mm_set1_pd(charge
[inr
+1]);
1073 jq2
= _mm_set1_pd(charge
[inr
+2]);
1074 jq3
= _mm_set1_pd(charge
[inr
+3]);
1075 vdwjidx0A
= 2*vdwtype
[inr
+0];
1076 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1077 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1078 qq11
= _mm_mul_pd(iq1
,jq1
);
1079 qq12
= _mm_mul_pd(iq1
,jq2
);
1080 qq13
= _mm_mul_pd(iq1
,jq3
);
1081 qq21
= _mm_mul_pd(iq2
,jq1
);
1082 qq22
= _mm_mul_pd(iq2
,jq2
);
1083 qq23
= _mm_mul_pd(iq2
,jq3
);
1084 qq31
= _mm_mul_pd(iq3
,jq1
);
1085 qq32
= _mm_mul_pd(iq3
,jq2
);
1086 qq33
= _mm_mul_pd(iq3
,jq3
);
1088 /* Avoid stupid compiler warnings */
1090 j_coord_offsetA
= 0;
1091 j_coord_offsetB
= 0;
1096 /* Start outer loop over neighborlists */
1097 for(iidx
=0; iidx
<nri
; iidx
++)
1099 /* Load shift vector for this list */
1100 i_shift_offset
= DIM
*shiftidx
[iidx
];
1102 /* Load limits for loop over neighbors */
1103 j_index_start
= jindex
[iidx
];
1104 j_index_end
= jindex
[iidx
+1];
1106 /* Get outer coordinate index */
1108 i_coord_offset
= DIM
*inr
;
1110 /* Load i particle coords and add shift vector */
1111 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1112 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1114 fix0
= _mm_setzero_pd();
1115 fiy0
= _mm_setzero_pd();
1116 fiz0
= _mm_setzero_pd();
1117 fix1
= _mm_setzero_pd();
1118 fiy1
= _mm_setzero_pd();
1119 fiz1
= _mm_setzero_pd();
1120 fix2
= _mm_setzero_pd();
1121 fiy2
= _mm_setzero_pd();
1122 fiz2
= _mm_setzero_pd();
1123 fix3
= _mm_setzero_pd();
1124 fiy3
= _mm_setzero_pd();
1125 fiz3
= _mm_setzero_pd();
1127 /* Start inner kernel loop */
1128 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1131 /* Get j neighbor index, and coordinate index */
1133 jnrB
= jjnr
[jidx
+1];
1134 j_coord_offsetA
= DIM
*jnrA
;
1135 j_coord_offsetB
= DIM
*jnrB
;
1137 /* load j atom coordinates */
1138 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1139 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1140 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1142 /* Calculate displacement vector */
1143 dx00
= _mm_sub_pd(ix0
,jx0
);
1144 dy00
= _mm_sub_pd(iy0
,jy0
);
1145 dz00
= _mm_sub_pd(iz0
,jz0
);
1146 dx11
= _mm_sub_pd(ix1
,jx1
);
1147 dy11
= _mm_sub_pd(iy1
,jy1
);
1148 dz11
= _mm_sub_pd(iz1
,jz1
);
1149 dx12
= _mm_sub_pd(ix1
,jx2
);
1150 dy12
= _mm_sub_pd(iy1
,jy2
);
1151 dz12
= _mm_sub_pd(iz1
,jz2
);
1152 dx13
= _mm_sub_pd(ix1
,jx3
);
1153 dy13
= _mm_sub_pd(iy1
,jy3
);
1154 dz13
= _mm_sub_pd(iz1
,jz3
);
1155 dx21
= _mm_sub_pd(ix2
,jx1
);
1156 dy21
= _mm_sub_pd(iy2
,jy1
);
1157 dz21
= _mm_sub_pd(iz2
,jz1
);
1158 dx22
= _mm_sub_pd(ix2
,jx2
);
1159 dy22
= _mm_sub_pd(iy2
,jy2
);
1160 dz22
= _mm_sub_pd(iz2
,jz2
);
1161 dx23
= _mm_sub_pd(ix2
,jx3
);
1162 dy23
= _mm_sub_pd(iy2
,jy3
);
1163 dz23
= _mm_sub_pd(iz2
,jz3
);
1164 dx31
= _mm_sub_pd(ix3
,jx1
);
1165 dy31
= _mm_sub_pd(iy3
,jy1
);
1166 dz31
= _mm_sub_pd(iz3
,jz1
);
1167 dx32
= _mm_sub_pd(ix3
,jx2
);
1168 dy32
= _mm_sub_pd(iy3
,jy2
);
1169 dz32
= _mm_sub_pd(iz3
,jz2
);
1170 dx33
= _mm_sub_pd(ix3
,jx3
);
1171 dy33
= _mm_sub_pd(iy3
,jy3
);
1172 dz33
= _mm_sub_pd(iz3
,jz3
);
1174 /* Calculate squared distance and things based on it */
1175 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1176 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1177 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1178 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1179 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1180 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1181 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1182 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1183 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1184 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1186 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1187 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1188 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1189 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1190 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1191 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1192 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1193 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1194 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1196 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1197 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1198 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1199 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1200 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1201 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1202 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1203 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1204 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1205 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1207 fjx0
= _mm_setzero_pd();
1208 fjy0
= _mm_setzero_pd();
1209 fjz0
= _mm_setzero_pd();
1210 fjx1
= _mm_setzero_pd();
1211 fjy1
= _mm_setzero_pd();
1212 fjz1
= _mm_setzero_pd();
1213 fjx2
= _mm_setzero_pd();
1214 fjy2
= _mm_setzero_pd();
1215 fjz2
= _mm_setzero_pd();
1216 fjx3
= _mm_setzero_pd();
1217 fjy3
= _mm_setzero_pd();
1218 fjz3
= _mm_setzero_pd();
1220 /**************************
1221 * CALCULATE INTERACTIONS *
1222 **************************/
1224 /* LENNARD-JONES DISPERSION/REPULSION */
1226 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1227 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1231 /* Calculate temporary vectorial force */
1232 tx
= _mm_mul_pd(fscal
,dx00
);
1233 ty
= _mm_mul_pd(fscal
,dy00
);
1234 tz
= _mm_mul_pd(fscal
,dz00
);
1236 /* Update vectorial force */
1237 fix0
= _mm_add_pd(fix0
,tx
);
1238 fiy0
= _mm_add_pd(fiy0
,ty
);
1239 fiz0
= _mm_add_pd(fiz0
,tz
);
1241 fjx0
= _mm_add_pd(fjx0
,tx
);
1242 fjy0
= _mm_add_pd(fjy0
,ty
);
1243 fjz0
= _mm_add_pd(fjz0
,tz
);
1245 /**************************
1246 * CALCULATE INTERACTIONS *
1247 **************************/
1249 /* COULOMB ELECTROSTATICS */
1250 velec
= _mm_mul_pd(qq11
,rinv11
);
1251 felec
= _mm_mul_pd(velec
,rinvsq11
);
1255 /* Calculate temporary vectorial force */
1256 tx
= _mm_mul_pd(fscal
,dx11
);
1257 ty
= _mm_mul_pd(fscal
,dy11
);
1258 tz
= _mm_mul_pd(fscal
,dz11
);
1260 /* Update vectorial force */
1261 fix1
= _mm_add_pd(fix1
,tx
);
1262 fiy1
= _mm_add_pd(fiy1
,ty
);
1263 fiz1
= _mm_add_pd(fiz1
,tz
);
1265 fjx1
= _mm_add_pd(fjx1
,tx
);
1266 fjy1
= _mm_add_pd(fjy1
,ty
);
1267 fjz1
= _mm_add_pd(fjz1
,tz
);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 /* COULOMB ELECTROSTATICS */
1274 velec
= _mm_mul_pd(qq12
,rinv12
);
1275 felec
= _mm_mul_pd(velec
,rinvsq12
);
1279 /* Calculate temporary vectorial force */
1280 tx
= _mm_mul_pd(fscal
,dx12
);
1281 ty
= _mm_mul_pd(fscal
,dy12
);
1282 tz
= _mm_mul_pd(fscal
,dz12
);
1284 /* Update vectorial force */
1285 fix1
= _mm_add_pd(fix1
,tx
);
1286 fiy1
= _mm_add_pd(fiy1
,ty
);
1287 fiz1
= _mm_add_pd(fiz1
,tz
);
1289 fjx2
= _mm_add_pd(fjx2
,tx
);
1290 fjy2
= _mm_add_pd(fjy2
,ty
);
1291 fjz2
= _mm_add_pd(fjz2
,tz
);
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 /* COULOMB ELECTROSTATICS */
1298 velec
= _mm_mul_pd(qq13
,rinv13
);
1299 felec
= _mm_mul_pd(velec
,rinvsq13
);
1303 /* Calculate temporary vectorial force */
1304 tx
= _mm_mul_pd(fscal
,dx13
);
1305 ty
= _mm_mul_pd(fscal
,dy13
);
1306 tz
= _mm_mul_pd(fscal
,dz13
);
1308 /* Update vectorial force */
1309 fix1
= _mm_add_pd(fix1
,tx
);
1310 fiy1
= _mm_add_pd(fiy1
,ty
);
1311 fiz1
= _mm_add_pd(fiz1
,tz
);
1313 fjx3
= _mm_add_pd(fjx3
,tx
);
1314 fjy3
= _mm_add_pd(fjy3
,ty
);
1315 fjz3
= _mm_add_pd(fjz3
,tz
);
1317 /**************************
1318 * CALCULATE INTERACTIONS *
1319 **************************/
1321 /* COULOMB ELECTROSTATICS */
1322 velec
= _mm_mul_pd(qq21
,rinv21
);
1323 felec
= _mm_mul_pd(velec
,rinvsq21
);
1327 /* Calculate temporary vectorial force */
1328 tx
= _mm_mul_pd(fscal
,dx21
);
1329 ty
= _mm_mul_pd(fscal
,dy21
);
1330 tz
= _mm_mul_pd(fscal
,dz21
);
1332 /* Update vectorial force */
1333 fix2
= _mm_add_pd(fix2
,tx
);
1334 fiy2
= _mm_add_pd(fiy2
,ty
);
1335 fiz2
= _mm_add_pd(fiz2
,tz
);
1337 fjx1
= _mm_add_pd(fjx1
,tx
);
1338 fjy1
= _mm_add_pd(fjy1
,ty
);
1339 fjz1
= _mm_add_pd(fjz1
,tz
);
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 /* COULOMB ELECTROSTATICS */
1346 velec
= _mm_mul_pd(qq22
,rinv22
);
1347 felec
= _mm_mul_pd(velec
,rinvsq22
);
1351 /* Calculate temporary vectorial force */
1352 tx
= _mm_mul_pd(fscal
,dx22
);
1353 ty
= _mm_mul_pd(fscal
,dy22
);
1354 tz
= _mm_mul_pd(fscal
,dz22
);
1356 /* Update vectorial force */
1357 fix2
= _mm_add_pd(fix2
,tx
);
1358 fiy2
= _mm_add_pd(fiy2
,ty
);
1359 fiz2
= _mm_add_pd(fiz2
,tz
);
1361 fjx2
= _mm_add_pd(fjx2
,tx
);
1362 fjy2
= _mm_add_pd(fjy2
,ty
);
1363 fjz2
= _mm_add_pd(fjz2
,tz
);
1365 /**************************
1366 * CALCULATE INTERACTIONS *
1367 **************************/
1369 /* COULOMB ELECTROSTATICS */
1370 velec
= _mm_mul_pd(qq23
,rinv23
);
1371 felec
= _mm_mul_pd(velec
,rinvsq23
);
1375 /* Calculate temporary vectorial force */
1376 tx
= _mm_mul_pd(fscal
,dx23
);
1377 ty
= _mm_mul_pd(fscal
,dy23
);
1378 tz
= _mm_mul_pd(fscal
,dz23
);
1380 /* Update vectorial force */
1381 fix2
= _mm_add_pd(fix2
,tx
);
1382 fiy2
= _mm_add_pd(fiy2
,ty
);
1383 fiz2
= _mm_add_pd(fiz2
,tz
);
1385 fjx3
= _mm_add_pd(fjx3
,tx
);
1386 fjy3
= _mm_add_pd(fjy3
,ty
);
1387 fjz3
= _mm_add_pd(fjz3
,tz
);
1389 /**************************
1390 * CALCULATE INTERACTIONS *
1391 **************************/
1393 /* COULOMB ELECTROSTATICS */
1394 velec
= _mm_mul_pd(qq31
,rinv31
);
1395 felec
= _mm_mul_pd(velec
,rinvsq31
);
1399 /* Calculate temporary vectorial force */
1400 tx
= _mm_mul_pd(fscal
,dx31
);
1401 ty
= _mm_mul_pd(fscal
,dy31
);
1402 tz
= _mm_mul_pd(fscal
,dz31
);
1404 /* Update vectorial force */
1405 fix3
= _mm_add_pd(fix3
,tx
);
1406 fiy3
= _mm_add_pd(fiy3
,ty
);
1407 fiz3
= _mm_add_pd(fiz3
,tz
);
1409 fjx1
= _mm_add_pd(fjx1
,tx
);
1410 fjy1
= _mm_add_pd(fjy1
,ty
);
1411 fjz1
= _mm_add_pd(fjz1
,tz
);
1413 /**************************
1414 * CALCULATE INTERACTIONS *
1415 **************************/
1417 /* COULOMB ELECTROSTATICS */
1418 velec
= _mm_mul_pd(qq32
,rinv32
);
1419 felec
= _mm_mul_pd(velec
,rinvsq32
);
1423 /* Calculate temporary vectorial force */
1424 tx
= _mm_mul_pd(fscal
,dx32
);
1425 ty
= _mm_mul_pd(fscal
,dy32
);
1426 tz
= _mm_mul_pd(fscal
,dz32
);
1428 /* Update vectorial force */
1429 fix3
= _mm_add_pd(fix3
,tx
);
1430 fiy3
= _mm_add_pd(fiy3
,ty
);
1431 fiz3
= _mm_add_pd(fiz3
,tz
);
1433 fjx2
= _mm_add_pd(fjx2
,tx
);
1434 fjy2
= _mm_add_pd(fjy2
,ty
);
1435 fjz2
= _mm_add_pd(fjz2
,tz
);
1437 /**************************
1438 * CALCULATE INTERACTIONS *
1439 **************************/
1441 /* COULOMB ELECTROSTATICS */
1442 velec
= _mm_mul_pd(qq33
,rinv33
);
1443 felec
= _mm_mul_pd(velec
,rinvsq33
);
1447 /* Calculate temporary vectorial force */
1448 tx
= _mm_mul_pd(fscal
,dx33
);
1449 ty
= _mm_mul_pd(fscal
,dy33
);
1450 tz
= _mm_mul_pd(fscal
,dz33
);
1452 /* Update vectorial force */
1453 fix3
= _mm_add_pd(fix3
,tx
);
1454 fiy3
= _mm_add_pd(fiy3
,ty
);
1455 fiz3
= _mm_add_pd(fiz3
,tz
);
1457 fjx3
= _mm_add_pd(fjx3
,tx
);
1458 fjy3
= _mm_add_pd(fjy3
,ty
);
1459 fjz3
= _mm_add_pd(fjz3
,tz
);
1461 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1463 /* Inner loop uses 273 flops */
1466 if(jidx
<j_index_end
)
1470 j_coord_offsetA
= DIM
*jnrA
;
1472 /* load j atom coordinates */
1473 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1474 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1475 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1477 /* Calculate displacement vector */
1478 dx00
= _mm_sub_pd(ix0
,jx0
);
1479 dy00
= _mm_sub_pd(iy0
,jy0
);
1480 dz00
= _mm_sub_pd(iz0
,jz0
);
1481 dx11
= _mm_sub_pd(ix1
,jx1
);
1482 dy11
= _mm_sub_pd(iy1
,jy1
);
1483 dz11
= _mm_sub_pd(iz1
,jz1
);
1484 dx12
= _mm_sub_pd(ix1
,jx2
);
1485 dy12
= _mm_sub_pd(iy1
,jy2
);
1486 dz12
= _mm_sub_pd(iz1
,jz2
);
1487 dx13
= _mm_sub_pd(ix1
,jx3
);
1488 dy13
= _mm_sub_pd(iy1
,jy3
);
1489 dz13
= _mm_sub_pd(iz1
,jz3
);
1490 dx21
= _mm_sub_pd(ix2
,jx1
);
1491 dy21
= _mm_sub_pd(iy2
,jy1
);
1492 dz21
= _mm_sub_pd(iz2
,jz1
);
1493 dx22
= _mm_sub_pd(ix2
,jx2
);
1494 dy22
= _mm_sub_pd(iy2
,jy2
);
1495 dz22
= _mm_sub_pd(iz2
,jz2
);
1496 dx23
= _mm_sub_pd(ix2
,jx3
);
1497 dy23
= _mm_sub_pd(iy2
,jy3
);
1498 dz23
= _mm_sub_pd(iz2
,jz3
);
1499 dx31
= _mm_sub_pd(ix3
,jx1
);
1500 dy31
= _mm_sub_pd(iy3
,jy1
);
1501 dz31
= _mm_sub_pd(iz3
,jz1
);
1502 dx32
= _mm_sub_pd(ix3
,jx2
);
1503 dy32
= _mm_sub_pd(iy3
,jy2
);
1504 dz32
= _mm_sub_pd(iz3
,jz2
);
1505 dx33
= _mm_sub_pd(ix3
,jx3
);
1506 dy33
= _mm_sub_pd(iy3
,jy3
);
1507 dz33
= _mm_sub_pd(iz3
,jz3
);
1509 /* Calculate squared distance and things based on it */
1510 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1511 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1512 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1513 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1514 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1515 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1516 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1517 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1518 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1519 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1521 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1522 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1523 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1524 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1525 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1526 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1527 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1528 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1529 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1531 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1532 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1533 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1534 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1535 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1536 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1537 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1538 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1539 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1540 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1542 fjx0
= _mm_setzero_pd();
1543 fjy0
= _mm_setzero_pd();
1544 fjz0
= _mm_setzero_pd();
1545 fjx1
= _mm_setzero_pd();
1546 fjy1
= _mm_setzero_pd();
1547 fjz1
= _mm_setzero_pd();
1548 fjx2
= _mm_setzero_pd();
1549 fjy2
= _mm_setzero_pd();
1550 fjz2
= _mm_setzero_pd();
1551 fjx3
= _mm_setzero_pd();
1552 fjy3
= _mm_setzero_pd();
1553 fjz3
= _mm_setzero_pd();
1555 /**************************
1556 * CALCULATE INTERACTIONS *
1557 **************************/
1559 /* LENNARD-JONES DISPERSION/REPULSION */
1561 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1562 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1566 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1568 /* Calculate temporary vectorial force */
1569 tx
= _mm_mul_pd(fscal
,dx00
);
1570 ty
= _mm_mul_pd(fscal
,dy00
);
1571 tz
= _mm_mul_pd(fscal
,dz00
);
1573 /* Update vectorial force */
1574 fix0
= _mm_add_pd(fix0
,tx
);
1575 fiy0
= _mm_add_pd(fiy0
,ty
);
1576 fiz0
= _mm_add_pd(fiz0
,tz
);
1578 fjx0
= _mm_add_pd(fjx0
,tx
);
1579 fjy0
= _mm_add_pd(fjy0
,ty
);
1580 fjz0
= _mm_add_pd(fjz0
,tz
);
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 /* COULOMB ELECTROSTATICS */
1587 velec
= _mm_mul_pd(qq11
,rinv11
);
1588 felec
= _mm_mul_pd(velec
,rinvsq11
);
1592 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1594 /* Calculate temporary vectorial force */
1595 tx
= _mm_mul_pd(fscal
,dx11
);
1596 ty
= _mm_mul_pd(fscal
,dy11
);
1597 tz
= _mm_mul_pd(fscal
,dz11
);
1599 /* Update vectorial force */
1600 fix1
= _mm_add_pd(fix1
,tx
);
1601 fiy1
= _mm_add_pd(fiy1
,ty
);
1602 fiz1
= _mm_add_pd(fiz1
,tz
);
1604 fjx1
= _mm_add_pd(fjx1
,tx
);
1605 fjy1
= _mm_add_pd(fjy1
,ty
);
1606 fjz1
= _mm_add_pd(fjz1
,tz
);
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 /* COULOMB ELECTROSTATICS */
1613 velec
= _mm_mul_pd(qq12
,rinv12
);
1614 felec
= _mm_mul_pd(velec
,rinvsq12
);
1618 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1620 /* Calculate temporary vectorial force */
1621 tx
= _mm_mul_pd(fscal
,dx12
);
1622 ty
= _mm_mul_pd(fscal
,dy12
);
1623 tz
= _mm_mul_pd(fscal
,dz12
);
1625 /* Update vectorial force */
1626 fix1
= _mm_add_pd(fix1
,tx
);
1627 fiy1
= _mm_add_pd(fiy1
,ty
);
1628 fiz1
= _mm_add_pd(fiz1
,tz
);
1630 fjx2
= _mm_add_pd(fjx2
,tx
);
1631 fjy2
= _mm_add_pd(fjy2
,ty
);
1632 fjz2
= _mm_add_pd(fjz2
,tz
);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 /* COULOMB ELECTROSTATICS */
1639 velec
= _mm_mul_pd(qq13
,rinv13
);
1640 felec
= _mm_mul_pd(velec
,rinvsq13
);
1644 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1646 /* Calculate temporary vectorial force */
1647 tx
= _mm_mul_pd(fscal
,dx13
);
1648 ty
= _mm_mul_pd(fscal
,dy13
);
1649 tz
= _mm_mul_pd(fscal
,dz13
);
1651 /* Update vectorial force */
1652 fix1
= _mm_add_pd(fix1
,tx
);
1653 fiy1
= _mm_add_pd(fiy1
,ty
);
1654 fiz1
= _mm_add_pd(fiz1
,tz
);
1656 fjx3
= _mm_add_pd(fjx3
,tx
);
1657 fjy3
= _mm_add_pd(fjy3
,ty
);
1658 fjz3
= _mm_add_pd(fjz3
,tz
);
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 /* COULOMB ELECTROSTATICS */
1665 velec
= _mm_mul_pd(qq21
,rinv21
);
1666 felec
= _mm_mul_pd(velec
,rinvsq21
);
1670 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1672 /* Calculate temporary vectorial force */
1673 tx
= _mm_mul_pd(fscal
,dx21
);
1674 ty
= _mm_mul_pd(fscal
,dy21
);
1675 tz
= _mm_mul_pd(fscal
,dz21
);
1677 /* Update vectorial force */
1678 fix2
= _mm_add_pd(fix2
,tx
);
1679 fiy2
= _mm_add_pd(fiy2
,ty
);
1680 fiz2
= _mm_add_pd(fiz2
,tz
);
1682 fjx1
= _mm_add_pd(fjx1
,tx
);
1683 fjy1
= _mm_add_pd(fjy1
,ty
);
1684 fjz1
= _mm_add_pd(fjz1
,tz
);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 /* COULOMB ELECTROSTATICS */
1691 velec
= _mm_mul_pd(qq22
,rinv22
);
1692 felec
= _mm_mul_pd(velec
,rinvsq22
);
1696 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1698 /* Calculate temporary vectorial force */
1699 tx
= _mm_mul_pd(fscal
,dx22
);
1700 ty
= _mm_mul_pd(fscal
,dy22
);
1701 tz
= _mm_mul_pd(fscal
,dz22
);
1703 /* Update vectorial force */
1704 fix2
= _mm_add_pd(fix2
,tx
);
1705 fiy2
= _mm_add_pd(fiy2
,ty
);
1706 fiz2
= _mm_add_pd(fiz2
,tz
);
1708 fjx2
= _mm_add_pd(fjx2
,tx
);
1709 fjy2
= _mm_add_pd(fjy2
,ty
);
1710 fjz2
= _mm_add_pd(fjz2
,tz
);
1712 /**************************
1713 * CALCULATE INTERACTIONS *
1714 **************************/
1716 /* COULOMB ELECTROSTATICS */
1717 velec
= _mm_mul_pd(qq23
,rinv23
);
1718 felec
= _mm_mul_pd(velec
,rinvsq23
);
1722 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1724 /* Calculate temporary vectorial force */
1725 tx
= _mm_mul_pd(fscal
,dx23
);
1726 ty
= _mm_mul_pd(fscal
,dy23
);
1727 tz
= _mm_mul_pd(fscal
,dz23
);
1729 /* Update vectorial force */
1730 fix2
= _mm_add_pd(fix2
,tx
);
1731 fiy2
= _mm_add_pd(fiy2
,ty
);
1732 fiz2
= _mm_add_pd(fiz2
,tz
);
1734 fjx3
= _mm_add_pd(fjx3
,tx
);
1735 fjy3
= _mm_add_pd(fjy3
,ty
);
1736 fjz3
= _mm_add_pd(fjz3
,tz
);
1738 /**************************
1739 * CALCULATE INTERACTIONS *
1740 **************************/
1742 /* COULOMB ELECTROSTATICS */
1743 velec
= _mm_mul_pd(qq31
,rinv31
);
1744 felec
= _mm_mul_pd(velec
,rinvsq31
);
1748 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1750 /* Calculate temporary vectorial force */
1751 tx
= _mm_mul_pd(fscal
,dx31
);
1752 ty
= _mm_mul_pd(fscal
,dy31
);
1753 tz
= _mm_mul_pd(fscal
,dz31
);
1755 /* Update vectorial force */
1756 fix3
= _mm_add_pd(fix3
,tx
);
1757 fiy3
= _mm_add_pd(fiy3
,ty
);
1758 fiz3
= _mm_add_pd(fiz3
,tz
);
1760 fjx1
= _mm_add_pd(fjx1
,tx
);
1761 fjy1
= _mm_add_pd(fjy1
,ty
);
1762 fjz1
= _mm_add_pd(fjz1
,tz
);
1764 /**************************
1765 * CALCULATE INTERACTIONS *
1766 **************************/
1768 /* COULOMB ELECTROSTATICS */
1769 velec
= _mm_mul_pd(qq32
,rinv32
);
1770 felec
= _mm_mul_pd(velec
,rinvsq32
);
1774 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1776 /* Calculate temporary vectorial force */
1777 tx
= _mm_mul_pd(fscal
,dx32
);
1778 ty
= _mm_mul_pd(fscal
,dy32
);
1779 tz
= _mm_mul_pd(fscal
,dz32
);
1781 /* Update vectorial force */
1782 fix3
= _mm_add_pd(fix3
,tx
);
1783 fiy3
= _mm_add_pd(fiy3
,ty
);
1784 fiz3
= _mm_add_pd(fiz3
,tz
);
1786 fjx2
= _mm_add_pd(fjx2
,tx
);
1787 fjy2
= _mm_add_pd(fjy2
,ty
);
1788 fjz2
= _mm_add_pd(fjz2
,tz
);
1790 /**************************
1791 * CALCULATE INTERACTIONS *
1792 **************************/
1794 /* COULOMB ELECTROSTATICS */
1795 velec
= _mm_mul_pd(qq33
,rinv33
);
1796 felec
= _mm_mul_pd(velec
,rinvsq33
);
1800 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1802 /* Calculate temporary vectorial force */
1803 tx
= _mm_mul_pd(fscal
,dx33
);
1804 ty
= _mm_mul_pd(fscal
,dy33
);
1805 tz
= _mm_mul_pd(fscal
,dz33
);
1807 /* Update vectorial force */
1808 fix3
= _mm_add_pd(fix3
,tx
);
1809 fiy3
= _mm_add_pd(fiy3
,ty
);
1810 fiz3
= _mm_add_pd(fiz3
,tz
);
1812 fjx3
= _mm_add_pd(fjx3
,tx
);
1813 fjy3
= _mm_add_pd(fjy3
,ty
);
1814 fjz3
= _mm_add_pd(fjz3
,tz
);
1816 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1818 /* Inner loop uses 273 flops */
1821 /* End of innermost loop */
1823 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1824 f
+i_coord_offset
,fshift
+i_shift_offset
);
1826 /* Increment number of inner iterations */
1827 inneriter
+= j_index_end
- j_index_start
;
1829 /* Outer loop uses 24 flops */
1832 /* Increment number of outer iterations */
1835 /* Update outer/inner flops */
1837 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*273);