2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
73 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
74 int vdwjidx0A
,vdwjidx0B
;
75 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
76 int vdwjidx1A
,vdwjidx1B
;
77 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
78 int vdwjidx2A
,vdwjidx2B
;
79 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
80 int vdwjidx3A
,vdwjidx3B
;
81 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
82 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
83 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
84 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
85 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
86 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
87 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
88 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
89 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
90 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
91 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
92 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
95 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
98 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
101 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
103 __m128d dummy_mask
,cutoff_mask
;
104 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
105 __m128d one
= _mm_set1_pd(1.0);
106 __m128d two
= _mm_set1_pd(2.0);
112 jindex
= nlist
->jindex
;
114 shiftidx
= nlist
->shift
;
116 shiftvec
= fr
->shift_vec
[0];
117 fshift
= fr
->fshift
[0];
118 facel
= _mm_set1_pd(fr
->epsfac
);
119 charge
= mdatoms
->chargeA
;
120 nvdwtype
= fr
->ntype
;
122 vdwtype
= mdatoms
->typeA
;
124 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
125 ewtab
= fr
->ic
->tabq_coul_FDV0
;
126 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
127 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
129 /* Setup water-specific parameters */
130 inr
= nlist
->iinr
[0];
131 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
132 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
133 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
134 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
136 jq1
= _mm_set1_pd(charge
[inr
+1]);
137 jq2
= _mm_set1_pd(charge
[inr
+2]);
138 jq3
= _mm_set1_pd(charge
[inr
+3]);
139 vdwjidx0A
= 2*vdwtype
[inr
+0];
140 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
141 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
142 qq11
= _mm_mul_pd(iq1
,jq1
);
143 qq12
= _mm_mul_pd(iq1
,jq2
);
144 qq13
= _mm_mul_pd(iq1
,jq3
);
145 qq21
= _mm_mul_pd(iq2
,jq1
);
146 qq22
= _mm_mul_pd(iq2
,jq2
);
147 qq23
= _mm_mul_pd(iq2
,jq3
);
148 qq31
= _mm_mul_pd(iq3
,jq1
);
149 qq32
= _mm_mul_pd(iq3
,jq2
);
150 qq33
= _mm_mul_pd(iq3
,jq3
);
152 /* Avoid stupid compiler warnings */
160 /* Start outer loop over neighborlists */
161 for(iidx
=0; iidx
<nri
; iidx
++)
163 /* Load shift vector for this list */
164 i_shift_offset
= DIM
*shiftidx
[iidx
];
166 /* Load limits for loop over neighbors */
167 j_index_start
= jindex
[iidx
];
168 j_index_end
= jindex
[iidx
+1];
170 /* Get outer coordinate index */
172 i_coord_offset
= DIM
*inr
;
174 /* Load i particle coords and add shift vector */
175 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
176 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
178 fix0
= _mm_setzero_pd();
179 fiy0
= _mm_setzero_pd();
180 fiz0
= _mm_setzero_pd();
181 fix1
= _mm_setzero_pd();
182 fiy1
= _mm_setzero_pd();
183 fiz1
= _mm_setzero_pd();
184 fix2
= _mm_setzero_pd();
185 fiy2
= _mm_setzero_pd();
186 fiz2
= _mm_setzero_pd();
187 fix3
= _mm_setzero_pd();
188 fiy3
= _mm_setzero_pd();
189 fiz3
= _mm_setzero_pd();
191 /* Reset potential sums */
192 velecsum
= _mm_setzero_pd();
193 vvdwsum
= _mm_setzero_pd();
195 /* Start inner kernel loop */
196 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
199 /* Get j neighbor index, and coordinate index */
202 j_coord_offsetA
= DIM
*jnrA
;
203 j_coord_offsetB
= DIM
*jnrB
;
205 /* load j atom coordinates */
206 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
207 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
208 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
210 /* Calculate displacement vector */
211 dx00
= _mm_sub_pd(ix0
,jx0
);
212 dy00
= _mm_sub_pd(iy0
,jy0
);
213 dz00
= _mm_sub_pd(iz0
,jz0
);
214 dx11
= _mm_sub_pd(ix1
,jx1
);
215 dy11
= _mm_sub_pd(iy1
,jy1
);
216 dz11
= _mm_sub_pd(iz1
,jz1
);
217 dx12
= _mm_sub_pd(ix1
,jx2
);
218 dy12
= _mm_sub_pd(iy1
,jy2
);
219 dz12
= _mm_sub_pd(iz1
,jz2
);
220 dx13
= _mm_sub_pd(ix1
,jx3
);
221 dy13
= _mm_sub_pd(iy1
,jy3
);
222 dz13
= _mm_sub_pd(iz1
,jz3
);
223 dx21
= _mm_sub_pd(ix2
,jx1
);
224 dy21
= _mm_sub_pd(iy2
,jy1
);
225 dz21
= _mm_sub_pd(iz2
,jz1
);
226 dx22
= _mm_sub_pd(ix2
,jx2
);
227 dy22
= _mm_sub_pd(iy2
,jy2
);
228 dz22
= _mm_sub_pd(iz2
,jz2
);
229 dx23
= _mm_sub_pd(ix2
,jx3
);
230 dy23
= _mm_sub_pd(iy2
,jy3
);
231 dz23
= _mm_sub_pd(iz2
,jz3
);
232 dx31
= _mm_sub_pd(ix3
,jx1
);
233 dy31
= _mm_sub_pd(iy3
,jy1
);
234 dz31
= _mm_sub_pd(iz3
,jz1
);
235 dx32
= _mm_sub_pd(ix3
,jx2
);
236 dy32
= _mm_sub_pd(iy3
,jy2
);
237 dz32
= _mm_sub_pd(iz3
,jz2
);
238 dx33
= _mm_sub_pd(ix3
,jx3
);
239 dy33
= _mm_sub_pd(iy3
,jy3
);
240 dz33
= _mm_sub_pd(iz3
,jz3
);
242 /* Calculate squared distance and things based on it */
243 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
244 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
245 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
246 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
247 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
248 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
249 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
250 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
251 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
252 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
254 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
255 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
256 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
257 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
258 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
259 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
260 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
261 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
262 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
264 rinvsq00
= gmx_mm_inv_pd(rsq00
);
265 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
266 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
267 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
268 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
269 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
270 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
271 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
272 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
273 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
275 fjx0
= _mm_setzero_pd();
276 fjy0
= _mm_setzero_pd();
277 fjz0
= _mm_setzero_pd();
278 fjx1
= _mm_setzero_pd();
279 fjy1
= _mm_setzero_pd();
280 fjz1
= _mm_setzero_pd();
281 fjx2
= _mm_setzero_pd();
282 fjy2
= _mm_setzero_pd();
283 fjz2
= _mm_setzero_pd();
284 fjx3
= _mm_setzero_pd();
285 fjy3
= _mm_setzero_pd();
286 fjz3
= _mm_setzero_pd();
288 /**************************
289 * CALCULATE INTERACTIONS *
290 **************************/
292 /* LENNARD-JONES DISPERSION/REPULSION */
294 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
295 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
296 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
297 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
298 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
300 /* Update potential sum for this i atom from the interaction with this j atom. */
301 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
305 /* Calculate temporary vectorial force */
306 tx
= _mm_mul_pd(fscal
,dx00
);
307 ty
= _mm_mul_pd(fscal
,dy00
);
308 tz
= _mm_mul_pd(fscal
,dz00
);
310 /* Update vectorial force */
311 fix0
= _mm_add_pd(fix0
,tx
);
312 fiy0
= _mm_add_pd(fiy0
,ty
);
313 fiz0
= _mm_add_pd(fiz0
,tz
);
315 fjx0
= _mm_add_pd(fjx0
,tx
);
316 fjy0
= _mm_add_pd(fjy0
,ty
);
317 fjz0
= _mm_add_pd(fjz0
,tz
);
319 /**************************
320 * CALCULATE INTERACTIONS *
321 **************************/
323 r11
= _mm_mul_pd(rsq11
,rinv11
);
325 /* EWALD ELECTROSTATICS */
327 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
328 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
329 ewitab
= _mm_cvttpd_epi32(ewrt
);
330 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
331 ewitab
= _mm_slli_epi32(ewitab
,2);
332 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
333 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
334 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
335 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
336 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
337 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
338 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
339 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
340 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
341 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum
= _mm_add_pd(velecsum
,velec
);
348 /* Calculate temporary vectorial force */
349 tx
= _mm_mul_pd(fscal
,dx11
);
350 ty
= _mm_mul_pd(fscal
,dy11
);
351 tz
= _mm_mul_pd(fscal
,dz11
);
353 /* Update vectorial force */
354 fix1
= _mm_add_pd(fix1
,tx
);
355 fiy1
= _mm_add_pd(fiy1
,ty
);
356 fiz1
= _mm_add_pd(fiz1
,tz
);
358 fjx1
= _mm_add_pd(fjx1
,tx
);
359 fjy1
= _mm_add_pd(fjy1
,ty
);
360 fjz1
= _mm_add_pd(fjz1
,tz
);
362 /**************************
363 * CALCULATE INTERACTIONS *
364 **************************/
366 r12
= _mm_mul_pd(rsq12
,rinv12
);
368 /* EWALD ELECTROSTATICS */
370 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
371 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
372 ewitab
= _mm_cvttpd_epi32(ewrt
);
373 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
374 ewitab
= _mm_slli_epi32(ewitab
,2);
375 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
376 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
377 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
378 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
379 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
380 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
381 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
382 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
383 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
384 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velecsum
= _mm_add_pd(velecsum
,velec
);
391 /* Calculate temporary vectorial force */
392 tx
= _mm_mul_pd(fscal
,dx12
);
393 ty
= _mm_mul_pd(fscal
,dy12
);
394 tz
= _mm_mul_pd(fscal
,dz12
);
396 /* Update vectorial force */
397 fix1
= _mm_add_pd(fix1
,tx
);
398 fiy1
= _mm_add_pd(fiy1
,ty
);
399 fiz1
= _mm_add_pd(fiz1
,tz
);
401 fjx2
= _mm_add_pd(fjx2
,tx
);
402 fjy2
= _mm_add_pd(fjy2
,ty
);
403 fjz2
= _mm_add_pd(fjz2
,tz
);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 r13
= _mm_mul_pd(rsq13
,rinv13
);
411 /* EWALD ELECTROSTATICS */
413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
415 ewitab
= _mm_cvttpd_epi32(ewrt
);
416 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
417 ewitab
= _mm_slli_epi32(ewitab
,2);
418 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
419 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
420 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
421 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
422 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
423 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
424 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
425 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
426 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
427 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
429 /* Update potential sum for this i atom from the interaction with this j atom. */
430 velecsum
= _mm_add_pd(velecsum
,velec
);
434 /* Calculate temporary vectorial force */
435 tx
= _mm_mul_pd(fscal
,dx13
);
436 ty
= _mm_mul_pd(fscal
,dy13
);
437 tz
= _mm_mul_pd(fscal
,dz13
);
439 /* Update vectorial force */
440 fix1
= _mm_add_pd(fix1
,tx
);
441 fiy1
= _mm_add_pd(fiy1
,ty
);
442 fiz1
= _mm_add_pd(fiz1
,tz
);
444 fjx3
= _mm_add_pd(fjx3
,tx
);
445 fjy3
= _mm_add_pd(fjy3
,ty
);
446 fjz3
= _mm_add_pd(fjz3
,tz
);
448 /**************************
449 * CALCULATE INTERACTIONS *
450 **************************/
452 r21
= _mm_mul_pd(rsq21
,rinv21
);
454 /* EWALD ELECTROSTATICS */
456 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
457 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
458 ewitab
= _mm_cvttpd_epi32(ewrt
);
459 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
460 ewitab
= _mm_slli_epi32(ewitab
,2);
461 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
462 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
463 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
464 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
465 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
466 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
467 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
468 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
469 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
470 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
472 /* Update potential sum for this i atom from the interaction with this j atom. */
473 velecsum
= _mm_add_pd(velecsum
,velec
);
477 /* Calculate temporary vectorial force */
478 tx
= _mm_mul_pd(fscal
,dx21
);
479 ty
= _mm_mul_pd(fscal
,dy21
);
480 tz
= _mm_mul_pd(fscal
,dz21
);
482 /* Update vectorial force */
483 fix2
= _mm_add_pd(fix2
,tx
);
484 fiy2
= _mm_add_pd(fiy2
,ty
);
485 fiz2
= _mm_add_pd(fiz2
,tz
);
487 fjx1
= _mm_add_pd(fjx1
,tx
);
488 fjy1
= _mm_add_pd(fjy1
,ty
);
489 fjz1
= _mm_add_pd(fjz1
,tz
);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 r22
= _mm_mul_pd(rsq22
,rinv22
);
497 /* EWALD ELECTROSTATICS */
499 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
500 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
501 ewitab
= _mm_cvttpd_epi32(ewrt
);
502 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
503 ewitab
= _mm_slli_epi32(ewitab
,2);
504 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
505 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
506 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
507 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
508 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
509 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
510 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
511 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
512 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
513 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
515 /* Update potential sum for this i atom from the interaction with this j atom. */
516 velecsum
= _mm_add_pd(velecsum
,velec
);
520 /* Calculate temporary vectorial force */
521 tx
= _mm_mul_pd(fscal
,dx22
);
522 ty
= _mm_mul_pd(fscal
,dy22
);
523 tz
= _mm_mul_pd(fscal
,dz22
);
525 /* Update vectorial force */
526 fix2
= _mm_add_pd(fix2
,tx
);
527 fiy2
= _mm_add_pd(fiy2
,ty
);
528 fiz2
= _mm_add_pd(fiz2
,tz
);
530 fjx2
= _mm_add_pd(fjx2
,tx
);
531 fjy2
= _mm_add_pd(fjy2
,ty
);
532 fjz2
= _mm_add_pd(fjz2
,tz
);
534 /**************************
535 * CALCULATE INTERACTIONS *
536 **************************/
538 r23
= _mm_mul_pd(rsq23
,rinv23
);
540 /* EWALD ELECTROSTATICS */
542 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
543 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
544 ewitab
= _mm_cvttpd_epi32(ewrt
);
545 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
546 ewitab
= _mm_slli_epi32(ewitab
,2);
547 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
548 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
549 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
550 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
551 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
552 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
553 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
554 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
555 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
556 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
558 /* Update potential sum for this i atom from the interaction with this j atom. */
559 velecsum
= _mm_add_pd(velecsum
,velec
);
563 /* Calculate temporary vectorial force */
564 tx
= _mm_mul_pd(fscal
,dx23
);
565 ty
= _mm_mul_pd(fscal
,dy23
);
566 tz
= _mm_mul_pd(fscal
,dz23
);
568 /* Update vectorial force */
569 fix2
= _mm_add_pd(fix2
,tx
);
570 fiy2
= _mm_add_pd(fiy2
,ty
);
571 fiz2
= _mm_add_pd(fiz2
,tz
);
573 fjx3
= _mm_add_pd(fjx3
,tx
);
574 fjy3
= _mm_add_pd(fjy3
,ty
);
575 fjz3
= _mm_add_pd(fjz3
,tz
);
577 /**************************
578 * CALCULATE INTERACTIONS *
579 **************************/
581 r31
= _mm_mul_pd(rsq31
,rinv31
);
583 /* EWALD ELECTROSTATICS */
585 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
586 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
587 ewitab
= _mm_cvttpd_epi32(ewrt
);
588 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
589 ewitab
= _mm_slli_epi32(ewitab
,2);
590 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
591 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
592 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
593 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
594 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
595 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
596 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
597 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
598 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
599 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
601 /* Update potential sum for this i atom from the interaction with this j atom. */
602 velecsum
= _mm_add_pd(velecsum
,velec
);
606 /* Calculate temporary vectorial force */
607 tx
= _mm_mul_pd(fscal
,dx31
);
608 ty
= _mm_mul_pd(fscal
,dy31
);
609 tz
= _mm_mul_pd(fscal
,dz31
);
611 /* Update vectorial force */
612 fix3
= _mm_add_pd(fix3
,tx
);
613 fiy3
= _mm_add_pd(fiy3
,ty
);
614 fiz3
= _mm_add_pd(fiz3
,tz
);
616 fjx1
= _mm_add_pd(fjx1
,tx
);
617 fjy1
= _mm_add_pd(fjy1
,ty
);
618 fjz1
= _mm_add_pd(fjz1
,tz
);
620 /**************************
621 * CALCULATE INTERACTIONS *
622 **************************/
624 r32
= _mm_mul_pd(rsq32
,rinv32
);
626 /* EWALD ELECTROSTATICS */
628 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
629 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
630 ewitab
= _mm_cvttpd_epi32(ewrt
);
631 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
632 ewitab
= _mm_slli_epi32(ewitab
,2);
633 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
634 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
635 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
636 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
637 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
638 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
639 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
640 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
641 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
642 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
644 /* Update potential sum for this i atom from the interaction with this j atom. */
645 velecsum
= _mm_add_pd(velecsum
,velec
);
649 /* Calculate temporary vectorial force */
650 tx
= _mm_mul_pd(fscal
,dx32
);
651 ty
= _mm_mul_pd(fscal
,dy32
);
652 tz
= _mm_mul_pd(fscal
,dz32
);
654 /* Update vectorial force */
655 fix3
= _mm_add_pd(fix3
,tx
);
656 fiy3
= _mm_add_pd(fiy3
,ty
);
657 fiz3
= _mm_add_pd(fiz3
,tz
);
659 fjx2
= _mm_add_pd(fjx2
,tx
);
660 fjy2
= _mm_add_pd(fjy2
,ty
);
661 fjz2
= _mm_add_pd(fjz2
,tz
);
663 /**************************
664 * CALCULATE INTERACTIONS *
665 **************************/
667 r33
= _mm_mul_pd(rsq33
,rinv33
);
669 /* EWALD ELECTROSTATICS */
671 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
672 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
673 ewitab
= _mm_cvttpd_epi32(ewrt
);
674 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
675 ewitab
= _mm_slli_epi32(ewitab
,2);
676 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
677 ewtabD
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) );
678 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
679 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
680 ewtabFn
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,1) +2);
681 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
682 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
683 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
684 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
685 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velecsum
= _mm_add_pd(velecsum
,velec
);
692 /* Calculate temporary vectorial force */
693 tx
= _mm_mul_pd(fscal
,dx33
);
694 ty
= _mm_mul_pd(fscal
,dy33
);
695 tz
= _mm_mul_pd(fscal
,dz33
);
697 /* Update vectorial force */
698 fix3
= _mm_add_pd(fix3
,tx
);
699 fiy3
= _mm_add_pd(fiy3
,ty
);
700 fiz3
= _mm_add_pd(fiz3
,tz
);
702 fjx3
= _mm_add_pd(fjx3
,tx
);
703 fjy3
= _mm_add_pd(fjy3
,ty
);
704 fjz3
= _mm_add_pd(fjz3
,tz
);
706 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
708 /* Inner loop uses 404 flops */
715 j_coord_offsetA
= DIM
*jnrA
;
717 /* load j atom coordinates */
718 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
719 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
720 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
722 /* Calculate displacement vector */
723 dx00
= _mm_sub_pd(ix0
,jx0
);
724 dy00
= _mm_sub_pd(iy0
,jy0
);
725 dz00
= _mm_sub_pd(iz0
,jz0
);
726 dx11
= _mm_sub_pd(ix1
,jx1
);
727 dy11
= _mm_sub_pd(iy1
,jy1
);
728 dz11
= _mm_sub_pd(iz1
,jz1
);
729 dx12
= _mm_sub_pd(ix1
,jx2
);
730 dy12
= _mm_sub_pd(iy1
,jy2
);
731 dz12
= _mm_sub_pd(iz1
,jz2
);
732 dx13
= _mm_sub_pd(ix1
,jx3
);
733 dy13
= _mm_sub_pd(iy1
,jy3
);
734 dz13
= _mm_sub_pd(iz1
,jz3
);
735 dx21
= _mm_sub_pd(ix2
,jx1
);
736 dy21
= _mm_sub_pd(iy2
,jy1
);
737 dz21
= _mm_sub_pd(iz2
,jz1
);
738 dx22
= _mm_sub_pd(ix2
,jx2
);
739 dy22
= _mm_sub_pd(iy2
,jy2
);
740 dz22
= _mm_sub_pd(iz2
,jz2
);
741 dx23
= _mm_sub_pd(ix2
,jx3
);
742 dy23
= _mm_sub_pd(iy2
,jy3
);
743 dz23
= _mm_sub_pd(iz2
,jz3
);
744 dx31
= _mm_sub_pd(ix3
,jx1
);
745 dy31
= _mm_sub_pd(iy3
,jy1
);
746 dz31
= _mm_sub_pd(iz3
,jz1
);
747 dx32
= _mm_sub_pd(ix3
,jx2
);
748 dy32
= _mm_sub_pd(iy3
,jy2
);
749 dz32
= _mm_sub_pd(iz3
,jz2
);
750 dx33
= _mm_sub_pd(ix3
,jx3
);
751 dy33
= _mm_sub_pd(iy3
,jy3
);
752 dz33
= _mm_sub_pd(iz3
,jz3
);
754 /* Calculate squared distance and things based on it */
755 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
756 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
757 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
758 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
759 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
760 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
761 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
762 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
763 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
764 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
766 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
767 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
768 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
769 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
770 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
771 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
772 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
773 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
774 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
776 rinvsq00
= gmx_mm_inv_pd(rsq00
);
777 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
778 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
779 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
780 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
781 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
782 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
783 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
784 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
785 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
787 fjx0
= _mm_setzero_pd();
788 fjy0
= _mm_setzero_pd();
789 fjz0
= _mm_setzero_pd();
790 fjx1
= _mm_setzero_pd();
791 fjy1
= _mm_setzero_pd();
792 fjz1
= _mm_setzero_pd();
793 fjx2
= _mm_setzero_pd();
794 fjy2
= _mm_setzero_pd();
795 fjz2
= _mm_setzero_pd();
796 fjx3
= _mm_setzero_pd();
797 fjy3
= _mm_setzero_pd();
798 fjz3
= _mm_setzero_pd();
800 /**************************
801 * CALCULATE INTERACTIONS *
802 **************************/
804 /* LENNARD-JONES DISPERSION/REPULSION */
806 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
807 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
808 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
809 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
810 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
812 /* Update potential sum for this i atom from the interaction with this j atom. */
813 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
814 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
818 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
820 /* Calculate temporary vectorial force */
821 tx
= _mm_mul_pd(fscal
,dx00
);
822 ty
= _mm_mul_pd(fscal
,dy00
);
823 tz
= _mm_mul_pd(fscal
,dz00
);
825 /* Update vectorial force */
826 fix0
= _mm_add_pd(fix0
,tx
);
827 fiy0
= _mm_add_pd(fiy0
,ty
);
828 fiz0
= _mm_add_pd(fiz0
,tz
);
830 fjx0
= _mm_add_pd(fjx0
,tx
);
831 fjy0
= _mm_add_pd(fjy0
,ty
);
832 fjz0
= _mm_add_pd(fjz0
,tz
);
834 /**************************
835 * CALCULATE INTERACTIONS *
836 **************************/
838 r11
= _mm_mul_pd(rsq11
,rinv11
);
840 /* EWALD ELECTROSTATICS */
842 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
843 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
844 ewitab
= _mm_cvttpd_epi32(ewrt
);
845 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
846 ewitab
= _mm_slli_epi32(ewitab
,2);
847 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
848 ewtabD
= _mm_setzero_pd();
849 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
850 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
851 ewtabFn
= _mm_setzero_pd();
852 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
853 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
854 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
855 velec
= _mm_mul_pd(qq11
,_mm_sub_pd(rinv11
,velec
));
856 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
858 /* Update potential sum for this i atom from the interaction with this j atom. */
859 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
860 velecsum
= _mm_add_pd(velecsum
,velec
);
864 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
866 /* Calculate temporary vectorial force */
867 tx
= _mm_mul_pd(fscal
,dx11
);
868 ty
= _mm_mul_pd(fscal
,dy11
);
869 tz
= _mm_mul_pd(fscal
,dz11
);
871 /* Update vectorial force */
872 fix1
= _mm_add_pd(fix1
,tx
);
873 fiy1
= _mm_add_pd(fiy1
,ty
);
874 fiz1
= _mm_add_pd(fiz1
,tz
);
876 fjx1
= _mm_add_pd(fjx1
,tx
);
877 fjy1
= _mm_add_pd(fjy1
,ty
);
878 fjz1
= _mm_add_pd(fjz1
,tz
);
880 /**************************
881 * CALCULATE INTERACTIONS *
882 **************************/
884 r12
= _mm_mul_pd(rsq12
,rinv12
);
886 /* EWALD ELECTROSTATICS */
888 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
889 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
890 ewitab
= _mm_cvttpd_epi32(ewrt
);
891 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
892 ewitab
= _mm_slli_epi32(ewitab
,2);
893 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
894 ewtabD
= _mm_setzero_pd();
895 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
896 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
897 ewtabFn
= _mm_setzero_pd();
898 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
899 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
900 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
901 velec
= _mm_mul_pd(qq12
,_mm_sub_pd(rinv12
,velec
));
902 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
904 /* Update potential sum for this i atom from the interaction with this j atom. */
905 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
906 velecsum
= _mm_add_pd(velecsum
,velec
);
910 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
912 /* Calculate temporary vectorial force */
913 tx
= _mm_mul_pd(fscal
,dx12
);
914 ty
= _mm_mul_pd(fscal
,dy12
);
915 tz
= _mm_mul_pd(fscal
,dz12
);
917 /* Update vectorial force */
918 fix1
= _mm_add_pd(fix1
,tx
);
919 fiy1
= _mm_add_pd(fiy1
,ty
);
920 fiz1
= _mm_add_pd(fiz1
,tz
);
922 fjx2
= _mm_add_pd(fjx2
,tx
);
923 fjy2
= _mm_add_pd(fjy2
,ty
);
924 fjz2
= _mm_add_pd(fjz2
,tz
);
926 /**************************
927 * CALCULATE INTERACTIONS *
928 **************************/
930 r13
= _mm_mul_pd(rsq13
,rinv13
);
932 /* EWALD ELECTROSTATICS */
934 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
935 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
936 ewitab
= _mm_cvttpd_epi32(ewrt
);
937 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
938 ewitab
= _mm_slli_epi32(ewitab
,2);
939 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
940 ewtabD
= _mm_setzero_pd();
941 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
942 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
943 ewtabFn
= _mm_setzero_pd();
944 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
945 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
946 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
947 velec
= _mm_mul_pd(qq13
,_mm_sub_pd(rinv13
,velec
));
948 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
950 /* Update potential sum for this i atom from the interaction with this j atom. */
951 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
952 velecsum
= _mm_add_pd(velecsum
,velec
);
956 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
958 /* Calculate temporary vectorial force */
959 tx
= _mm_mul_pd(fscal
,dx13
);
960 ty
= _mm_mul_pd(fscal
,dy13
);
961 tz
= _mm_mul_pd(fscal
,dz13
);
963 /* Update vectorial force */
964 fix1
= _mm_add_pd(fix1
,tx
);
965 fiy1
= _mm_add_pd(fiy1
,ty
);
966 fiz1
= _mm_add_pd(fiz1
,tz
);
968 fjx3
= _mm_add_pd(fjx3
,tx
);
969 fjy3
= _mm_add_pd(fjy3
,ty
);
970 fjz3
= _mm_add_pd(fjz3
,tz
);
972 /**************************
973 * CALCULATE INTERACTIONS *
974 **************************/
976 r21
= _mm_mul_pd(rsq21
,rinv21
);
978 /* EWALD ELECTROSTATICS */
980 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
981 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
982 ewitab
= _mm_cvttpd_epi32(ewrt
);
983 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
984 ewitab
= _mm_slli_epi32(ewitab
,2);
985 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
986 ewtabD
= _mm_setzero_pd();
987 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
988 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
989 ewtabFn
= _mm_setzero_pd();
990 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
991 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
992 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
993 velec
= _mm_mul_pd(qq21
,_mm_sub_pd(rinv21
,velec
));
994 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
996 /* Update potential sum for this i atom from the interaction with this j atom. */
997 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
998 velecsum
= _mm_add_pd(velecsum
,velec
);
1002 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1004 /* Calculate temporary vectorial force */
1005 tx
= _mm_mul_pd(fscal
,dx21
);
1006 ty
= _mm_mul_pd(fscal
,dy21
);
1007 tz
= _mm_mul_pd(fscal
,dz21
);
1009 /* Update vectorial force */
1010 fix2
= _mm_add_pd(fix2
,tx
);
1011 fiy2
= _mm_add_pd(fiy2
,ty
);
1012 fiz2
= _mm_add_pd(fiz2
,tz
);
1014 fjx1
= _mm_add_pd(fjx1
,tx
);
1015 fjy1
= _mm_add_pd(fjy1
,ty
);
1016 fjz1
= _mm_add_pd(fjz1
,tz
);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 r22
= _mm_mul_pd(rsq22
,rinv22
);
1024 /* EWALD ELECTROSTATICS */
1026 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1027 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1028 ewitab
= _mm_cvttpd_epi32(ewrt
);
1029 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1030 ewitab
= _mm_slli_epi32(ewitab
,2);
1031 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1032 ewtabD
= _mm_setzero_pd();
1033 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1034 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1035 ewtabFn
= _mm_setzero_pd();
1036 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1037 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1038 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1039 velec
= _mm_mul_pd(qq22
,_mm_sub_pd(rinv22
,velec
));
1040 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1042 /* Update potential sum for this i atom from the interaction with this j atom. */
1043 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1044 velecsum
= _mm_add_pd(velecsum
,velec
);
1048 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1050 /* Calculate temporary vectorial force */
1051 tx
= _mm_mul_pd(fscal
,dx22
);
1052 ty
= _mm_mul_pd(fscal
,dy22
);
1053 tz
= _mm_mul_pd(fscal
,dz22
);
1055 /* Update vectorial force */
1056 fix2
= _mm_add_pd(fix2
,tx
);
1057 fiy2
= _mm_add_pd(fiy2
,ty
);
1058 fiz2
= _mm_add_pd(fiz2
,tz
);
1060 fjx2
= _mm_add_pd(fjx2
,tx
);
1061 fjy2
= _mm_add_pd(fjy2
,ty
);
1062 fjz2
= _mm_add_pd(fjz2
,tz
);
1064 /**************************
1065 * CALCULATE INTERACTIONS *
1066 **************************/
1068 r23
= _mm_mul_pd(rsq23
,rinv23
);
1070 /* EWALD ELECTROSTATICS */
1072 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1073 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1074 ewitab
= _mm_cvttpd_epi32(ewrt
);
1075 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1076 ewitab
= _mm_slli_epi32(ewitab
,2);
1077 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1078 ewtabD
= _mm_setzero_pd();
1079 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1080 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1081 ewtabFn
= _mm_setzero_pd();
1082 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1083 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1084 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1085 velec
= _mm_mul_pd(qq23
,_mm_sub_pd(rinv23
,velec
));
1086 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1090 velecsum
= _mm_add_pd(velecsum
,velec
);
1094 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1096 /* Calculate temporary vectorial force */
1097 tx
= _mm_mul_pd(fscal
,dx23
);
1098 ty
= _mm_mul_pd(fscal
,dy23
);
1099 tz
= _mm_mul_pd(fscal
,dz23
);
1101 /* Update vectorial force */
1102 fix2
= _mm_add_pd(fix2
,tx
);
1103 fiy2
= _mm_add_pd(fiy2
,ty
);
1104 fiz2
= _mm_add_pd(fiz2
,tz
);
1106 fjx3
= _mm_add_pd(fjx3
,tx
);
1107 fjy3
= _mm_add_pd(fjy3
,ty
);
1108 fjz3
= _mm_add_pd(fjz3
,tz
);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r31
= _mm_mul_pd(rsq31
,rinv31
);
1116 /* EWALD ELECTROSTATICS */
1118 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1119 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1120 ewitab
= _mm_cvttpd_epi32(ewrt
);
1121 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1122 ewitab
= _mm_slli_epi32(ewitab
,2);
1123 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1124 ewtabD
= _mm_setzero_pd();
1125 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1126 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1127 ewtabFn
= _mm_setzero_pd();
1128 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1129 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1130 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1131 velec
= _mm_mul_pd(qq31
,_mm_sub_pd(rinv31
,velec
));
1132 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1134 /* Update potential sum for this i atom from the interaction with this j atom. */
1135 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1136 velecsum
= _mm_add_pd(velecsum
,velec
);
1140 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1142 /* Calculate temporary vectorial force */
1143 tx
= _mm_mul_pd(fscal
,dx31
);
1144 ty
= _mm_mul_pd(fscal
,dy31
);
1145 tz
= _mm_mul_pd(fscal
,dz31
);
1147 /* Update vectorial force */
1148 fix3
= _mm_add_pd(fix3
,tx
);
1149 fiy3
= _mm_add_pd(fiy3
,ty
);
1150 fiz3
= _mm_add_pd(fiz3
,tz
);
1152 fjx1
= _mm_add_pd(fjx1
,tx
);
1153 fjy1
= _mm_add_pd(fjy1
,ty
);
1154 fjz1
= _mm_add_pd(fjz1
,tz
);
1156 /**************************
1157 * CALCULATE INTERACTIONS *
1158 **************************/
1160 r32
= _mm_mul_pd(rsq32
,rinv32
);
1162 /* EWALD ELECTROSTATICS */
1164 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1165 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1166 ewitab
= _mm_cvttpd_epi32(ewrt
);
1167 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1168 ewitab
= _mm_slli_epi32(ewitab
,2);
1169 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1170 ewtabD
= _mm_setzero_pd();
1171 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1172 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1173 ewtabFn
= _mm_setzero_pd();
1174 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1175 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1176 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1177 velec
= _mm_mul_pd(qq32
,_mm_sub_pd(rinv32
,velec
));
1178 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1180 /* Update potential sum for this i atom from the interaction with this j atom. */
1181 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1182 velecsum
= _mm_add_pd(velecsum
,velec
);
1186 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1188 /* Calculate temporary vectorial force */
1189 tx
= _mm_mul_pd(fscal
,dx32
);
1190 ty
= _mm_mul_pd(fscal
,dy32
);
1191 tz
= _mm_mul_pd(fscal
,dz32
);
1193 /* Update vectorial force */
1194 fix3
= _mm_add_pd(fix3
,tx
);
1195 fiy3
= _mm_add_pd(fiy3
,ty
);
1196 fiz3
= _mm_add_pd(fiz3
,tz
);
1198 fjx2
= _mm_add_pd(fjx2
,tx
);
1199 fjy2
= _mm_add_pd(fjy2
,ty
);
1200 fjz2
= _mm_add_pd(fjz2
,tz
);
1202 /**************************
1203 * CALCULATE INTERACTIONS *
1204 **************************/
1206 r33
= _mm_mul_pd(rsq33
,rinv33
);
1208 /* EWALD ELECTROSTATICS */
1210 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1211 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1212 ewitab
= _mm_cvttpd_epi32(ewrt
);
1213 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1214 ewitab
= _mm_slli_epi32(ewitab
,2);
1215 ewtabF
= _mm_load_pd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) );
1216 ewtabD
= _mm_setzero_pd();
1217 GMX_MM_TRANSPOSE2_PD(ewtabF
,ewtabD
);
1218 ewtabV
= _mm_load_sd( ewtab
+ gmx_mm_extract_epi32(ewitab
,0) +2);
1219 ewtabFn
= _mm_setzero_pd();
1220 GMX_MM_TRANSPOSE2_PD(ewtabV
,ewtabFn
);
1221 felec
= _mm_add_pd(ewtabF
,_mm_mul_pd(eweps
,ewtabD
));
1222 velec
= _mm_sub_pd(ewtabV
,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace
,eweps
),_mm_add_pd(ewtabF
,felec
)));
1223 velec
= _mm_mul_pd(qq33
,_mm_sub_pd(rinv33
,velec
));
1224 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1226 /* Update potential sum for this i atom from the interaction with this j atom. */
1227 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1228 velecsum
= _mm_add_pd(velecsum
,velec
);
1232 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1234 /* Calculate temporary vectorial force */
1235 tx
= _mm_mul_pd(fscal
,dx33
);
1236 ty
= _mm_mul_pd(fscal
,dy33
);
1237 tz
= _mm_mul_pd(fscal
,dz33
);
1239 /* Update vectorial force */
1240 fix3
= _mm_add_pd(fix3
,tx
);
1241 fiy3
= _mm_add_pd(fiy3
,ty
);
1242 fiz3
= _mm_add_pd(fiz3
,tz
);
1244 fjx3
= _mm_add_pd(fjx3
,tx
);
1245 fjy3
= _mm_add_pd(fjy3
,ty
);
1246 fjz3
= _mm_add_pd(fjz3
,tz
);
1248 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1250 /* Inner loop uses 404 flops */
1253 /* End of innermost loop */
1255 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
1256 f
+i_coord_offset
,fshift
+i_shift_offset
);
1259 /* Update potential energies */
1260 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1261 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1263 /* Increment number of inner iterations */
1264 inneriter
+= j_index_end
- j_index_start
;
1266 /* Outer loop uses 26 flops */
1269 /* Increment number of outer iterations */
1272 /* Update outer/inner flops */
1274 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_VF
,outeriter
*26 + inneriter
*404);
1277 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double
1278 * Electrostatics interaction: Ewald
1279 * VdW interaction: LennardJones
1280 * Geometry: Water4-Water4
1281 * Calculate force/pot: Force
1284 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_sse2_double
1285 (t_nblist
* gmx_restrict nlist
,
1286 rvec
* gmx_restrict xx
,
1287 rvec
* gmx_restrict ff
,
1288 t_forcerec
* gmx_restrict fr
,
1289 t_mdatoms
* gmx_restrict mdatoms
,
1290 nb_kernel_data_t
* gmx_restrict kernel_data
,
1291 t_nrnb
* gmx_restrict nrnb
)
1293 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1294 * just 0 for non-waters.
1295 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1296 * jnr indices corresponding to data put in the four positions in the SIMD register.
1298 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1299 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1301 int j_coord_offsetA
,j_coord_offsetB
;
1302 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1303 real rcutoff_scalar
;
1304 real
*shiftvec
,*fshift
,*x
,*f
;
1305 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1307 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1309 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1311 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1313 __m128d ix3
,iy3
,iz3
,fix3
,fiy3
,fiz3
,iq3
,isai3
;
1314 int vdwjidx0A
,vdwjidx0B
;
1315 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1316 int vdwjidx1A
,vdwjidx1B
;
1317 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1318 int vdwjidx2A
,vdwjidx2B
;
1319 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1320 int vdwjidx3A
,vdwjidx3B
;
1321 __m128d jx3
,jy3
,jz3
,fjx3
,fjy3
,fjz3
,jq3
,isaj3
;
1322 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1323 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1324 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1325 __m128d dx13
,dy13
,dz13
,rsq13
,rinv13
,rinvsq13
,r13
,qq13
,c6_13
,c12_13
;
1326 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1327 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1328 __m128d dx23
,dy23
,dz23
,rsq23
,rinv23
,rinvsq23
,r23
,qq23
,c6_23
,c12_23
;
1329 __m128d dx31
,dy31
,dz31
,rsq31
,rinv31
,rinvsq31
,r31
,qq31
,c6_31
,c12_31
;
1330 __m128d dx32
,dy32
,dz32
,rsq32
,rinv32
,rinvsq32
,r32
,qq32
,c6_32
,c12_32
;
1331 __m128d dx33
,dy33
,dz33
,rsq33
,rinv33
,rinvsq33
,r33
,qq33
,c6_33
,c12_33
;
1332 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1335 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1338 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1339 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1341 __m128d ewtabscale
,eweps
,sh_ewald
,ewrt
,ewtabhalfspace
,ewtabF
,ewtabFn
,ewtabD
,ewtabV
;
1343 __m128d dummy_mask
,cutoff_mask
;
1344 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1345 __m128d one
= _mm_set1_pd(1.0);
1346 __m128d two
= _mm_set1_pd(2.0);
1352 jindex
= nlist
->jindex
;
1354 shiftidx
= nlist
->shift
;
1356 shiftvec
= fr
->shift_vec
[0];
1357 fshift
= fr
->fshift
[0];
1358 facel
= _mm_set1_pd(fr
->epsfac
);
1359 charge
= mdatoms
->chargeA
;
1360 nvdwtype
= fr
->ntype
;
1361 vdwparam
= fr
->nbfp
;
1362 vdwtype
= mdatoms
->typeA
;
1364 sh_ewald
= _mm_set1_pd(fr
->ic
->sh_ewald
);
1365 ewtab
= fr
->ic
->tabq_coul_F
;
1366 ewtabscale
= _mm_set1_pd(fr
->ic
->tabq_scale
);
1367 ewtabhalfspace
= _mm_set1_pd(0.5/fr
->ic
->tabq_scale
);
1369 /* Setup water-specific parameters */
1370 inr
= nlist
->iinr
[0];
1371 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1372 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1373 iq3
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+3]));
1374 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1376 jq1
= _mm_set1_pd(charge
[inr
+1]);
1377 jq2
= _mm_set1_pd(charge
[inr
+2]);
1378 jq3
= _mm_set1_pd(charge
[inr
+3]);
1379 vdwjidx0A
= 2*vdwtype
[inr
+0];
1380 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1381 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1382 qq11
= _mm_mul_pd(iq1
,jq1
);
1383 qq12
= _mm_mul_pd(iq1
,jq2
);
1384 qq13
= _mm_mul_pd(iq1
,jq3
);
1385 qq21
= _mm_mul_pd(iq2
,jq1
);
1386 qq22
= _mm_mul_pd(iq2
,jq2
);
1387 qq23
= _mm_mul_pd(iq2
,jq3
);
1388 qq31
= _mm_mul_pd(iq3
,jq1
);
1389 qq32
= _mm_mul_pd(iq3
,jq2
);
1390 qq33
= _mm_mul_pd(iq3
,jq3
);
1392 /* Avoid stupid compiler warnings */
1394 j_coord_offsetA
= 0;
1395 j_coord_offsetB
= 0;
1400 /* Start outer loop over neighborlists */
1401 for(iidx
=0; iidx
<nri
; iidx
++)
1403 /* Load shift vector for this list */
1404 i_shift_offset
= DIM
*shiftidx
[iidx
];
1406 /* Load limits for loop over neighbors */
1407 j_index_start
= jindex
[iidx
];
1408 j_index_end
= jindex
[iidx
+1];
1410 /* Get outer coordinate index */
1412 i_coord_offset
= DIM
*inr
;
1414 /* Load i particle coords and add shift vector */
1415 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1416 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
,&ix3
,&iy3
,&iz3
);
1418 fix0
= _mm_setzero_pd();
1419 fiy0
= _mm_setzero_pd();
1420 fiz0
= _mm_setzero_pd();
1421 fix1
= _mm_setzero_pd();
1422 fiy1
= _mm_setzero_pd();
1423 fiz1
= _mm_setzero_pd();
1424 fix2
= _mm_setzero_pd();
1425 fiy2
= _mm_setzero_pd();
1426 fiz2
= _mm_setzero_pd();
1427 fix3
= _mm_setzero_pd();
1428 fiy3
= _mm_setzero_pd();
1429 fiz3
= _mm_setzero_pd();
1431 /* Start inner kernel loop */
1432 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1435 /* Get j neighbor index, and coordinate index */
1437 jnrB
= jjnr
[jidx
+1];
1438 j_coord_offsetA
= DIM
*jnrA
;
1439 j_coord_offsetB
= DIM
*jnrB
;
1441 /* load j atom coordinates */
1442 gmx_mm_load_4rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1443 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1444 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1446 /* Calculate displacement vector */
1447 dx00
= _mm_sub_pd(ix0
,jx0
);
1448 dy00
= _mm_sub_pd(iy0
,jy0
);
1449 dz00
= _mm_sub_pd(iz0
,jz0
);
1450 dx11
= _mm_sub_pd(ix1
,jx1
);
1451 dy11
= _mm_sub_pd(iy1
,jy1
);
1452 dz11
= _mm_sub_pd(iz1
,jz1
);
1453 dx12
= _mm_sub_pd(ix1
,jx2
);
1454 dy12
= _mm_sub_pd(iy1
,jy2
);
1455 dz12
= _mm_sub_pd(iz1
,jz2
);
1456 dx13
= _mm_sub_pd(ix1
,jx3
);
1457 dy13
= _mm_sub_pd(iy1
,jy3
);
1458 dz13
= _mm_sub_pd(iz1
,jz3
);
1459 dx21
= _mm_sub_pd(ix2
,jx1
);
1460 dy21
= _mm_sub_pd(iy2
,jy1
);
1461 dz21
= _mm_sub_pd(iz2
,jz1
);
1462 dx22
= _mm_sub_pd(ix2
,jx2
);
1463 dy22
= _mm_sub_pd(iy2
,jy2
);
1464 dz22
= _mm_sub_pd(iz2
,jz2
);
1465 dx23
= _mm_sub_pd(ix2
,jx3
);
1466 dy23
= _mm_sub_pd(iy2
,jy3
);
1467 dz23
= _mm_sub_pd(iz2
,jz3
);
1468 dx31
= _mm_sub_pd(ix3
,jx1
);
1469 dy31
= _mm_sub_pd(iy3
,jy1
);
1470 dz31
= _mm_sub_pd(iz3
,jz1
);
1471 dx32
= _mm_sub_pd(ix3
,jx2
);
1472 dy32
= _mm_sub_pd(iy3
,jy2
);
1473 dz32
= _mm_sub_pd(iz3
,jz2
);
1474 dx33
= _mm_sub_pd(ix3
,jx3
);
1475 dy33
= _mm_sub_pd(iy3
,jy3
);
1476 dz33
= _mm_sub_pd(iz3
,jz3
);
1478 /* Calculate squared distance and things based on it */
1479 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1480 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1481 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1482 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1483 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1484 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1485 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1486 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1487 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1488 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1490 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1491 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1492 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1493 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1494 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1495 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1496 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1497 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1498 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1500 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1501 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1502 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1503 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1504 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1505 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1506 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1507 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1508 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1509 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1511 fjx0
= _mm_setzero_pd();
1512 fjy0
= _mm_setzero_pd();
1513 fjz0
= _mm_setzero_pd();
1514 fjx1
= _mm_setzero_pd();
1515 fjy1
= _mm_setzero_pd();
1516 fjz1
= _mm_setzero_pd();
1517 fjx2
= _mm_setzero_pd();
1518 fjy2
= _mm_setzero_pd();
1519 fjz2
= _mm_setzero_pd();
1520 fjx3
= _mm_setzero_pd();
1521 fjy3
= _mm_setzero_pd();
1522 fjz3
= _mm_setzero_pd();
1524 /**************************
1525 * CALCULATE INTERACTIONS *
1526 **************************/
1528 /* LENNARD-JONES DISPERSION/REPULSION */
1530 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1531 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1535 /* Calculate temporary vectorial force */
1536 tx
= _mm_mul_pd(fscal
,dx00
);
1537 ty
= _mm_mul_pd(fscal
,dy00
);
1538 tz
= _mm_mul_pd(fscal
,dz00
);
1540 /* Update vectorial force */
1541 fix0
= _mm_add_pd(fix0
,tx
);
1542 fiy0
= _mm_add_pd(fiy0
,ty
);
1543 fiz0
= _mm_add_pd(fiz0
,tz
);
1545 fjx0
= _mm_add_pd(fjx0
,tx
);
1546 fjy0
= _mm_add_pd(fjy0
,ty
);
1547 fjz0
= _mm_add_pd(fjz0
,tz
);
1549 /**************************
1550 * CALCULATE INTERACTIONS *
1551 **************************/
1553 r11
= _mm_mul_pd(rsq11
,rinv11
);
1555 /* EWALD ELECTROSTATICS */
1557 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1558 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1559 ewitab
= _mm_cvttpd_epi32(ewrt
);
1560 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1561 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1563 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1564 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1568 /* Calculate temporary vectorial force */
1569 tx
= _mm_mul_pd(fscal
,dx11
);
1570 ty
= _mm_mul_pd(fscal
,dy11
);
1571 tz
= _mm_mul_pd(fscal
,dz11
);
1573 /* Update vectorial force */
1574 fix1
= _mm_add_pd(fix1
,tx
);
1575 fiy1
= _mm_add_pd(fiy1
,ty
);
1576 fiz1
= _mm_add_pd(fiz1
,tz
);
1578 fjx1
= _mm_add_pd(fjx1
,tx
);
1579 fjy1
= _mm_add_pd(fjy1
,ty
);
1580 fjz1
= _mm_add_pd(fjz1
,tz
);
1582 /**************************
1583 * CALCULATE INTERACTIONS *
1584 **************************/
1586 r12
= _mm_mul_pd(rsq12
,rinv12
);
1588 /* EWALD ELECTROSTATICS */
1590 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1591 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
1592 ewitab
= _mm_cvttpd_epi32(ewrt
);
1593 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1594 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1596 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1597 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
1601 /* Calculate temporary vectorial force */
1602 tx
= _mm_mul_pd(fscal
,dx12
);
1603 ty
= _mm_mul_pd(fscal
,dy12
);
1604 tz
= _mm_mul_pd(fscal
,dz12
);
1606 /* Update vectorial force */
1607 fix1
= _mm_add_pd(fix1
,tx
);
1608 fiy1
= _mm_add_pd(fiy1
,ty
);
1609 fiz1
= _mm_add_pd(fiz1
,tz
);
1611 fjx2
= _mm_add_pd(fjx2
,tx
);
1612 fjy2
= _mm_add_pd(fjy2
,ty
);
1613 fjz2
= _mm_add_pd(fjz2
,tz
);
1615 /**************************
1616 * CALCULATE INTERACTIONS *
1617 **************************/
1619 r13
= _mm_mul_pd(rsq13
,rinv13
);
1621 /* EWALD ELECTROSTATICS */
1623 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1624 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
1625 ewitab
= _mm_cvttpd_epi32(ewrt
);
1626 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1627 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1629 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1630 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
1634 /* Calculate temporary vectorial force */
1635 tx
= _mm_mul_pd(fscal
,dx13
);
1636 ty
= _mm_mul_pd(fscal
,dy13
);
1637 tz
= _mm_mul_pd(fscal
,dz13
);
1639 /* Update vectorial force */
1640 fix1
= _mm_add_pd(fix1
,tx
);
1641 fiy1
= _mm_add_pd(fiy1
,ty
);
1642 fiz1
= _mm_add_pd(fiz1
,tz
);
1644 fjx3
= _mm_add_pd(fjx3
,tx
);
1645 fjy3
= _mm_add_pd(fjy3
,ty
);
1646 fjz3
= _mm_add_pd(fjz3
,tz
);
1648 /**************************
1649 * CALCULATE INTERACTIONS *
1650 **************************/
1652 r21
= _mm_mul_pd(rsq21
,rinv21
);
1654 /* EWALD ELECTROSTATICS */
1656 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1657 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
1658 ewitab
= _mm_cvttpd_epi32(ewrt
);
1659 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1660 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1662 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1663 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
1667 /* Calculate temporary vectorial force */
1668 tx
= _mm_mul_pd(fscal
,dx21
);
1669 ty
= _mm_mul_pd(fscal
,dy21
);
1670 tz
= _mm_mul_pd(fscal
,dz21
);
1672 /* Update vectorial force */
1673 fix2
= _mm_add_pd(fix2
,tx
);
1674 fiy2
= _mm_add_pd(fiy2
,ty
);
1675 fiz2
= _mm_add_pd(fiz2
,tz
);
1677 fjx1
= _mm_add_pd(fjx1
,tx
);
1678 fjy1
= _mm_add_pd(fjy1
,ty
);
1679 fjz1
= _mm_add_pd(fjz1
,tz
);
1681 /**************************
1682 * CALCULATE INTERACTIONS *
1683 **************************/
1685 r22
= _mm_mul_pd(rsq22
,rinv22
);
1687 /* EWALD ELECTROSTATICS */
1689 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1690 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
1691 ewitab
= _mm_cvttpd_epi32(ewrt
);
1692 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1693 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1695 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1696 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
1700 /* Calculate temporary vectorial force */
1701 tx
= _mm_mul_pd(fscal
,dx22
);
1702 ty
= _mm_mul_pd(fscal
,dy22
);
1703 tz
= _mm_mul_pd(fscal
,dz22
);
1705 /* Update vectorial force */
1706 fix2
= _mm_add_pd(fix2
,tx
);
1707 fiy2
= _mm_add_pd(fiy2
,ty
);
1708 fiz2
= _mm_add_pd(fiz2
,tz
);
1710 fjx2
= _mm_add_pd(fjx2
,tx
);
1711 fjy2
= _mm_add_pd(fjy2
,ty
);
1712 fjz2
= _mm_add_pd(fjz2
,tz
);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 r23
= _mm_mul_pd(rsq23
,rinv23
);
1720 /* EWALD ELECTROSTATICS */
1722 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1723 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
1724 ewitab
= _mm_cvttpd_epi32(ewrt
);
1725 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1726 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1728 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1729 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
1733 /* Calculate temporary vectorial force */
1734 tx
= _mm_mul_pd(fscal
,dx23
);
1735 ty
= _mm_mul_pd(fscal
,dy23
);
1736 tz
= _mm_mul_pd(fscal
,dz23
);
1738 /* Update vectorial force */
1739 fix2
= _mm_add_pd(fix2
,tx
);
1740 fiy2
= _mm_add_pd(fiy2
,ty
);
1741 fiz2
= _mm_add_pd(fiz2
,tz
);
1743 fjx3
= _mm_add_pd(fjx3
,tx
);
1744 fjy3
= _mm_add_pd(fjy3
,ty
);
1745 fjz3
= _mm_add_pd(fjz3
,tz
);
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 r31
= _mm_mul_pd(rsq31
,rinv31
);
1753 /* EWALD ELECTROSTATICS */
1755 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1756 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
1757 ewitab
= _mm_cvttpd_epi32(ewrt
);
1758 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1759 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1761 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1762 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
1766 /* Calculate temporary vectorial force */
1767 tx
= _mm_mul_pd(fscal
,dx31
);
1768 ty
= _mm_mul_pd(fscal
,dy31
);
1769 tz
= _mm_mul_pd(fscal
,dz31
);
1771 /* Update vectorial force */
1772 fix3
= _mm_add_pd(fix3
,tx
);
1773 fiy3
= _mm_add_pd(fiy3
,ty
);
1774 fiz3
= _mm_add_pd(fiz3
,tz
);
1776 fjx1
= _mm_add_pd(fjx1
,tx
);
1777 fjy1
= _mm_add_pd(fjy1
,ty
);
1778 fjz1
= _mm_add_pd(fjz1
,tz
);
1780 /**************************
1781 * CALCULATE INTERACTIONS *
1782 **************************/
1784 r32
= _mm_mul_pd(rsq32
,rinv32
);
1786 /* EWALD ELECTROSTATICS */
1788 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1789 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
1790 ewitab
= _mm_cvttpd_epi32(ewrt
);
1791 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1792 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1794 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1795 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
1799 /* Calculate temporary vectorial force */
1800 tx
= _mm_mul_pd(fscal
,dx32
);
1801 ty
= _mm_mul_pd(fscal
,dy32
);
1802 tz
= _mm_mul_pd(fscal
,dz32
);
1804 /* Update vectorial force */
1805 fix3
= _mm_add_pd(fix3
,tx
);
1806 fiy3
= _mm_add_pd(fiy3
,ty
);
1807 fiz3
= _mm_add_pd(fiz3
,tz
);
1809 fjx2
= _mm_add_pd(fjx2
,tx
);
1810 fjy2
= _mm_add_pd(fjy2
,ty
);
1811 fjz2
= _mm_add_pd(fjz2
,tz
);
1813 /**************************
1814 * CALCULATE INTERACTIONS *
1815 **************************/
1817 r33
= _mm_mul_pd(rsq33
,rinv33
);
1819 /* EWALD ELECTROSTATICS */
1821 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1822 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
1823 ewitab
= _mm_cvttpd_epi32(ewrt
);
1824 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1825 gmx_mm_load_2pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),ewtab
+gmx_mm_extract_epi32(ewitab
,1),
1827 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1828 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
1832 /* Calculate temporary vectorial force */
1833 tx
= _mm_mul_pd(fscal
,dx33
);
1834 ty
= _mm_mul_pd(fscal
,dy33
);
1835 tz
= _mm_mul_pd(fscal
,dz33
);
1837 /* Update vectorial force */
1838 fix3
= _mm_add_pd(fix3
,tx
);
1839 fiy3
= _mm_add_pd(fiy3
,ty
);
1840 fiz3
= _mm_add_pd(fiz3
,tz
);
1842 fjx3
= _mm_add_pd(fjx3
,tx
);
1843 fjy3
= _mm_add_pd(fjy3
,ty
);
1844 fjz3
= _mm_add_pd(fjz3
,tz
);
1846 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
1848 /* Inner loop uses 354 flops */
1851 if(jidx
<j_index_end
)
1855 j_coord_offsetA
= DIM
*jnrA
;
1857 /* load j atom coordinates */
1858 gmx_mm_load_4rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1859 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,
1860 &jy2
,&jz2
,&jx3
,&jy3
,&jz3
);
1862 /* Calculate displacement vector */
1863 dx00
= _mm_sub_pd(ix0
,jx0
);
1864 dy00
= _mm_sub_pd(iy0
,jy0
);
1865 dz00
= _mm_sub_pd(iz0
,jz0
);
1866 dx11
= _mm_sub_pd(ix1
,jx1
);
1867 dy11
= _mm_sub_pd(iy1
,jy1
);
1868 dz11
= _mm_sub_pd(iz1
,jz1
);
1869 dx12
= _mm_sub_pd(ix1
,jx2
);
1870 dy12
= _mm_sub_pd(iy1
,jy2
);
1871 dz12
= _mm_sub_pd(iz1
,jz2
);
1872 dx13
= _mm_sub_pd(ix1
,jx3
);
1873 dy13
= _mm_sub_pd(iy1
,jy3
);
1874 dz13
= _mm_sub_pd(iz1
,jz3
);
1875 dx21
= _mm_sub_pd(ix2
,jx1
);
1876 dy21
= _mm_sub_pd(iy2
,jy1
);
1877 dz21
= _mm_sub_pd(iz2
,jz1
);
1878 dx22
= _mm_sub_pd(ix2
,jx2
);
1879 dy22
= _mm_sub_pd(iy2
,jy2
);
1880 dz22
= _mm_sub_pd(iz2
,jz2
);
1881 dx23
= _mm_sub_pd(ix2
,jx3
);
1882 dy23
= _mm_sub_pd(iy2
,jy3
);
1883 dz23
= _mm_sub_pd(iz2
,jz3
);
1884 dx31
= _mm_sub_pd(ix3
,jx1
);
1885 dy31
= _mm_sub_pd(iy3
,jy1
);
1886 dz31
= _mm_sub_pd(iz3
,jz1
);
1887 dx32
= _mm_sub_pd(ix3
,jx2
);
1888 dy32
= _mm_sub_pd(iy3
,jy2
);
1889 dz32
= _mm_sub_pd(iz3
,jz2
);
1890 dx33
= _mm_sub_pd(ix3
,jx3
);
1891 dy33
= _mm_sub_pd(iy3
,jy3
);
1892 dz33
= _mm_sub_pd(iz3
,jz3
);
1894 /* Calculate squared distance and things based on it */
1895 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1896 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1897 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1898 rsq13
= gmx_mm_calc_rsq_pd(dx13
,dy13
,dz13
);
1899 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1900 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1901 rsq23
= gmx_mm_calc_rsq_pd(dx23
,dy23
,dz23
);
1902 rsq31
= gmx_mm_calc_rsq_pd(dx31
,dy31
,dz31
);
1903 rsq32
= gmx_mm_calc_rsq_pd(dx32
,dy32
,dz32
);
1904 rsq33
= gmx_mm_calc_rsq_pd(dx33
,dy33
,dz33
);
1906 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1907 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1908 rinv13
= gmx_mm_invsqrt_pd(rsq13
);
1909 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1910 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1911 rinv23
= gmx_mm_invsqrt_pd(rsq23
);
1912 rinv31
= gmx_mm_invsqrt_pd(rsq31
);
1913 rinv32
= gmx_mm_invsqrt_pd(rsq32
);
1914 rinv33
= gmx_mm_invsqrt_pd(rsq33
);
1916 rinvsq00
= gmx_mm_inv_pd(rsq00
);
1917 rinvsq11
= _mm_mul_pd(rinv11
,rinv11
);
1918 rinvsq12
= _mm_mul_pd(rinv12
,rinv12
);
1919 rinvsq13
= _mm_mul_pd(rinv13
,rinv13
);
1920 rinvsq21
= _mm_mul_pd(rinv21
,rinv21
);
1921 rinvsq22
= _mm_mul_pd(rinv22
,rinv22
);
1922 rinvsq23
= _mm_mul_pd(rinv23
,rinv23
);
1923 rinvsq31
= _mm_mul_pd(rinv31
,rinv31
);
1924 rinvsq32
= _mm_mul_pd(rinv32
,rinv32
);
1925 rinvsq33
= _mm_mul_pd(rinv33
,rinv33
);
1927 fjx0
= _mm_setzero_pd();
1928 fjy0
= _mm_setzero_pd();
1929 fjz0
= _mm_setzero_pd();
1930 fjx1
= _mm_setzero_pd();
1931 fjy1
= _mm_setzero_pd();
1932 fjz1
= _mm_setzero_pd();
1933 fjx2
= _mm_setzero_pd();
1934 fjy2
= _mm_setzero_pd();
1935 fjz2
= _mm_setzero_pd();
1936 fjx3
= _mm_setzero_pd();
1937 fjy3
= _mm_setzero_pd();
1938 fjz3
= _mm_setzero_pd();
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 /* LENNARD-JONES DISPERSION/REPULSION */
1946 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1947 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1951 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1953 /* Calculate temporary vectorial force */
1954 tx
= _mm_mul_pd(fscal
,dx00
);
1955 ty
= _mm_mul_pd(fscal
,dy00
);
1956 tz
= _mm_mul_pd(fscal
,dz00
);
1958 /* Update vectorial force */
1959 fix0
= _mm_add_pd(fix0
,tx
);
1960 fiy0
= _mm_add_pd(fiy0
,ty
);
1961 fiz0
= _mm_add_pd(fiz0
,tz
);
1963 fjx0
= _mm_add_pd(fjx0
,tx
);
1964 fjy0
= _mm_add_pd(fjy0
,ty
);
1965 fjz0
= _mm_add_pd(fjz0
,tz
);
1967 /**************************
1968 * CALCULATE INTERACTIONS *
1969 **************************/
1971 r11
= _mm_mul_pd(rsq11
,rinv11
);
1973 /* EWALD ELECTROSTATICS */
1975 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1976 ewrt
= _mm_mul_pd(r11
,ewtabscale
);
1977 ewitab
= _mm_cvttpd_epi32(ewrt
);
1978 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
1979 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
1980 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
1981 felec
= _mm_mul_pd(_mm_mul_pd(qq11
,rinv11
),_mm_sub_pd(rinvsq11
,felec
));
1985 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1987 /* Calculate temporary vectorial force */
1988 tx
= _mm_mul_pd(fscal
,dx11
);
1989 ty
= _mm_mul_pd(fscal
,dy11
);
1990 tz
= _mm_mul_pd(fscal
,dz11
);
1992 /* Update vectorial force */
1993 fix1
= _mm_add_pd(fix1
,tx
);
1994 fiy1
= _mm_add_pd(fiy1
,ty
);
1995 fiz1
= _mm_add_pd(fiz1
,tz
);
1997 fjx1
= _mm_add_pd(fjx1
,tx
);
1998 fjy1
= _mm_add_pd(fjy1
,ty
);
1999 fjz1
= _mm_add_pd(fjz1
,tz
);
2001 /**************************
2002 * CALCULATE INTERACTIONS *
2003 **************************/
2005 r12
= _mm_mul_pd(rsq12
,rinv12
);
2007 /* EWALD ELECTROSTATICS */
2009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2010 ewrt
= _mm_mul_pd(r12
,ewtabscale
);
2011 ewitab
= _mm_cvttpd_epi32(ewrt
);
2012 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2013 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2014 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2015 felec
= _mm_mul_pd(_mm_mul_pd(qq12
,rinv12
),_mm_sub_pd(rinvsq12
,felec
));
2019 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2021 /* Calculate temporary vectorial force */
2022 tx
= _mm_mul_pd(fscal
,dx12
);
2023 ty
= _mm_mul_pd(fscal
,dy12
);
2024 tz
= _mm_mul_pd(fscal
,dz12
);
2026 /* Update vectorial force */
2027 fix1
= _mm_add_pd(fix1
,tx
);
2028 fiy1
= _mm_add_pd(fiy1
,ty
);
2029 fiz1
= _mm_add_pd(fiz1
,tz
);
2031 fjx2
= _mm_add_pd(fjx2
,tx
);
2032 fjy2
= _mm_add_pd(fjy2
,ty
);
2033 fjz2
= _mm_add_pd(fjz2
,tz
);
2035 /**************************
2036 * CALCULATE INTERACTIONS *
2037 **************************/
2039 r13
= _mm_mul_pd(rsq13
,rinv13
);
2041 /* EWALD ELECTROSTATICS */
2043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044 ewrt
= _mm_mul_pd(r13
,ewtabscale
);
2045 ewitab
= _mm_cvttpd_epi32(ewrt
);
2046 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2047 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2048 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2049 felec
= _mm_mul_pd(_mm_mul_pd(qq13
,rinv13
),_mm_sub_pd(rinvsq13
,felec
));
2053 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2055 /* Calculate temporary vectorial force */
2056 tx
= _mm_mul_pd(fscal
,dx13
);
2057 ty
= _mm_mul_pd(fscal
,dy13
);
2058 tz
= _mm_mul_pd(fscal
,dz13
);
2060 /* Update vectorial force */
2061 fix1
= _mm_add_pd(fix1
,tx
);
2062 fiy1
= _mm_add_pd(fiy1
,ty
);
2063 fiz1
= _mm_add_pd(fiz1
,tz
);
2065 fjx3
= _mm_add_pd(fjx3
,tx
);
2066 fjy3
= _mm_add_pd(fjy3
,ty
);
2067 fjz3
= _mm_add_pd(fjz3
,tz
);
2069 /**************************
2070 * CALCULATE INTERACTIONS *
2071 **************************/
2073 r21
= _mm_mul_pd(rsq21
,rinv21
);
2075 /* EWALD ELECTROSTATICS */
2077 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2078 ewrt
= _mm_mul_pd(r21
,ewtabscale
);
2079 ewitab
= _mm_cvttpd_epi32(ewrt
);
2080 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2081 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2082 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2083 felec
= _mm_mul_pd(_mm_mul_pd(qq21
,rinv21
),_mm_sub_pd(rinvsq21
,felec
));
2087 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2089 /* Calculate temporary vectorial force */
2090 tx
= _mm_mul_pd(fscal
,dx21
);
2091 ty
= _mm_mul_pd(fscal
,dy21
);
2092 tz
= _mm_mul_pd(fscal
,dz21
);
2094 /* Update vectorial force */
2095 fix2
= _mm_add_pd(fix2
,tx
);
2096 fiy2
= _mm_add_pd(fiy2
,ty
);
2097 fiz2
= _mm_add_pd(fiz2
,tz
);
2099 fjx1
= _mm_add_pd(fjx1
,tx
);
2100 fjy1
= _mm_add_pd(fjy1
,ty
);
2101 fjz1
= _mm_add_pd(fjz1
,tz
);
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 r22
= _mm_mul_pd(rsq22
,rinv22
);
2109 /* EWALD ELECTROSTATICS */
2111 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2112 ewrt
= _mm_mul_pd(r22
,ewtabscale
);
2113 ewitab
= _mm_cvttpd_epi32(ewrt
);
2114 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2115 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2116 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2117 felec
= _mm_mul_pd(_mm_mul_pd(qq22
,rinv22
),_mm_sub_pd(rinvsq22
,felec
));
2121 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2123 /* Calculate temporary vectorial force */
2124 tx
= _mm_mul_pd(fscal
,dx22
);
2125 ty
= _mm_mul_pd(fscal
,dy22
);
2126 tz
= _mm_mul_pd(fscal
,dz22
);
2128 /* Update vectorial force */
2129 fix2
= _mm_add_pd(fix2
,tx
);
2130 fiy2
= _mm_add_pd(fiy2
,ty
);
2131 fiz2
= _mm_add_pd(fiz2
,tz
);
2133 fjx2
= _mm_add_pd(fjx2
,tx
);
2134 fjy2
= _mm_add_pd(fjy2
,ty
);
2135 fjz2
= _mm_add_pd(fjz2
,tz
);
2137 /**************************
2138 * CALCULATE INTERACTIONS *
2139 **************************/
2141 r23
= _mm_mul_pd(rsq23
,rinv23
);
2143 /* EWALD ELECTROSTATICS */
2145 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2146 ewrt
= _mm_mul_pd(r23
,ewtabscale
);
2147 ewitab
= _mm_cvttpd_epi32(ewrt
);
2148 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2149 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2150 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2151 felec
= _mm_mul_pd(_mm_mul_pd(qq23
,rinv23
),_mm_sub_pd(rinvsq23
,felec
));
2155 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2157 /* Calculate temporary vectorial force */
2158 tx
= _mm_mul_pd(fscal
,dx23
);
2159 ty
= _mm_mul_pd(fscal
,dy23
);
2160 tz
= _mm_mul_pd(fscal
,dz23
);
2162 /* Update vectorial force */
2163 fix2
= _mm_add_pd(fix2
,tx
);
2164 fiy2
= _mm_add_pd(fiy2
,ty
);
2165 fiz2
= _mm_add_pd(fiz2
,tz
);
2167 fjx3
= _mm_add_pd(fjx3
,tx
);
2168 fjy3
= _mm_add_pd(fjy3
,ty
);
2169 fjz3
= _mm_add_pd(fjz3
,tz
);
2171 /**************************
2172 * CALCULATE INTERACTIONS *
2173 **************************/
2175 r31
= _mm_mul_pd(rsq31
,rinv31
);
2177 /* EWALD ELECTROSTATICS */
2179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2180 ewrt
= _mm_mul_pd(r31
,ewtabscale
);
2181 ewitab
= _mm_cvttpd_epi32(ewrt
);
2182 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2183 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2184 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2185 felec
= _mm_mul_pd(_mm_mul_pd(qq31
,rinv31
),_mm_sub_pd(rinvsq31
,felec
));
2189 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2191 /* Calculate temporary vectorial force */
2192 tx
= _mm_mul_pd(fscal
,dx31
);
2193 ty
= _mm_mul_pd(fscal
,dy31
);
2194 tz
= _mm_mul_pd(fscal
,dz31
);
2196 /* Update vectorial force */
2197 fix3
= _mm_add_pd(fix3
,tx
);
2198 fiy3
= _mm_add_pd(fiy3
,ty
);
2199 fiz3
= _mm_add_pd(fiz3
,tz
);
2201 fjx1
= _mm_add_pd(fjx1
,tx
);
2202 fjy1
= _mm_add_pd(fjy1
,ty
);
2203 fjz1
= _mm_add_pd(fjz1
,tz
);
2205 /**************************
2206 * CALCULATE INTERACTIONS *
2207 **************************/
2209 r32
= _mm_mul_pd(rsq32
,rinv32
);
2211 /* EWALD ELECTROSTATICS */
2213 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2214 ewrt
= _mm_mul_pd(r32
,ewtabscale
);
2215 ewitab
= _mm_cvttpd_epi32(ewrt
);
2216 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2217 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2218 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2219 felec
= _mm_mul_pd(_mm_mul_pd(qq32
,rinv32
),_mm_sub_pd(rinvsq32
,felec
));
2223 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2225 /* Calculate temporary vectorial force */
2226 tx
= _mm_mul_pd(fscal
,dx32
);
2227 ty
= _mm_mul_pd(fscal
,dy32
);
2228 tz
= _mm_mul_pd(fscal
,dz32
);
2230 /* Update vectorial force */
2231 fix3
= _mm_add_pd(fix3
,tx
);
2232 fiy3
= _mm_add_pd(fiy3
,ty
);
2233 fiz3
= _mm_add_pd(fiz3
,tz
);
2235 fjx2
= _mm_add_pd(fjx2
,tx
);
2236 fjy2
= _mm_add_pd(fjy2
,ty
);
2237 fjz2
= _mm_add_pd(fjz2
,tz
);
2239 /**************************
2240 * CALCULATE INTERACTIONS *
2241 **************************/
2243 r33
= _mm_mul_pd(rsq33
,rinv33
);
2245 /* EWALD ELECTROSTATICS */
2247 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2248 ewrt
= _mm_mul_pd(r33
,ewtabscale
);
2249 ewitab
= _mm_cvttpd_epi32(ewrt
);
2250 eweps
= _mm_sub_pd(ewrt
,_mm_cvtepi32_pd(ewitab
));
2251 gmx_mm_load_1pair_swizzle_pd(ewtab
+gmx_mm_extract_epi32(ewitab
,0),&ewtabF
,&ewtabFn
);
2252 felec
= _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one
,eweps
),ewtabF
),_mm_mul_pd(eweps
,ewtabFn
));
2253 felec
= _mm_mul_pd(_mm_mul_pd(qq33
,rinv33
),_mm_sub_pd(rinvsq33
,felec
));
2257 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2259 /* Calculate temporary vectorial force */
2260 tx
= _mm_mul_pd(fscal
,dx33
);
2261 ty
= _mm_mul_pd(fscal
,dy33
);
2262 tz
= _mm_mul_pd(fscal
,dz33
);
2264 /* Update vectorial force */
2265 fix3
= _mm_add_pd(fix3
,tx
);
2266 fiy3
= _mm_add_pd(fiy3
,ty
);
2267 fiz3
= _mm_add_pd(fiz3
,tz
);
2269 fjx3
= _mm_add_pd(fjx3
,tx
);
2270 fjy3
= _mm_add_pd(fjy3
,ty
);
2271 fjz3
= _mm_add_pd(fjz3
,tz
);
2273 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
,fjx3
,fjy3
,fjz3
);
2275 /* Inner loop uses 354 flops */
2278 /* End of innermost loop */
2280 gmx_mm_update_iforce_4atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,fix3
,fiy3
,fiz3
,
2281 f
+i_coord_offset
,fshift
+i_shift_offset
);
2283 /* Increment number of inner iterations */
2284 inneriter
+= j_index_end
- j_index_start
;
2286 /* Outer loop uses 24 flops */
2289 /* Increment number of outer iterations */
2292 /* Update outer/inner flops */
2294 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W4W4_F
,outeriter
*24 + inneriter
*354);