2 * Note: this file was generated by the Gromacs sse2_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
45 (t_nblist
* gmx_restrict nlist
,
46 rvec
* gmx_restrict xx
,
47 rvec
* gmx_restrict ff
,
48 t_forcerec
* gmx_restrict fr
,
49 t_mdatoms
* gmx_restrict mdatoms
,
50 nb_kernel_data_t
* gmx_restrict kernel_data
,
51 t_nrnb
* gmx_restrict nrnb
)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
59 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
61 int j_coord_offsetA
,j_coord_offsetB
;
62 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
64 real
*shiftvec
,*fshift
,*x
,*f
;
65 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
67 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
69 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
71 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
72 int vdwjidx0A
,vdwjidx0B
;
73 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
74 int vdwjidx1A
,vdwjidx1B
;
75 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
76 int vdwjidx2A
,vdwjidx2B
;
77 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
78 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
79 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
80 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
81 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
82 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
83 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
84 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
85 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
86 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
87 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
90 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
93 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
96 __m128i ifour
= _mm_set1_epi32(4);
97 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
99 __m128d dummy_mask
,cutoff_mask
;
100 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101 __m128d one
= _mm_set1_pd(1.0);
102 __m128d two
= _mm_set1_pd(2.0);
108 jindex
= nlist
->jindex
;
110 shiftidx
= nlist
->shift
;
112 shiftvec
= fr
->shift_vec
[0];
113 fshift
= fr
->fshift
[0];
114 facel
= _mm_set1_pd(fr
->epsfac
);
115 charge
= mdatoms
->chargeA
;
116 nvdwtype
= fr
->ntype
;
118 vdwtype
= mdatoms
->typeA
;
120 vftab
= kernel_data
->table_elec_vdw
->data
;
121 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
123 /* Setup water-specific parameters */
124 inr
= nlist
->iinr
[0];
125 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
126 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
127 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
128 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
130 jq0
= _mm_set1_pd(charge
[inr
+0]);
131 jq1
= _mm_set1_pd(charge
[inr
+1]);
132 jq2
= _mm_set1_pd(charge
[inr
+2]);
133 vdwjidx0A
= 2*vdwtype
[inr
+0];
134 qq00
= _mm_mul_pd(iq0
,jq0
);
135 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
136 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
137 qq01
= _mm_mul_pd(iq0
,jq1
);
138 qq02
= _mm_mul_pd(iq0
,jq2
);
139 qq10
= _mm_mul_pd(iq1
,jq0
);
140 qq11
= _mm_mul_pd(iq1
,jq1
);
141 qq12
= _mm_mul_pd(iq1
,jq2
);
142 qq20
= _mm_mul_pd(iq2
,jq0
);
143 qq21
= _mm_mul_pd(iq2
,jq1
);
144 qq22
= _mm_mul_pd(iq2
,jq2
);
146 /* Avoid stupid compiler warnings */
154 /* Start outer loop over neighborlists */
155 for(iidx
=0; iidx
<nri
; iidx
++)
157 /* Load shift vector for this list */
158 i_shift_offset
= DIM
*shiftidx
[iidx
];
160 /* Load limits for loop over neighbors */
161 j_index_start
= jindex
[iidx
];
162 j_index_end
= jindex
[iidx
+1];
164 /* Get outer coordinate index */
166 i_coord_offset
= DIM
*inr
;
168 /* Load i particle coords and add shift vector */
169 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
170 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
172 fix0
= _mm_setzero_pd();
173 fiy0
= _mm_setzero_pd();
174 fiz0
= _mm_setzero_pd();
175 fix1
= _mm_setzero_pd();
176 fiy1
= _mm_setzero_pd();
177 fiz1
= _mm_setzero_pd();
178 fix2
= _mm_setzero_pd();
179 fiy2
= _mm_setzero_pd();
180 fiz2
= _mm_setzero_pd();
182 /* Reset potential sums */
183 velecsum
= _mm_setzero_pd();
184 vvdwsum
= _mm_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
190 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA
= DIM
*jnrA
;
194 j_coord_offsetB
= DIM
*jnrB
;
196 /* load j atom coordinates */
197 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
198 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
200 /* Calculate displacement vector */
201 dx00
= _mm_sub_pd(ix0
,jx0
);
202 dy00
= _mm_sub_pd(iy0
,jy0
);
203 dz00
= _mm_sub_pd(iz0
,jz0
);
204 dx01
= _mm_sub_pd(ix0
,jx1
);
205 dy01
= _mm_sub_pd(iy0
,jy1
);
206 dz01
= _mm_sub_pd(iz0
,jz1
);
207 dx02
= _mm_sub_pd(ix0
,jx2
);
208 dy02
= _mm_sub_pd(iy0
,jy2
);
209 dz02
= _mm_sub_pd(iz0
,jz2
);
210 dx10
= _mm_sub_pd(ix1
,jx0
);
211 dy10
= _mm_sub_pd(iy1
,jy0
);
212 dz10
= _mm_sub_pd(iz1
,jz0
);
213 dx11
= _mm_sub_pd(ix1
,jx1
);
214 dy11
= _mm_sub_pd(iy1
,jy1
);
215 dz11
= _mm_sub_pd(iz1
,jz1
);
216 dx12
= _mm_sub_pd(ix1
,jx2
);
217 dy12
= _mm_sub_pd(iy1
,jy2
);
218 dz12
= _mm_sub_pd(iz1
,jz2
);
219 dx20
= _mm_sub_pd(ix2
,jx0
);
220 dy20
= _mm_sub_pd(iy2
,jy0
);
221 dz20
= _mm_sub_pd(iz2
,jz0
);
222 dx21
= _mm_sub_pd(ix2
,jx1
);
223 dy21
= _mm_sub_pd(iy2
,jy1
);
224 dz21
= _mm_sub_pd(iz2
,jz1
);
225 dx22
= _mm_sub_pd(ix2
,jx2
);
226 dy22
= _mm_sub_pd(iy2
,jy2
);
227 dz22
= _mm_sub_pd(iz2
,jz2
);
229 /* Calculate squared distance and things based on it */
230 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
231 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
232 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
233 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
234 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
235 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
236 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
237 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
238 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
240 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
241 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
242 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
243 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
244 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
245 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
246 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
247 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
248 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
250 fjx0
= _mm_setzero_pd();
251 fjy0
= _mm_setzero_pd();
252 fjz0
= _mm_setzero_pd();
253 fjx1
= _mm_setzero_pd();
254 fjy1
= _mm_setzero_pd();
255 fjz1
= _mm_setzero_pd();
256 fjx2
= _mm_setzero_pd();
257 fjy2
= _mm_setzero_pd();
258 fjz2
= _mm_setzero_pd();
260 /**************************
261 * CALCULATE INTERACTIONS *
262 **************************/
264 r00
= _mm_mul_pd(rsq00
,rinv00
);
266 /* Calculate table index by multiplying r with table scale and truncate to integer */
267 rt
= _mm_mul_pd(r00
,vftabscale
);
268 vfitab
= _mm_cvttpd_epi32(rt
);
269 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
270 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
272 /* CUBIC SPLINE TABLE ELECTROSTATICS */
273 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
274 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
275 GMX_MM_TRANSPOSE2_PD(Y
,F
);
276 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
277 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
278 GMX_MM_TRANSPOSE2_PD(G
,H
);
279 Heps
= _mm_mul_pd(vfeps
,H
);
280 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
281 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
282 velec
= _mm_mul_pd(qq00
,VV
);
283 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
284 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
286 /* CUBIC SPLINE TABLE DISPERSION */
287 vfitab
= _mm_add_epi32(vfitab
,ifour
);
288 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
289 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
290 GMX_MM_TRANSPOSE2_PD(Y
,F
);
291 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
292 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
293 GMX_MM_TRANSPOSE2_PD(G
,H
);
294 Heps
= _mm_mul_pd(vfeps
,H
);
295 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
296 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
297 vvdw6
= _mm_mul_pd(c6_00
,VV
);
298 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
299 fvdw6
= _mm_mul_pd(c6_00
,FF
);
301 /* CUBIC SPLINE TABLE REPULSION */
302 vfitab
= _mm_add_epi32(vfitab
,ifour
);
303 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
304 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
305 GMX_MM_TRANSPOSE2_PD(Y
,F
);
306 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
307 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
308 GMX_MM_TRANSPOSE2_PD(G
,H
);
309 Heps
= _mm_mul_pd(vfeps
,H
);
310 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
311 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
312 vvdw12
= _mm_mul_pd(c12_00
,VV
);
313 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
314 fvdw12
= _mm_mul_pd(c12_00
,FF
);
315 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
316 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 velecsum
= _mm_add_pd(velecsum
,velec
);
320 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
322 fscal
= _mm_add_pd(felec
,fvdw
);
324 /* Calculate temporary vectorial force */
325 tx
= _mm_mul_pd(fscal
,dx00
);
326 ty
= _mm_mul_pd(fscal
,dy00
);
327 tz
= _mm_mul_pd(fscal
,dz00
);
329 /* Update vectorial force */
330 fix0
= _mm_add_pd(fix0
,tx
);
331 fiy0
= _mm_add_pd(fiy0
,ty
);
332 fiz0
= _mm_add_pd(fiz0
,tz
);
334 fjx0
= _mm_add_pd(fjx0
,tx
);
335 fjy0
= _mm_add_pd(fjy0
,ty
);
336 fjz0
= _mm_add_pd(fjz0
,tz
);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 r01
= _mm_mul_pd(rsq01
,rinv01
);
344 /* Calculate table index by multiplying r with table scale and truncate to integer */
345 rt
= _mm_mul_pd(r01
,vftabscale
);
346 vfitab
= _mm_cvttpd_epi32(rt
);
347 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
348 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
350 /* CUBIC SPLINE TABLE ELECTROSTATICS */
351 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
352 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
353 GMX_MM_TRANSPOSE2_PD(Y
,F
);
354 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
355 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
356 GMX_MM_TRANSPOSE2_PD(G
,H
);
357 Heps
= _mm_mul_pd(vfeps
,H
);
358 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
359 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
360 velec
= _mm_mul_pd(qq01
,VV
);
361 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
362 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum
= _mm_add_pd(velecsum
,velec
);
369 /* Calculate temporary vectorial force */
370 tx
= _mm_mul_pd(fscal
,dx01
);
371 ty
= _mm_mul_pd(fscal
,dy01
);
372 tz
= _mm_mul_pd(fscal
,dz01
);
374 /* Update vectorial force */
375 fix0
= _mm_add_pd(fix0
,tx
);
376 fiy0
= _mm_add_pd(fiy0
,ty
);
377 fiz0
= _mm_add_pd(fiz0
,tz
);
379 fjx1
= _mm_add_pd(fjx1
,tx
);
380 fjy1
= _mm_add_pd(fjy1
,ty
);
381 fjz1
= _mm_add_pd(fjz1
,tz
);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 r02
= _mm_mul_pd(rsq02
,rinv02
);
389 /* Calculate table index by multiplying r with table scale and truncate to integer */
390 rt
= _mm_mul_pd(r02
,vftabscale
);
391 vfitab
= _mm_cvttpd_epi32(rt
);
392 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
393 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
395 /* CUBIC SPLINE TABLE ELECTROSTATICS */
396 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
397 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
398 GMX_MM_TRANSPOSE2_PD(Y
,F
);
399 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
400 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
401 GMX_MM_TRANSPOSE2_PD(G
,H
);
402 Heps
= _mm_mul_pd(vfeps
,H
);
403 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
404 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
405 velec
= _mm_mul_pd(qq02
,VV
);
406 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
407 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
409 /* Update potential sum for this i atom from the interaction with this j atom. */
410 velecsum
= _mm_add_pd(velecsum
,velec
);
414 /* Calculate temporary vectorial force */
415 tx
= _mm_mul_pd(fscal
,dx02
);
416 ty
= _mm_mul_pd(fscal
,dy02
);
417 tz
= _mm_mul_pd(fscal
,dz02
);
419 /* Update vectorial force */
420 fix0
= _mm_add_pd(fix0
,tx
);
421 fiy0
= _mm_add_pd(fiy0
,ty
);
422 fiz0
= _mm_add_pd(fiz0
,tz
);
424 fjx2
= _mm_add_pd(fjx2
,tx
);
425 fjy2
= _mm_add_pd(fjy2
,ty
);
426 fjz2
= _mm_add_pd(fjz2
,tz
);
428 /**************************
429 * CALCULATE INTERACTIONS *
430 **************************/
432 r10
= _mm_mul_pd(rsq10
,rinv10
);
434 /* Calculate table index by multiplying r with table scale and truncate to integer */
435 rt
= _mm_mul_pd(r10
,vftabscale
);
436 vfitab
= _mm_cvttpd_epi32(rt
);
437 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
438 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
440 /* CUBIC SPLINE TABLE ELECTROSTATICS */
441 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
442 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
443 GMX_MM_TRANSPOSE2_PD(Y
,F
);
444 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
445 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
446 GMX_MM_TRANSPOSE2_PD(G
,H
);
447 Heps
= _mm_mul_pd(vfeps
,H
);
448 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
449 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
450 velec
= _mm_mul_pd(qq10
,VV
);
451 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
452 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
454 /* Update potential sum for this i atom from the interaction with this j atom. */
455 velecsum
= _mm_add_pd(velecsum
,velec
);
459 /* Calculate temporary vectorial force */
460 tx
= _mm_mul_pd(fscal
,dx10
);
461 ty
= _mm_mul_pd(fscal
,dy10
);
462 tz
= _mm_mul_pd(fscal
,dz10
);
464 /* Update vectorial force */
465 fix1
= _mm_add_pd(fix1
,tx
);
466 fiy1
= _mm_add_pd(fiy1
,ty
);
467 fiz1
= _mm_add_pd(fiz1
,tz
);
469 fjx0
= _mm_add_pd(fjx0
,tx
);
470 fjy0
= _mm_add_pd(fjy0
,ty
);
471 fjz0
= _mm_add_pd(fjz0
,tz
);
473 /**************************
474 * CALCULATE INTERACTIONS *
475 **************************/
477 r11
= _mm_mul_pd(rsq11
,rinv11
);
479 /* Calculate table index by multiplying r with table scale and truncate to integer */
480 rt
= _mm_mul_pd(r11
,vftabscale
);
481 vfitab
= _mm_cvttpd_epi32(rt
);
482 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
483 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
485 /* CUBIC SPLINE TABLE ELECTROSTATICS */
486 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
487 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
488 GMX_MM_TRANSPOSE2_PD(Y
,F
);
489 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
490 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
491 GMX_MM_TRANSPOSE2_PD(G
,H
);
492 Heps
= _mm_mul_pd(vfeps
,H
);
493 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
494 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
495 velec
= _mm_mul_pd(qq11
,VV
);
496 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
497 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
499 /* Update potential sum for this i atom from the interaction with this j atom. */
500 velecsum
= _mm_add_pd(velecsum
,velec
);
504 /* Calculate temporary vectorial force */
505 tx
= _mm_mul_pd(fscal
,dx11
);
506 ty
= _mm_mul_pd(fscal
,dy11
);
507 tz
= _mm_mul_pd(fscal
,dz11
);
509 /* Update vectorial force */
510 fix1
= _mm_add_pd(fix1
,tx
);
511 fiy1
= _mm_add_pd(fiy1
,ty
);
512 fiz1
= _mm_add_pd(fiz1
,tz
);
514 fjx1
= _mm_add_pd(fjx1
,tx
);
515 fjy1
= _mm_add_pd(fjy1
,ty
);
516 fjz1
= _mm_add_pd(fjz1
,tz
);
518 /**************************
519 * CALCULATE INTERACTIONS *
520 **************************/
522 r12
= _mm_mul_pd(rsq12
,rinv12
);
524 /* Calculate table index by multiplying r with table scale and truncate to integer */
525 rt
= _mm_mul_pd(r12
,vftabscale
);
526 vfitab
= _mm_cvttpd_epi32(rt
);
527 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
528 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
530 /* CUBIC SPLINE TABLE ELECTROSTATICS */
531 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
532 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
533 GMX_MM_TRANSPOSE2_PD(Y
,F
);
534 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
535 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
536 GMX_MM_TRANSPOSE2_PD(G
,H
);
537 Heps
= _mm_mul_pd(vfeps
,H
);
538 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
539 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
540 velec
= _mm_mul_pd(qq12
,VV
);
541 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
542 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
544 /* Update potential sum for this i atom from the interaction with this j atom. */
545 velecsum
= _mm_add_pd(velecsum
,velec
);
549 /* Calculate temporary vectorial force */
550 tx
= _mm_mul_pd(fscal
,dx12
);
551 ty
= _mm_mul_pd(fscal
,dy12
);
552 tz
= _mm_mul_pd(fscal
,dz12
);
554 /* Update vectorial force */
555 fix1
= _mm_add_pd(fix1
,tx
);
556 fiy1
= _mm_add_pd(fiy1
,ty
);
557 fiz1
= _mm_add_pd(fiz1
,tz
);
559 fjx2
= _mm_add_pd(fjx2
,tx
);
560 fjy2
= _mm_add_pd(fjy2
,ty
);
561 fjz2
= _mm_add_pd(fjz2
,tz
);
563 /**************************
564 * CALCULATE INTERACTIONS *
565 **************************/
567 r20
= _mm_mul_pd(rsq20
,rinv20
);
569 /* Calculate table index by multiplying r with table scale and truncate to integer */
570 rt
= _mm_mul_pd(r20
,vftabscale
);
571 vfitab
= _mm_cvttpd_epi32(rt
);
572 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
573 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
575 /* CUBIC SPLINE TABLE ELECTROSTATICS */
576 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
577 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
578 GMX_MM_TRANSPOSE2_PD(Y
,F
);
579 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
580 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
581 GMX_MM_TRANSPOSE2_PD(G
,H
);
582 Heps
= _mm_mul_pd(vfeps
,H
);
583 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
584 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
585 velec
= _mm_mul_pd(qq20
,VV
);
586 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
587 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velecsum
= _mm_add_pd(velecsum
,velec
);
594 /* Calculate temporary vectorial force */
595 tx
= _mm_mul_pd(fscal
,dx20
);
596 ty
= _mm_mul_pd(fscal
,dy20
);
597 tz
= _mm_mul_pd(fscal
,dz20
);
599 /* Update vectorial force */
600 fix2
= _mm_add_pd(fix2
,tx
);
601 fiy2
= _mm_add_pd(fiy2
,ty
);
602 fiz2
= _mm_add_pd(fiz2
,tz
);
604 fjx0
= _mm_add_pd(fjx0
,tx
);
605 fjy0
= _mm_add_pd(fjy0
,ty
);
606 fjz0
= _mm_add_pd(fjz0
,tz
);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 r21
= _mm_mul_pd(rsq21
,rinv21
);
614 /* Calculate table index by multiplying r with table scale and truncate to integer */
615 rt
= _mm_mul_pd(r21
,vftabscale
);
616 vfitab
= _mm_cvttpd_epi32(rt
);
617 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
618 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
621 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
622 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
623 GMX_MM_TRANSPOSE2_PD(Y
,F
);
624 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
625 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
626 GMX_MM_TRANSPOSE2_PD(G
,H
);
627 Heps
= _mm_mul_pd(vfeps
,H
);
628 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
629 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
630 velec
= _mm_mul_pd(qq21
,VV
);
631 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
632 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
634 /* Update potential sum for this i atom from the interaction with this j atom. */
635 velecsum
= _mm_add_pd(velecsum
,velec
);
639 /* Calculate temporary vectorial force */
640 tx
= _mm_mul_pd(fscal
,dx21
);
641 ty
= _mm_mul_pd(fscal
,dy21
);
642 tz
= _mm_mul_pd(fscal
,dz21
);
644 /* Update vectorial force */
645 fix2
= _mm_add_pd(fix2
,tx
);
646 fiy2
= _mm_add_pd(fiy2
,ty
);
647 fiz2
= _mm_add_pd(fiz2
,tz
);
649 fjx1
= _mm_add_pd(fjx1
,tx
);
650 fjy1
= _mm_add_pd(fjy1
,ty
);
651 fjz1
= _mm_add_pd(fjz1
,tz
);
653 /**************************
654 * CALCULATE INTERACTIONS *
655 **************************/
657 r22
= _mm_mul_pd(rsq22
,rinv22
);
659 /* Calculate table index by multiplying r with table scale and truncate to integer */
660 rt
= _mm_mul_pd(r22
,vftabscale
);
661 vfitab
= _mm_cvttpd_epi32(rt
);
662 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
663 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
665 /* CUBIC SPLINE TABLE ELECTROSTATICS */
666 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
667 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
668 GMX_MM_TRANSPOSE2_PD(Y
,F
);
669 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
670 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
671 GMX_MM_TRANSPOSE2_PD(G
,H
);
672 Heps
= _mm_mul_pd(vfeps
,H
);
673 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
674 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
675 velec
= _mm_mul_pd(qq22
,VV
);
676 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
677 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velecsum
= _mm_add_pd(velecsum
,velec
);
684 /* Calculate temporary vectorial force */
685 tx
= _mm_mul_pd(fscal
,dx22
);
686 ty
= _mm_mul_pd(fscal
,dy22
);
687 tz
= _mm_mul_pd(fscal
,dz22
);
689 /* Update vectorial force */
690 fix2
= _mm_add_pd(fix2
,tx
);
691 fiy2
= _mm_add_pd(fiy2
,ty
);
692 fiz2
= _mm_add_pd(fiz2
,tz
);
694 fjx2
= _mm_add_pd(fjx2
,tx
);
695 fjy2
= _mm_add_pd(fjy2
,ty
);
696 fjz2
= _mm_add_pd(fjz2
,tz
);
698 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
700 /* Inner loop uses 417 flops */
707 j_coord_offsetA
= DIM
*jnrA
;
709 /* load j atom coordinates */
710 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
711 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
713 /* Calculate displacement vector */
714 dx00
= _mm_sub_pd(ix0
,jx0
);
715 dy00
= _mm_sub_pd(iy0
,jy0
);
716 dz00
= _mm_sub_pd(iz0
,jz0
);
717 dx01
= _mm_sub_pd(ix0
,jx1
);
718 dy01
= _mm_sub_pd(iy0
,jy1
);
719 dz01
= _mm_sub_pd(iz0
,jz1
);
720 dx02
= _mm_sub_pd(ix0
,jx2
);
721 dy02
= _mm_sub_pd(iy0
,jy2
);
722 dz02
= _mm_sub_pd(iz0
,jz2
);
723 dx10
= _mm_sub_pd(ix1
,jx0
);
724 dy10
= _mm_sub_pd(iy1
,jy0
);
725 dz10
= _mm_sub_pd(iz1
,jz0
);
726 dx11
= _mm_sub_pd(ix1
,jx1
);
727 dy11
= _mm_sub_pd(iy1
,jy1
);
728 dz11
= _mm_sub_pd(iz1
,jz1
);
729 dx12
= _mm_sub_pd(ix1
,jx2
);
730 dy12
= _mm_sub_pd(iy1
,jy2
);
731 dz12
= _mm_sub_pd(iz1
,jz2
);
732 dx20
= _mm_sub_pd(ix2
,jx0
);
733 dy20
= _mm_sub_pd(iy2
,jy0
);
734 dz20
= _mm_sub_pd(iz2
,jz0
);
735 dx21
= _mm_sub_pd(ix2
,jx1
);
736 dy21
= _mm_sub_pd(iy2
,jy1
);
737 dz21
= _mm_sub_pd(iz2
,jz1
);
738 dx22
= _mm_sub_pd(ix2
,jx2
);
739 dy22
= _mm_sub_pd(iy2
,jy2
);
740 dz22
= _mm_sub_pd(iz2
,jz2
);
742 /* Calculate squared distance and things based on it */
743 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
744 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
745 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
746 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
747 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
748 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
749 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
750 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
751 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
753 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
754 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
755 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
756 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
757 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
758 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
759 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
760 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
761 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
763 fjx0
= _mm_setzero_pd();
764 fjy0
= _mm_setzero_pd();
765 fjz0
= _mm_setzero_pd();
766 fjx1
= _mm_setzero_pd();
767 fjy1
= _mm_setzero_pd();
768 fjz1
= _mm_setzero_pd();
769 fjx2
= _mm_setzero_pd();
770 fjy2
= _mm_setzero_pd();
771 fjz2
= _mm_setzero_pd();
773 /**************************
774 * CALCULATE INTERACTIONS *
775 **************************/
777 r00
= _mm_mul_pd(rsq00
,rinv00
);
779 /* Calculate table index by multiplying r with table scale and truncate to integer */
780 rt
= _mm_mul_pd(r00
,vftabscale
);
781 vfitab
= _mm_cvttpd_epi32(rt
);
782 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
783 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
785 /* CUBIC SPLINE TABLE ELECTROSTATICS */
786 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
787 F
= _mm_setzero_pd();
788 GMX_MM_TRANSPOSE2_PD(Y
,F
);
789 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
790 H
= _mm_setzero_pd();
791 GMX_MM_TRANSPOSE2_PD(G
,H
);
792 Heps
= _mm_mul_pd(vfeps
,H
);
793 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
794 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
795 velec
= _mm_mul_pd(qq00
,VV
);
796 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
797 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
799 /* CUBIC SPLINE TABLE DISPERSION */
800 vfitab
= _mm_add_epi32(vfitab
,ifour
);
801 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
802 F
= _mm_setzero_pd();
803 GMX_MM_TRANSPOSE2_PD(Y
,F
);
804 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
805 H
= _mm_setzero_pd();
806 GMX_MM_TRANSPOSE2_PD(G
,H
);
807 Heps
= _mm_mul_pd(vfeps
,H
);
808 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
809 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
810 vvdw6
= _mm_mul_pd(c6_00
,VV
);
811 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
812 fvdw6
= _mm_mul_pd(c6_00
,FF
);
814 /* CUBIC SPLINE TABLE REPULSION */
815 vfitab
= _mm_add_epi32(vfitab
,ifour
);
816 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
817 F
= _mm_setzero_pd();
818 GMX_MM_TRANSPOSE2_PD(Y
,F
);
819 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
820 H
= _mm_setzero_pd();
821 GMX_MM_TRANSPOSE2_PD(G
,H
);
822 Heps
= _mm_mul_pd(vfeps
,H
);
823 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
824 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
825 vvdw12
= _mm_mul_pd(c12_00
,VV
);
826 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
827 fvdw12
= _mm_mul_pd(c12_00
,FF
);
828 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
829 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
831 /* Update potential sum for this i atom from the interaction with this j atom. */
832 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
833 velecsum
= _mm_add_pd(velecsum
,velec
);
834 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
835 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
837 fscal
= _mm_add_pd(felec
,fvdw
);
839 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
841 /* Calculate temporary vectorial force */
842 tx
= _mm_mul_pd(fscal
,dx00
);
843 ty
= _mm_mul_pd(fscal
,dy00
);
844 tz
= _mm_mul_pd(fscal
,dz00
);
846 /* Update vectorial force */
847 fix0
= _mm_add_pd(fix0
,tx
);
848 fiy0
= _mm_add_pd(fiy0
,ty
);
849 fiz0
= _mm_add_pd(fiz0
,tz
);
851 fjx0
= _mm_add_pd(fjx0
,tx
);
852 fjy0
= _mm_add_pd(fjy0
,ty
);
853 fjz0
= _mm_add_pd(fjz0
,tz
);
855 /**************************
856 * CALCULATE INTERACTIONS *
857 **************************/
859 r01
= _mm_mul_pd(rsq01
,rinv01
);
861 /* Calculate table index by multiplying r with table scale and truncate to integer */
862 rt
= _mm_mul_pd(r01
,vftabscale
);
863 vfitab
= _mm_cvttpd_epi32(rt
);
864 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
865 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
867 /* CUBIC SPLINE TABLE ELECTROSTATICS */
868 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
869 F
= _mm_setzero_pd();
870 GMX_MM_TRANSPOSE2_PD(Y
,F
);
871 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
872 H
= _mm_setzero_pd();
873 GMX_MM_TRANSPOSE2_PD(G
,H
);
874 Heps
= _mm_mul_pd(vfeps
,H
);
875 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
876 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
877 velec
= _mm_mul_pd(qq01
,VV
);
878 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
879 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
881 /* Update potential sum for this i atom from the interaction with this j atom. */
882 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
883 velecsum
= _mm_add_pd(velecsum
,velec
);
887 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
889 /* Calculate temporary vectorial force */
890 tx
= _mm_mul_pd(fscal
,dx01
);
891 ty
= _mm_mul_pd(fscal
,dy01
);
892 tz
= _mm_mul_pd(fscal
,dz01
);
894 /* Update vectorial force */
895 fix0
= _mm_add_pd(fix0
,tx
);
896 fiy0
= _mm_add_pd(fiy0
,ty
);
897 fiz0
= _mm_add_pd(fiz0
,tz
);
899 fjx1
= _mm_add_pd(fjx1
,tx
);
900 fjy1
= _mm_add_pd(fjy1
,ty
);
901 fjz1
= _mm_add_pd(fjz1
,tz
);
903 /**************************
904 * CALCULATE INTERACTIONS *
905 **************************/
907 r02
= _mm_mul_pd(rsq02
,rinv02
);
909 /* Calculate table index by multiplying r with table scale and truncate to integer */
910 rt
= _mm_mul_pd(r02
,vftabscale
);
911 vfitab
= _mm_cvttpd_epi32(rt
);
912 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
913 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
915 /* CUBIC SPLINE TABLE ELECTROSTATICS */
916 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
917 F
= _mm_setzero_pd();
918 GMX_MM_TRANSPOSE2_PD(Y
,F
);
919 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
920 H
= _mm_setzero_pd();
921 GMX_MM_TRANSPOSE2_PD(G
,H
);
922 Heps
= _mm_mul_pd(vfeps
,H
);
923 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
924 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
925 velec
= _mm_mul_pd(qq02
,VV
);
926 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
927 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
931 velecsum
= _mm_add_pd(velecsum
,velec
);
935 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
937 /* Calculate temporary vectorial force */
938 tx
= _mm_mul_pd(fscal
,dx02
);
939 ty
= _mm_mul_pd(fscal
,dy02
);
940 tz
= _mm_mul_pd(fscal
,dz02
);
942 /* Update vectorial force */
943 fix0
= _mm_add_pd(fix0
,tx
);
944 fiy0
= _mm_add_pd(fiy0
,ty
);
945 fiz0
= _mm_add_pd(fiz0
,tz
);
947 fjx2
= _mm_add_pd(fjx2
,tx
);
948 fjy2
= _mm_add_pd(fjy2
,ty
);
949 fjz2
= _mm_add_pd(fjz2
,tz
);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 r10
= _mm_mul_pd(rsq10
,rinv10
);
957 /* Calculate table index by multiplying r with table scale and truncate to integer */
958 rt
= _mm_mul_pd(r10
,vftabscale
);
959 vfitab
= _mm_cvttpd_epi32(rt
);
960 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
961 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
963 /* CUBIC SPLINE TABLE ELECTROSTATICS */
964 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
965 F
= _mm_setzero_pd();
966 GMX_MM_TRANSPOSE2_PD(Y
,F
);
967 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
968 H
= _mm_setzero_pd();
969 GMX_MM_TRANSPOSE2_PD(G
,H
);
970 Heps
= _mm_mul_pd(vfeps
,H
);
971 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
972 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
973 velec
= _mm_mul_pd(qq10
,VV
);
974 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
975 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
977 /* Update potential sum for this i atom from the interaction with this j atom. */
978 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
979 velecsum
= _mm_add_pd(velecsum
,velec
);
983 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
985 /* Calculate temporary vectorial force */
986 tx
= _mm_mul_pd(fscal
,dx10
);
987 ty
= _mm_mul_pd(fscal
,dy10
);
988 tz
= _mm_mul_pd(fscal
,dz10
);
990 /* Update vectorial force */
991 fix1
= _mm_add_pd(fix1
,tx
);
992 fiy1
= _mm_add_pd(fiy1
,ty
);
993 fiz1
= _mm_add_pd(fiz1
,tz
);
995 fjx0
= _mm_add_pd(fjx0
,tx
);
996 fjy0
= _mm_add_pd(fjy0
,ty
);
997 fjz0
= _mm_add_pd(fjz0
,tz
);
999 /**************************
1000 * CALCULATE INTERACTIONS *
1001 **************************/
1003 r11
= _mm_mul_pd(rsq11
,rinv11
);
1005 /* Calculate table index by multiplying r with table scale and truncate to integer */
1006 rt
= _mm_mul_pd(r11
,vftabscale
);
1007 vfitab
= _mm_cvttpd_epi32(rt
);
1008 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1009 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1011 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1013 F
= _mm_setzero_pd();
1014 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1015 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1016 H
= _mm_setzero_pd();
1017 GMX_MM_TRANSPOSE2_PD(G
,H
);
1018 Heps
= _mm_mul_pd(vfeps
,H
);
1019 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1020 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1021 velec
= _mm_mul_pd(qq11
,VV
);
1022 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1023 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1027 velecsum
= _mm_add_pd(velecsum
,velec
);
1031 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1033 /* Calculate temporary vectorial force */
1034 tx
= _mm_mul_pd(fscal
,dx11
);
1035 ty
= _mm_mul_pd(fscal
,dy11
);
1036 tz
= _mm_mul_pd(fscal
,dz11
);
1038 /* Update vectorial force */
1039 fix1
= _mm_add_pd(fix1
,tx
);
1040 fiy1
= _mm_add_pd(fiy1
,ty
);
1041 fiz1
= _mm_add_pd(fiz1
,tz
);
1043 fjx1
= _mm_add_pd(fjx1
,tx
);
1044 fjy1
= _mm_add_pd(fjy1
,ty
);
1045 fjz1
= _mm_add_pd(fjz1
,tz
);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 r12
= _mm_mul_pd(rsq12
,rinv12
);
1053 /* Calculate table index by multiplying r with table scale and truncate to integer */
1054 rt
= _mm_mul_pd(r12
,vftabscale
);
1055 vfitab
= _mm_cvttpd_epi32(rt
);
1056 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1057 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1059 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1060 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1061 F
= _mm_setzero_pd();
1062 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1063 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1064 H
= _mm_setzero_pd();
1065 GMX_MM_TRANSPOSE2_PD(G
,H
);
1066 Heps
= _mm_mul_pd(vfeps
,H
);
1067 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1068 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1069 velec
= _mm_mul_pd(qq12
,VV
);
1070 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1071 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1073 /* Update potential sum for this i atom from the interaction with this j atom. */
1074 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1075 velecsum
= _mm_add_pd(velecsum
,velec
);
1079 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1081 /* Calculate temporary vectorial force */
1082 tx
= _mm_mul_pd(fscal
,dx12
);
1083 ty
= _mm_mul_pd(fscal
,dy12
);
1084 tz
= _mm_mul_pd(fscal
,dz12
);
1086 /* Update vectorial force */
1087 fix1
= _mm_add_pd(fix1
,tx
);
1088 fiy1
= _mm_add_pd(fiy1
,ty
);
1089 fiz1
= _mm_add_pd(fiz1
,tz
);
1091 fjx2
= _mm_add_pd(fjx2
,tx
);
1092 fjy2
= _mm_add_pd(fjy2
,ty
);
1093 fjz2
= _mm_add_pd(fjz2
,tz
);
1095 /**************************
1096 * CALCULATE INTERACTIONS *
1097 **************************/
1099 r20
= _mm_mul_pd(rsq20
,rinv20
);
1101 /* Calculate table index by multiplying r with table scale and truncate to integer */
1102 rt
= _mm_mul_pd(r20
,vftabscale
);
1103 vfitab
= _mm_cvttpd_epi32(rt
);
1104 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1105 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1109 F
= _mm_setzero_pd();
1110 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1111 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1112 H
= _mm_setzero_pd();
1113 GMX_MM_TRANSPOSE2_PD(G
,H
);
1114 Heps
= _mm_mul_pd(vfeps
,H
);
1115 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1116 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1117 velec
= _mm_mul_pd(qq20
,VV
);
1118 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1119 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1123 velecsum
= _mm_add_pd(velecsum
,velec
);
1127 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1129 /* Calculate temporary vectorial force */
1130 tx
= _mm_mul_pd(fscal
,dx20
);
1131 ty
= _mm_mul_pd(fscal
,dy20
);
1132 tz
= _mm_mul_pd(fscal
,dz20
);
1134 /* Update vectorial force */
1135 fix2
= _mm_add_pd(fix2
,tx
);
1136 fiy2
= _mm_add_pd(fiy2
,ty
);
1137 fiz2
= _mm_add_pd(fiz2
,tz
);
1139 fjx0
= _mm_add_pd(fjx0
,tx
);
1140 fjy0
= _mm_add_pd(fjy0
,ty
);
1141 fjz0
= _mm_add_pd(fjz0
,tz
);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 r21
= _mm_mul_pd(rsq21
,rinv21
);
1149 /* Calculate table index by multiplying r with table scale and truncate to integer */
1150 rt
= _mm_mul_pd(r21
,vftabscale
);
1151 vfitab
= _mm_cvttpd_epi32(rt
);
1152 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1153 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1155 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1156 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1157 F
= _mm_setzero_pd();
1158 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1159 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1160 H
= _mm_setzero_pd();
1161 GMX_MM_TRANSPOSE2_PD(G
,H
);
1162 Heps
= _mm_mul_pd(vfeps
,H
);
1163 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1164 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1165 velec
= _mm_mul_pd(qq21
,VV
);
1166 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1167 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1169 /* Update potential sum for this i atom from the interaction with this j atom. */
1170 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1171 velecsum
= _mm_add_pd(velecsum
,velec
);
1175 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1177 /* Calculate temporary vectorial force */
1178 tx
= _mm_mul_pd(fscal
,dx21
);
1179 ty
= _mm_mul_pd(fscal
,dy21
);
1180 tz
= _mm_mul_pd(fscal
,dz21
);
1182 /* Update vectorial force */
1183 fix2
= _mm_add_pd(fix2
,tx
);
1184 fiy2
= _mm_add_pd(fiy2
,ty
);
1185 fiz2
= _mm_add_pd(fiz2
,tz
);
1187 fjx1
= _mm_add_pd(fjx1
,tx
);
1188 fjy1
= _mm_add_pd(fjy1
,ty
);
1189 fjz1
= _mm_add_pd(fjz1
,tz
);
1191 /**************************
1192 * CALCULATE INTERACTIONS *
1193 **************************/
1195 r22
= _mm_mul_pd(rsq22
,rinv22
);
1197 /* Calculate table index by multiplying r with table scale and truncate to integer */
1198 rt
= _mm_mul_pd(r22
,vftabscale
);
1199 vfitab
= _mm_cvttpd_epi32(rt
);
1200 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1201 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1203 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1205 F
= _mm_setzero_pd();
1206 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1207 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1208 H
= _mm_setzero_pd();
1209 GMX_MM_TRANSPOSE2_PD(G
,H
);
1210 Heps
= _mm_mul_pd(vfeps
,H
);
1211 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1212 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1213 velec
= _mm_mul_pd(qq22
,VV
);
1214 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1215 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1217 /* Update potential sum for this i atom from the interaction with this j atom. */
1218 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1219 velecsum
= _mm_add_pd(velecsum
,velec
);
1223 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1225 /* Calculate temporary vectorial force */
1226 tx
= _mm_mul_pd(fscal
,dx22
);
1227 ty
= _mm_mul_pd(fscal
,dy22
);
1228 tz
= _mm_mul_pd(fscal
,dz22
);
1230 /* Update vectorial force */
1231 fix2
= _mm_add_pd(fix2
,tx
);
1232 fiy2
= _mm_add_pd(fiy2
,ty
);
1233 fiz2
= _mm_add_pd(fiz2
,tz
);
1235 fjx2
= _mm_add_pd(fjx2
,tx
);
1236 fjy2
= _mm_add_pd(fjy2
,ty
);
1237 fjz2
= _mm_add_pd(fjz2
,tz
);
1239 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1241 /* Inner loop uses 417 flops */
1244 /* End of innermost loop */
1246 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1247 f
+i_coord_offset
,fshift
+i_shift_offset
);
1250 /* Update potential energies */
1251 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1252 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1254 /* Increment number of inner iterations */
1255 inneriter
+= j_index_end
- j_index_start
;
1257 /* Outer loop uses 20 flops */
1260 /* Increment number of outer iterations */
1263 /* Update outer/inner flops */
1265 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*417);
1268 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1269 * Electrostatics interaction: CubicSplineTable
1270 * VdW interaction: CubicSplineTable
1271 * Geometry: Water3-Water3
1272 * Calculate force/pot: Force
1275 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1276 (t_nblist
* gmx_restrict nlist
,
1277 rvec
* gmx_restrict xx
,
1278 rvec
* gmx_restrict ff
,
1279 t_forcerec
* gmx_restrict fr
,
1280 t_mdatoms
* gmx_restrict mdatoms
,
1281 nb_kernel_data_t
* gmx_restrict kernel_data
,
1282 t_nrnb
* gmx_restrict nrnb
)
1284 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1285 * just 0 for non-waters.
1286 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1287 * jnr indices corresponding to data put in the two positions in the SIMD register.
1289 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1290 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1292 int j_coord_offsetA
,j_coord_offsetB
;
1293 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1294 real rcutoff_scalar
;
1295 real
*shiftvec
,*fshift
,*x
,*f
;
1296 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1298 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1300 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1302 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1303 int vdwjidx0A
,vdwjidx0B
;
1304 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1305 int vdwjidx1A
,vdwjidx1B
;
1306 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1307 int vdwjidx2A
,vdwjidx2B
;
1308 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1309 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1310 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1311 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1312 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1313 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1314 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1315 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1316 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1317 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1318 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1321 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1324 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1325 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1327 __m128i ifour
= _mm_set1_epi32(4);
1328 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1330 __m128d dummy_mask
,cutoff_mask
;
1331 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1332 __m128d one
= _mm_set1_pd(1.0);
1333 __m128d two
= _mm_set1_pd(2.0);
1339 jindex
= nlist
->jindex
;
1341 shiftidx
= nlist
->shift
;
1343 shiftvec
= fr
->shift_vec
[0];
1344 fshift
= fr
->fshift
[0];
1345 facel
= _mm_set1_pd(fr
->epsfac
);
1346 charge
= mdatoms
->chargeA
;
1347 nvdwtype
= fr
->ntype
;
1348 vdwparam
= fr
->nbfp
;
1349 vdwtype
= mdatoms
->typeA
;
1351 vftab
= kernel_data
->table_elec_vdw
->data
;
1352 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
1354 /* Setup water-specific parameters */
1355 inr
= nlist
->iinr
[0];
1356 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1357 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1358 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1359 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1361 jq0
= _mm_set1_pd(charge
[inr
+0]);
1362 jq1
= _mm_set1_pd(charge
[inr
+1]);
1363 jq2
= _mm_set1_pd(charge
[inr
+2]);
1364 vdwjidx0A
= 2*vdwtype
[inr
+0];
1365 qq00
= _mm_mul_pd(iq0
,jq0
);
1366 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1367 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1368 qq01
= _mm_mul_pd(iq0
,jq1
);
1369 qq02
= _mm_mul_pd(iq0
,jq2
);
1370 qq10
= _mm_mul_pd(iq1
,jq0
);
1371 qq11
= _mm_mul_pd(iq1
,jq1
);
1372 qq12
= _mm_mul_pd(iq1
,jq2
);
1373 qq20
= _mm_mul_pd(iq2
,jq0
);
1374 qq21
= _mm_mul_pd(iq2
,jq1
);
1375 qq22
= _mm_mul_pd(iq2
,jq2
);
1377 /* Avoid stupid compiler warnings */
1379 j_coord_offsetA
= 0;
1380 j_coord_offsetB
= 0;
1385 /* Start outer loop over neighborlists */
1386 for(iidx
=0; iidx
<nri
; iidx
++)
1388 /* Load shift vector for this list */
1389 i_shift_offset
= DIM
*shiftidx
[iidx
];
1391 /* Load limits for loop over neighbors */
1392 j_index_start
= jindex
[iidx
];
1393 j_index_end
= jindex
[iidx
+1];
1395 /* Get outer coordinate index */
1397 i_coord_offset
= DIM
*inr
;
1399 /* Load i particle coords and add shift vector */
1400 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1401 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1403 fix0
= _mm_setzero_pd();
1404 fiy0
= _mm_setzero_pd();
1405 fiz0
= _mm_setzero_pd();
1406 fix1
= _mm_setzero_pd();
1407 fiy1
= _mm_setzero_pd();
1408 fiz1
= _mm_setzero_pd();
1409 fix2
= _mm_setzero_pd();
1410 fiy2
= _mm_setzero_pd();
1411 fiz2
= _mm_setzero_pd();
1413 /* Start inner kernel loop */
1414 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1417 /* Get j neighbor index, and coordinate index */
1419 jnrB
= jjnr
[jidx
+1];
1420 j_coord_offsetA
= DIM
*jnrA
;
1421 j_coord_offsetB
= DIM
*jnrB
;
1423 /* load j atom coordinates */
1424 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1425 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1427 /* Calculate displacement vector */
1428 dx00
= _mm_sub_pd(ix0
,jx0
);
1429 dy00
= _mm_sub_pd(iy0
,jy0
);
1430 dz00
= _mm_sub_pd(iz0
,jz0
);
1431 dx01
= _mm_sub_pd(ix0
,jx1
);
1432 dy01
= _mm_sub_pd(iy0
,jy1
);
1433 dz01
= _mm_sub_pd(iz0
,jz1
);
1434 dx02
= _mm_sub_pd(ix0
,jx2
);
1435 dy02
= _mm_sub_pd(iy0
,jy2
);
1436 dz02
= _mm_sub_pd(iz0
,jz2
);
1437 dx10
= _mm_sub_pd(ix1
,jx0
);
1438 dy10
= _mm_sub_pd(iy1
,jy0
);
1439 dz10
= _mm_sub_pd(iz1
,jz0
);
1440 dx11
= _mm_sub_pd(ix1
,jx1
);
1441 dy11
= _mm_sub_pd(iy1
,jy1
);
1442 dz11
= _mm_sub_pd(iz1
,jz1
);
1443 dx12
= _mm_sub_pd(ix1
,jx2
);
1444 dy12
= _mm_sub_pd(iy1
,jy2
);
1445 dz12
= _mm_sub_pd(iz1
,jz2
);
1446 dx20
= _mm_sub_pd(ix2
,jx0
);
1447 dy20
= _mm_sub_pd(iy2
,jy0
);
1448 dz20
= _mm_sub_pd(iz2
,jz0
);
1449 dx21
= _mm_sub_pd(ix2
,jx1
);
1450 dy21
= _mm_sub_pd(iy2
,jy1
);
1451 dz21
= _mm_sub_pd(iz2
,jz1
);
1452 dx22
= _mm_sub_pd(ix2
,jx2
);
1453 dy22
= _mm_sub_pd(iy2
,jy2
);
1454 dz22
= _mm_sub_pd(iz2
,jz2
);
1456 /* Calculate squared distance and things based on it */
1457 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1458 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1459 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1460 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1461 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1462 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1463 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1464 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1465 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1467 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1468 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1469 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1470 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1471 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1472 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1473 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1474 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1475 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1477 fjx0
= _mm_setzero_pd();
1478 fjy0
= _mm_setzero_pd();
1479 fjz0
= _mm_setzero_pd();
1480 fjx1
= _mm_setzero_pd();
1481 fjy1
= _mm_setzero_pd();
1482 fjz1
= _mm_setzero_pd();
1483 fjx2
= _mm_setzero_pd();
1484 fjy2
= _mm_setzero_pd();
1485 fjz2
= _mm_setzero_pd();
1487 /**************************
1488 * CALCULATE INTERACTIONS *
1489 **************************/
1491 r00
= _mm_mul_pd(rsq00
,rinv00
);
1493 /* Calculate table index by multiplying r with table scale and truncate to integer */
1494 rt
= _mm_mul_pd(r00
,vftabscale
);
1495 vfitab
= _mm_cvttpd_epi32(rt
);
1496 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1497 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1499 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1500 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1501 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1502 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1503 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1504 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1505 GMX_MM_TRANSPOSE2_PD(G
,H
);
1506 Heps
= _mm_mul_pd(vfeps
,H
);
1507 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1508 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1509 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1511 /* CUBIC SPLINE TABLE DISPERSION */
1512 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1513 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1514 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1515 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1516 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1517 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1518 GMX_MM_TRANSPOSE2_PD(G
,H
);
1519 Heps
= _mm_mul_pd(vfeps
,H
);
1520 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1521 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1522 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1524 /* CUBIC SPLINE TABLE REPULSION */
1525 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1526 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1527 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1528 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1529 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1530 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1531 GMX_MM_TRANSPOSE2_PD(G
,H
);
1532 Heps
= _mm_mul_pd(vfeps
,H
);
1533 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1534 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1535 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1536 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1538 fscal
= _mm_add_pd(felec
,fvdw
);
1540 /* Calculate temporary vectorial force */
1541 tx
= _mm_mul_pd(fscal
,dx00
);
1542 ty
= _mm_mul_pd(fscal
,dy00
);
1543 tz
= _mm_mul_pd(fscal
,dz00
);
1545 /* Update vectorial force */
1546 fix0
= _mm_add_pd(fix0
,tx
);
1547 fiy0
= _mm_add_pd(fiy0
,ty
);
1548 fiz0
= _mm_add_pd(fiz0
,tz
);
1550 fjx0
= _mm_add_pd(fjx0
,tx
);
1551 fjy0
= _mm_add_pd(fjy0
,ty
);
1552 fjz0
= _mm_add_pd(fjz0
,tz
);
1554 /**************************
1555 * CALCULATE INTERACTIONS *
1556 **************************/
1558 r01
= _mm_mul_pd(rsq01
,rinv01
);
1560 /* Calculate table index by multiplying r with table scale and truncate to integer */
1561 rt
= _mm_mul_pd(r01
,vftabscale
);
1562 vfitab
= _mm_cvttpd_epi32(rt
);
1563 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1564 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1566 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1567 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1568 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1569 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1570 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1571 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1572 GMX_MM_TRANSPOSE2_PD(G
,H
);
1573 Heps
= _mm_mul_pd(vfeps
,H
);
1574 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1575 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1576 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
1580 /* Calculate temporary vectorial force */
1581 tx
= _mm_mul_pd(fscal
,dx01
);
1582 ty
= _mm_mul_pd(fscal
,dy01
);
1583 tz
= _mm_mul_pd(fscal
,dz01
);
1585 /* Update vectorial force */
1586 fix0
= _mm_add_pd(fix0
,tx
);
1587 fiy0
= _mm_add_pd(fiy0
,ty
);
1588 fiz0
= _mm_add_pd(fiz0
,tz
);
1590 fjx1
= _mm_add_pd(fjx1
,tx
);
1591 fjy1
= _mm_add_pd(fjy1
,ty
);
1592 fjz1
= _mm_add_pd(fjz1
,tz
);
1594 /**************************
1595 * CALCULATE INTERACTIONS *
1596 **************************/
1598 r02
= _mm_mul_pd(rsq02
,rinv02
);
1600 /* Calculate table index by multiplying r with table scale and truncate to integer */
1601 rt
= _mm_mul_pd(r02
,vftabscale
);
1602 vfitab
= _mm_cvttpd_epi32(rt
);
1603 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1604 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1606 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1607 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1608 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1609 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1610 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1611 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1612 GMX_MM_TRANSPOSE2_PD(G
,H
);
1613 Heps
= _mm_mul_pd(vfeps
,H
);
1614 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1615 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1616 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
1620 /* Calculate temporary vectorial force */
1621 tx
= _mm_mul_pd(fscal
,dx02
);
1622 ty
= _mm_mul_pd(fscal
,dy02
);
1623 tz
= _mm_mul_pd(fscal
,dz02
);
1625 /* Update vectorial force */
1626 fix0
= _mm_add_pd(fix0
,tx
);
1627 fiy0
= _mm_add_pd(fiy0
,ty
);
1628 fiz0
= _mm_add_pd(fiz0
,tz
);
1630 fjx2
= _mm_add_pd(fjx2
,tx
);
1631 fjy2
= _mm_add_pd(fjy2
,ty
);
1632 fjz2
= _mm_add_pd(fjz2
,tz
);
1634 /**************************
1635 * CALCULATE INTERACTIONS *
1636 **************************/
1638 r10
= _mm_mul_pd(rsq10
,rinv10
);
1640 /* Calculate table index by multiplying r with table scale and truncate to integer */
1641 rt
= _mm_mul_pd(r10
,vftabscale
);
1642 vfitab
= _mm_cvttpd_epi32(rt
);
1643 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1644 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1646 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1647 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1648 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1649 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1650 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1651 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1652 GMX_MM_TRANSPOSE2_PD(G
,H
);
1653 Heps
= _mm_mul_pd(vfeps
,H
);
1654 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1655 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1656 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1660 /* Calculate temporary vectorial force */
1661 tx
= _mm_mul_pd(fscal
,dx10
);
1662 ty
= _mm_mul_pd(fscal
,dy10
);
1663 tz
= _mm_mul_pd(fscal
,dz10
);
1665 /* Update vectorial force */
1666 fix1
= _mm_add_pd(fix1
,tx
);
1667 fiy1
= _mm_add_pd(fiy1
,ty
);
1668 fiz1
= _mm_add_pd(fiz1
,tz
);
1670 fjx0
= _mm_add_pd(fjx0
,tx
);
1671 fjy0
= _mm_add_pd(fjy0
,ty
);
1672 fjz0
= _mm_add_pd(fjz0
,tz
);
1674 /**************************
1675 * CALCULATE INTERACTIONS *
1676 **************************/
1678 r11
= _mm_mul_pd(rsq11
,rinv11
);
1680 /* Calculate table index by multiplying r with table scale and truncate to integer */
1681 rt
= _mm_mul_pd(r11
,vftabscale
);
1682 vfitab
= _mm_cvttpd_epi32(rt
);
1683 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1684 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1686 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1687 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1688 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1689 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1690 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1691 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1692 GMX_MM_TRANSPOSE2_PD(G
,H
);
1693 Heps
= _mm_mul_pd(vfeps
,H
);
1694 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1695 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1696 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1700 /* Calculate temporary vectorial force */
1701 tx
= _mm_mul_pd(fscal
,dx11
);
1702 ty
= _mm_mul_pd(fscal
,dy11
);
1703 tz
= _mm_mul_pd(fscal
,dz11
);
1705 /* Update vectorial force */
1706 fix1
= _mm_add_pd(fix1
,tx
);
1707 fiy1
= _mm_add_pd(fiy1
,ty
);
1708 fiz1
= _mm_add_pd(fiz1
,tz
);
1710 fjx1
= _mm_add_pd(fjx1
,tx
);
1711 fjy1
= _mm_add_pd(fjy1
,ty
);
1712 fjz1
= _mm_add_pd(fjz1
,tz
);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 r12
= _mm_mul_pd(rsq12
,rinv12
);
1720 /* Calculate table index by multiplying r with table scale and truncate to integer */
1721 rt
= _mm_mul_pd(r12
,vftabscale
);
1722 vfitab
= _mm_cvttpd_epi32(rt
);
1723 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1724 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1726 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1727 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1728 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1729 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1730 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1731 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1732 GMX_MM_TRANSPOSE2_PD(G
,H
);
1733 Heps
= _mm_mul_pd(vfeps
,H
);
1734 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1735 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1736 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1740 /* Calculate temporary vectorial force */
1741 tx
= _mm_mul_pd(fscal
,dx12
);
1742 ty
= _mm_mul_pd(fscal
,dy12
);
1743 tz
= _mm_mul_pd(fscal
,dz12
);
1745 /* Update vectorial force */
1746 fix1
= _mm_add_pd(fix1
,tx
);
1747 fiy1
= _mm_add_pd(fiy1
,ty
);
1748 fiz1
= _mm_add_pd(fiz1
,tz
);
1750 fjx2
= _mm_add_pd(fjx2
,tx
);
1751 fjy2
= _mm_add_pd(fjy2
,ty
);
1752 fjz2
= _mm_add_pd(fjz2
,tz
);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 r20
= _mm_mul_pd(rsq20
,rinv20
);
1760 /* Calculate table index by multiplying r with table scale and truncate to integer */
1761 rt
= _mm_mul_pd(r20
,vftabscale
);
1762 vfitab
= _mm_cvttpd_epi32(rt
);
1763 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1764 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1766 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1767 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1768 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1769 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1770 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1771 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1772 GMX_MM_TRANSPOSE2_PD(G
,H
);
1773 Heps
= _mm_mul_pd(vfeps
,H
);
1774 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1775 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1776 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1780 /* Calculate temporary vectorial force */
1781 tx
= _mm_mul_pd(fscal
,dx20
);
1782 ty
= _mm_mul_pd(fscal
,dy20
);
1783 tz
= _mm_mul_pd(fscal
,dz20
);
1785 /* Update vectorial force */
1786 fix2
= _mm_add_pd(fix2
,tx
);
1787 fiy2
= _mm_add_pd(fiy2
,ty
);
1788 fiz2
= _mm_add_pd(fiz2
,tz
);
1790 fjx0
= _mm_add_pd(fjx0
,tx
);
1791 fjy0
= _mm_add_pd(fjy0
,ty
);
1792 fjz0
= _mm_add_pd(fjz0
,tz
);
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 r21
= _mm_mul_pd(rsq21
,rinv21
);
1800 /* Calculate table index by multiplying r with table scale and truncate to integer */
1801 rt
= _mm_mul_pd(r21
,vftabscale
);
1802 vfitab
= _mm_cvttpd_epi32(rt
);
1803 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1804 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1806 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1807 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1808 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1809 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1810 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1811 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1812 GMX_MM_TRANSPOSE2_PD(G
,H
);
1813 Heps
= _mm_mul_pd(vfeps
,H
);
1814 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1815 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1816 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1820 /* Calculate temporary vectorial force */
1821 tx
= _mm_mul_pd(fscal
,dx21
);
1822 ty
= _mm_mul_pd(fscal
,dy21
);
1823 tz
= _mm_mul_pd(fscal
,dz21
);
1825 /* Update vectorial force */
1826 fix2
= _mm_add_pd(fix2
,tx
);
1827 fiy2
= _mm_add_pd(fiy2
,ty
);
1828 fiz2
= _mm_add_pd(fiz2
,tz
);
1830 fjx1
= _mm_add_pd(fjx1
,tx
);
1831 fjy1
= _mm_add_pd(fjy1
,ty
);
1832 fjz1
= _mm_add_pd(fjz1
,tz
);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r22
= _mm_mul_pd(rsq22
,rinv22
);
1840 /* Calculate table index by multiplying r with table scale and truncate to integer */
1841 rt
= _mm_mul_pd(r22
,vftabscale
);
1842 vfitab
= _mm_cvttpd_epi32(rt
);
1843 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1844 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1846 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1847 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1848 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1849 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1850 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1851 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1852 GMX_MM_TRANSPOSE2_PD(G
,H
);
1853 Heps
= _mm_mul_pd(vfeps
,H
);
1854 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1855 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1856 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1860 /* Calculate temporary vectorial force */
1861 tx
= _mm_mul_pd(fscal
,dx22
);
1862 ty
= _mm_mul_pd(fscal
,dy22
);
1863 tz
= _mm_mul_pd(fscal
,dz22
);
1865 /* Update vectorial force */
1866 fix2
= _mm_add_pd(fix2
,tx
);
1867 fiy2
= _mm_add_pd(fiy2
,ty
);
1868 fiz2
= _mm_add_pd(fiz2
,tz
);
1870 fjx2
= _mm_add_pd(fjx2
,tx
);
1871 fjy2
= _mm_add_pd(fjy2
,ty
);
1872 fjz2
= _mm_add_pd(fjz2
,tz
);
1874 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1876 /* Inner loop uses 373 flops */
1879 if(jidx
<j_index_end
)
1883 j_coord_offsetA
= DIM
*jnrA
;
1885 /* load j atom coordinates */
1886 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1887 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1889 /* Calculate displacement vector */
1890 dx00
= _mm_sub_pd(ix0
,jx0
);
1891 dy00
= _mm_sub_pd(iy0
,jy0
);
1892 dz00
= _mm_sub_pd(iz0
,jz0
);
1893 dx01
= _mm_sub_pd(ix0
,jx1
);
1894 dy01
= _mm_sub_pd(iy0
,jy1
);
1895 dz01
= _mm_sub_pd(iz0
,jz1
);
1896 dx02
= _mm_sub_pd(ix0
,jx2
);
1897 dy02
= _mm_sub_pd(iy0
,jy2
);
1898 dz02
= _mm_sub_pd(iz0
,jz2
);
1899 dx10
= _mm_sub_pd(ix1
,jx0
);
1900 dy10
= _mm_sub_pd(iy1
,jy0
);
1901 dz10
= _mm_sub_pd(iz1
,jz0
);
1902 dx11
= _mm_sub_pd(ix1
,jx1
);
1903 dy11
= _mm_sub_pd(iy1
,jy1
);
1904 dz11
= _mm_sub_pd(iz1
,jz1
);
1905 dx12
= _mm_sub_pd(ix1
,jx2
);
1906 dy12
= _mm_sub_pd(iy1
,jy2
);
1907 dz12
= _mm_sub_pd(iz1
,jz2
);
1908 dx20
= _mm_sub_pd(ix2
,jx0
);
1909 dy20
= _mm_sub_pd(iy2
,jy0
);
1910 dz20
= _mm_sub_pd(iz2
,jz0
);
1911 dx21
= _mm_sub_pd(ix2
,jx1
);
1912 dy21
= _mm_sub_pd(iy2
,jy1
);
1913 dz21
= _mm_sub_pd(iz2
,jz1
);
1914 dx22
= _mm_sub_pd(ix2
,jx2
);
1915 dy22
= _mm_sub_pd(iy2
,jy2
);
1916 dz22
= _mm_sub_pd(iz2
,jz2
);
1918 /* Calculate squared distance and things based on it */
1919 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1920 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1921 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1922 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1923 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1924 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1925 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1926 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1927 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1929 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1930 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1931 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1932 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1933 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1934 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1935 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1936 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1937 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1939 fjx0
= _mm_setzero_pd();
1940 fjy0
= _mm_setzero_pd();
1941 fjz0
= _mm_setzero_pd();
1942 fjx1
= _mm_setzero_pd();
1943 fjy1
= _mm_setzero_pd();
1944 fjz1
= _mm_setzero_pd();
1945 fjx2
= _mm_setzero_pd();
1946 fjy2
= _mm_setzero_pd();
1947 fjz2
= _mm_setzero_pd();
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r00
= _mm_mul_pd(rsq00
,rinv00
);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt
= _mm_mul_pd(r00
,vftabscale
);
1957 vfitab
= _mm_cvttpd_epi32(rt
);
1958 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1959 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1961 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1962 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1963 F
= _mm_setzero_pd();
1964 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1965 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1966 H
= _mm_setzero_pd();
1967 GMX_MM_TRANSPOSE2_PD(G
,H
);
1968 Heps
= _mm_mul_pd(vfeps
,H
);
1969 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1970 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1971 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1973 /* CUBIC SPLINE TABLE DISPERSION */
1974 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1975 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1976 F
= _mm_setzero_pd();
1977 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1978 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1979 H
= _mm_setzero_pd();
1980 GMX_MM_TRANSPOSE2_PD(G
,H
);
1981 Heps
= _mm_mul_pd(vfeps
,H
);
1982 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1983 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1984 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1986 /* CUBIC SPLINE TABLE REPULSION */
1987 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1988 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1989 F
= _mm_setzero_pd();
1990 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1991 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1992 H
= _mm_setzero_pd();
1993 GMX_MM_TRANSPOSE2_PD(G
,H
);
1994 Heps
= _mm_mul_pd(vfeps
,H
);
1995 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1996 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1997 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1998 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
2000 fscal
= _mm_add_pd(felec
,fvdw
);
2002 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2004 /* Calculate temporary vectorial force */
2005 tx
= _mm_mul_pd(fscal
,dx00
);
2006 ty
= _mm_mul_pd(fscal
,dy00
);
2007 tz
= _mm_mul_pd(fscal
,dz00
);
2009 /* Update vectorial force */
2010 fix0
= _mm_add_pd(fix0
,tx
);
2011 fiy0
= _mm_add_pd(fiy0
,ty
);
2012 fiz0
= _mm_add_pd(fiz0
,tz
);
2014 fjx0
= _mm_add_pd(fjx0
,tx
);
2015 fjy0
= _mm_add_pd(fjy0
,ty
);
2016 fjz0
= _mm_add_pd(fjz0
,tz
);
2018 /**************************
2019 * CALCULATE INTERACTIONS *
2020 **************************/
2022 r01
= _mm_mul_pd(rsq01
,rinv01
);
2024 /* Calculate table index by multiplying r with table scale and truncate to integer */
2025 rt
= _mm_mul_pd(r01
,vftabscale
);
2026 vfitab
= _mm_cvttpd_epi32(rt
);
2027 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2028 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2030 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2031 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2032 F
= _mm_setzero_pd();
2033 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2034 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2035 H
= _mm_setzero_pd();
2036 GMX_MM_TRANSPOSE2_PD(G
,H
);
2037 Heps
= _mm_mul_pd(vfeps
,H
);
2038 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2039 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2040 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
2044 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2046 /* Calculate temporary vectorial force */
2047 tx
= _mm_mul_pd(fscal
,dx01
);
2048 ty
= _mm_mul_pd(fscal
,dy01
);
2049 tz
= _mm_mul_pd(fscal
,dz01
);
2051 /* Update vectorial force */
2052 fix0
= _mm_add_pd(fix0
,tx
);
2053 fiy0
= _mm_add_pd(fiy0
,ty
);
2054 fiz0
= _mm_add_pd(fiz0
,tz
);
2056 fjx1
= _mm_add_pd(fjx1
,tx
);
2057 fjy1
= _mm_add_pd(fjy1
,ty
);
2058 fjz1
= _mm_add_pd(fjz1
,tz
);
2060 /**************************
2061 * CALCULATE INTERACTIONS *
2062 **************************/
2064 r02
= _mm_mul_pd(rsq02
,rinv02
);
2066 /* Calculate table index by multiplying r with table scale and truncate to integer */
2067 rt
= _mm_mul_pd(r02
,vftabscale
);
2068 vfitab
= _mm_cvttpd_epi32(rt
);
2069 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2070 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2072 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2073 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2074 F
= _mm_setzero_pd();
2075 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2076 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2077 H
= _mm_setzero_pd();
2078 GMX_MM_TRANSPOSE2_PD(G
,H
);
2079 Heps
= _mm_mul_pd(vfeps
,H
);
2080 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2081 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2082 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
2086 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2088 /* Calculate temporary vectorial force */
2089 tx
= _mm_mul_pd(fscal
,dx02
);
2090 ty
= _mm_mul_pd(fscal
,dy02
);
2091 tz
= _mm_mul_pd(fscal
,dz02
);
2093 /* Update vectorial force */
2094 fix0
= _mm_add_pd(fix0
,tx
);
2095 fiy0
= _mm_add_pd(fiy0
,ty
);
2096 fiz0
= _mm_add_pd(fiz0
,tz
);
2098 fjx2
= _mm_add_pd(fjx2
,tx
);
2099 fjy2
= _mm_add_pd(fjy2
,ty
);
2100 fjz2
= _mm_add_pd(fjz2
,tz
);
2102 /**************************
2103 * CALCULATE INTERACTIONS *
2104 **************************/
2106 r10
= _mm_mul_pd(rsq10
,rinv10
);
2108 /* Calculate table index by multiplying r with table scale and truncate to integer */
2109 rt
= _mm_mul_pd(r10
,vftabscale
);
2110 vfitab
= _mm_cvttpd_epi32(rt
);
2111 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2112 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2114 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2115 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2116 F
= _mm_setzero_pd();
2117 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2118 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2119 H
= _mm_setzero_pd();
2120 GMX_MM_TRANSPOSE2_PD(G
,H
);
2121 Heps
= _mm_mul_pd(vfeps
,H
);
2122 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2123 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2124 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
2128 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2130 /* Calculate temporary vectorial force */
2131 tx
= _mm_mul_pd(fscal
,dx10
);
2132 ty
= _mm_mul_pd(fscal
,dy10
);
2133 tz
= _mm_mul_pd(fscal
,dz10
);
2135 /* Update vectorial force */
2136 fix1
= _mm_add_pd(fix1
,tx
);
2137 fiy1
= _mm_add_pd(fiy1
,ty
);
2138 fiz1
= _mm_add_pd(fiz1
,tz
);
2140 fjx0
= _mm_add_pd(fjx0
,tx
);
2141 fjy0
= _mm_add_pd(fjy0
,ty
);
2142 fjz0
= _mm_add_pd(fjz0
,tz
);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 r11
= _mm_mul_pd(rsq11
,rinv11
);
2150 /* Calculate table index by multiplying r with table scale and truncate to integer */
2151 rt
= _mm_mul_pd(r11
,vftabscale
);
2152 vfitab
= _mm_cvttpd_epi32(rt
);
2153 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2154 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2156 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2157 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2158 F
= _mm_setzero_pd();
2159 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2160 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2161 H
= _mm_setzero_pd();
2162 GMX_MM_TRANSPOSE2_PD(G
,H
);
2163 Heps
= _mm_mul_pd(vfeps
,H
);
2164 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2165 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2166 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
2170 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2172 /* Calculate temporary vectorial force */
2173 tx
= _mm_mul_pd(fscal
,dx11
);
2174 ty
= _mm_mul_pd(fscal
,dy11
);
2175 tz
= _mm_mul_pd(fscal
,dz11
);
2177 /* Update vectorial force */
2178 fix1
= _mm_add_pd(fix1
,tx
);
2179 fiy1
= _mm_add_pd(fiy1
,ty
);
2180 fiz1
= _mm_add_pd(fiz1
,tz
);
2182 fjx1
= _mm_add_pd(fjx1
,tx
);
2183 fjy1
= _mm_add_pd(fjy1
,ty
);
2184 fjz1
= _mm_add_pd(fjz1
,tz
);
2186 /**************************
2187 * CALCULATE INTERACTIONS *
2188 **************************/
2190 r12
= _mm_mul_pd(rsq12
,rinv12
);
2192 /* Calculate table index by multiplying r with table scale and truncate to integer */
2193 rt
= _mm_mul_pd(r12
,vftabscale
);
2194 vfitab
= _mm_cvttpd_epi32(rt
);
2195 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2196 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2198 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2199 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2200 F
= _mm_setzero_pd();
2201 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2202 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2203 H
= _mm_setzero_pd();
2204 GMX_MM_TRANSPOSE2_PD(G
,H
);
2205 Heps
= _mm_mul_pd(vfeps
,H
);
2206 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2207 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2208 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
2212 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2214 /* Calculate temporary vectorial force */
2215 tx
= _mm_mul_pd(fscal
,dx12
);
2216 ty
= _mm_mul_pd(fscal
,dy12
);
2217 tz
= _mm_mul_pd(fscal
,dz12
);
2219 /* Update vectorial force */
2220 fix1
= _mm_add_pd(fix1
,tx
);
2221 fiy1
= _mm_add_pd(fiy1
,ty
);
2222 fiz1
= _mm_add_pd(fiz1
,tz
);
2224 fjx2
= _mm_add_pd(fjx2
,tx
);
2225 fjy2
= _mm_add_pd(fjy2
,ty
);
2226 fjz2
= _mm_add_pd(fjz2
,tz
);
2228 /**************************
2229 * CALCULATE INTERACTIONS *
2230 **************************/
2232 r20
= _mm_mul_pd(rsq20
,rinv20
);
2234 /* Calculate table index by multiplying r with table scale and truncate to integer */
2235 rt
= _mm_mul_pd(r20
,vftabscale
);
2236 vfitab
= _mm_cvttpd_epi32(rt
);
2237 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2238 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2240 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2241 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2242 F
= _mm_setzero_pd();
2243 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2244 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2245 H
= _mm_setzero_pd();
2246 GMX_MM_TRANSPOSE2_PD(G
,H
);
2247 Heps
= _mm_mul_pd(vfeps
,H
);
2248 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2249 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2250 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
2254 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2256 /* Calculate temporary vectorial force */
2257 tx
= _mm_mul_pd(fscal
,dx20
);
2258 ty
= _mm_mul_pd(fscal
,dy20
);
2259 tz
= _mm_mul_pd(fscal
,dz20
);
2261 /* Update vectorial force */
2262 fix2
= _mm_add_pd(fix2
,tx
);
2263 fiy2
= _mm_add_pd(fiy2
,ty
);
2264 fiz2
= _mm_add_pd(fiz2
,tz
);
2266 fjx0
= _mm_add_pd(fjx0
,tx
);
2267 fjy0
= _mm_add_pd(fjy0
,ty
);
2268 fjz0
= _mm_add_pd(fjz0
,tz
);
2270 /**************************
2271 * CALCULATE INTERACTIONS *
2272 **************************/
2274 r21
= _mm_mul_pd(rsq21
,rinv21
);
2276 /* Calculate table index by multiplying r with table scale and truncate to integer */
2277 rt
= _mm_mul_pd(r21
,vftabscale
);
2278 vfitab
= _mm_cvttpd_epi32(rt
);
2279 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2280 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2282 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2283 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2284 F
= _mm_setzero_pd();
2285 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2286 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2287 H
= _mm_setzero_pd();
2288 GMX_MM_TRANSPOSE2_PD(G
,H
);
2289 Heps
= _mm_mul_pd(vfeps
,H
);
2290 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2291 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2292 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
2296 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2298 /* Calculate temporary vectorial force */
2299 tx
= _mm_mul_pd(fscal
,dx21
);
2300 ty
= _mm_mul_pd(fscal
,dy21
);
2301 tz
= _mm_mul_pd(fscal
,dz21
);
2303 /* Update vectorial force */
2304 fix2
= _mm_add_pd(fix2
,tx
);
2305 fiy2
= _mm_add_pd(fiy2
,ty
);
2306 fiz2
= _mm_add_pd(fiz2
,tz
);
2308 fjx1
= _mm_add_pd(fjx1
,tx
);
2309 fjy1
= _mm_add_pd(fjy1
,ty
);
2310 fjz1
= _mm_add_pd(fjz1
,tz
);
2312 /**************************
2313 * CALCULATE INTERACTIONS *
2314 **************************/
2316 r22
= _mm_mul_pd(rsq22
,rinv22
);
2318 /* Calculate table index by multiplying r with table scale and truncate to integer */
2319 rt
= _mm_mul_pd(r22
,vftabscale
);
2320 vfitab
= _mm_cvttpd_epi32(rt
);
2321 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2322 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2324 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2325 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2326 F
= _mm_setzero_pd();
2327 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2328 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2329 H
= _mm_setzero_pd();
2330 GMX_MM_TRANSPOSE2_PD(G
,H
);
2331 Heps
= _mm_mul_pd(vfeps
,H
);
2332 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2333 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2334 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
2338 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2340 /* Calculate temporary vectorial force */
2341 tx
= _mm_mul_pd(fscal
,dx22
);
2342 ty
= _mm_mul_pd(fscal
,dy22
);
2343 tz
= _mm_mul_pd(fscal
,dz22
);
2345 /* Update vectorial force */
2346 fix2
= _mm_add_pd(fix2
,tx
);
2347 fiy2
= _mm_add_pd(fiy2
,ty
);
2348 fiz2
= _mm_add_pd(fiz2
,tz
);
2350 fjx2
= _mm_add_pd(fjx2
,tx
);
2351 fjy2
= _mm_add_pd(fjy2
,ty
);
2352 fjz2
= _mm_add_pd(fjz2
,tz
);
2354 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2356 /* Inner loop uses 373 flops */
2359 /* End of innermost loop */
2361 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2362 f
+i_coord_offset
,fshift
+i_shift_offset
);
2364 /* Increment number of inner iterations */
2365 inneriter
+= j_index_end
- j_index_start
;
2367 /* Outer loop uses 18 flops */
2370 /* Increment number of outer iterations */
2373 /* Update outer/inner flops */
2375 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*373);