2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: CubicSplineTable
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
71 * jnr indices corresponding to data put in the two positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 int vdwjidx1A
,vdwjidx1B
;
90 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
91 int vdwjidx2A
,vdwjidx2B
;
92 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
93 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
94 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
95 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
96 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
97 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
98 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
99 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
100 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
101 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
102 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
105 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
108 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
109 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
111 __m128i ifour
= _mm_set1_epi32(4);
112 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
114 __m128d dummy_mask
,cutoff_mask
;
115 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116 __m128d one
= _mm_set1_pd(1.0);
117 __m128d two
= _mm_set1_pd(2.0);
123 jindex
= nlist
->jindex
;
125 shiftidx
= nlist
->shift
;
127 shiftvec
= fr
->shift_vec
[0];
128 fshift
= fr
->fshift
[0];
129 facel
= _mm_set1_pd(fr
->epsfac
);
130 charge
= mdatoms
->chargeA
;
131 nvdwtype
= fr
->ntype
;
133 vdwtype
= mdatoms
->typeA
;
135 vftab
= kernel_data
->table_elec_vdw
->data
;
136 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
138 /* Setup water-specific parameters */
139 inr
= nlist
->iinr
[0];
140 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
141 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
142 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
143 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
145 jq0
= _mm_set1_pd(charge
[inr
+0]);
146 jq1
= _mm_set1_pd(charge
[inr
+1]);
147 jq2
= _mm_set1_pd(charge
[inr
+2]);
148 vdwjidx0A
= 2*vdwtype
[inr
+0];
149 qq00
= _mm_mul_pd(iq0
,jq0
);
150 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
151 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
152 qq01
= _mm_mul_pd(iq0
,jq1
);
153 qq02
= _mm_mul_pd(iq0
,jq2
);
154 qq10
= _mm_mul_pd(iq1
,jq0
);
155 qq11
= _mm_mul_pd(iq1
,jq1
);
156 qq12
= _mm_mul_pd(iq1
,jq2
);
157 qq20
= _mm_mul_pd(iq2
,jq0
);
158 qq21
= _mm_mul_pd(iq2
,jq1
);
159 qq22
= _mm_mul_pd(iq2
,jq2
);
161 /* Avoid stupid compiler warnings */
169 /* Start outer loop over neighborlists */
170 for(iidx
=0; iidx
<nri
; iidx
++)
172 /* Load shift vector for this list */
173 i_shift_offset
= DIM
*shiftidx
[iidx
];
175 /* Load limits for loop over neighbors */
176 j_index_start
= jindex
[iidx
];
177 j_index_end
= jindex
[iidx
+1];
179 /* Get outer coordinate index */
181 i_coord_offset
= DIM
*inr
;
183 /* Load i particle coords and add shift vector */
184 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
185 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
187 fix0
= _mm_setzero_pd();
188 fiy0
= _mm_setzero_pd();
189 fiz0
= _mm_setzero_pd();
190 fix1
= _mm_setzero_pd();
191 fiy1
= _mm_setzero_pd();
192 fiz1
= _mm_setzero_pd();
193 fix2
= _mm_setzero_pd();
194 fiy2
= _mm_setzero_pd();
195 fiz2
= _mm_setzero_pd();
197 /* Reset potential sums */
198 velecsum
= _mm_setzero_pd();
199 vvdwsum
= _mm_setzero_pd();
201 /* Start inner kernel loop */
202 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
205 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA
= DIM
*jnrA
;
209 j_coord_offsetB
= DIM
*jnrB
;
211 /* load j atom coordinates */
212 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
213 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
215 /* Calculate displacement vector */
216 dx00
= _mm_sub_pd(ix0
,jx0
);
217 dy00
= _mm_sub_pd(iy0
,jy0
);
218 dz00
= _mm_sub_pd(iz0
,jz0
);
219 dx01
= _mm_sub_pd(ix0
,jx1
);
220 dy01
= _mm_sub_pd(iy0
,jy1
);
221 dz01
= _mm_sub_pd(iz0
,jz1
);
222 dx02
= _mm_sub_pd(ix0
,jx2
);
223 dy02
= _mm_sub_pd(iy0
,jy2
);
224 dz02
= _mm_sub_pd(iz0
,jz2
);
225 dx10
= _mm_sub_pd(ix1
,jx0
);
226 dy10
= _mm_sub_pd(iy1
,jy0
);
227 dz10
= _mm_sub_pd(iz1
,jz0
);
228 dx11
= _mm_sub_pd(ix1
,jx1
);
229 dy11
= _mm_sub_pd(iy1
,jy1
);
230 dz11
= _mm_sub_pd(iz1
,jz1
);
231 dx12
= _mm_sub_pd(ix1
,jx2
);
232 dy12
= _mm_sub_pd(iy1
,jy2
);
233 dz12
= _mm_sub_pd(iz1
,jz2
);
234 dx20
= _mm_sub_pd(ix2
,jx0
);
235 dy20
= _mm_sub_pd(iy2
,jy0
);
236 dz20
= _mm_sub_pd(iz2
,jz0
);
237 dx21
= _mm_sub_pd(ix2
,jx1
);
238 dy21
= _mm_sub_pd(iy2
,jy1
);
239 dz21
= _mm_sub_pd(iz2
,jz1
);
240 dx22
= _mm_sub_pd(ix2
,jx2
);
241 dy22
= _mm_sub_pd(iy2
,jy2
);
242 dz22
= _mm_sub_pd(iz2
,jz2
);
244 /* Calculate squared distance and things based on it */
245 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
246 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
247 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
248 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
249 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
250 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
251 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
252 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
253 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
255 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
256 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
257 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
258 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
259 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
260 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
261 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
262 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
263 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
265 fjx0
= _mm_setzero_pd();
266 fjy0
= _mm_setzero_pd();
267 fjz0
= _mm_setzero_pd();
268 fjx1
= _mm_setzero_pd();
269 fjy1
= _mm_setzero_pd();
270 fjz1
= _mm_setzero_pd();
271 fjx2
= _mm_setzero_pd();
272 fjy2
= _mm_setzero_pd();
273 fjz2
= _mm_setzero_pd();
275 /**************************
276 * CALCULATE INTERACTIONS *
277 **************************/
279 r00
= _mm_mul_pd(rsq00
,rinv00
);
281 /* Calculate table index by multiplying r with table scale and truncate to integer */
282 rt
= _mm_mul_pd(r00
,vftabscale
);
283 vfitab
= _mm_cvttpd_epi32(rt
);
284 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
285 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
287 /* CUBIC SPLINE TABLE ELECTROSTATICS */
288 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
289 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
290 GMX_MM_TRANSPOSE2_PD(Y
,F
);
291 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
292 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
293 GMX_MM_TRANSPOSE2_PD(G
,H
);
294 Heps
= _mm_mul_pd(vfeps
,H
);
295 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
296 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
297 velec
= _mm_mul_pd(qq00
,VV
);
298 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
299 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
301 /* CUBIC SPLINE TABLE DISPERSION */
302 vfitab
= _mm_add_epi32(vfitab
,ifour
);
303 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
304 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
305 GMX_MM_TRANSPOSE2_PD(Y
,F
);
306 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
307 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
308 GMX_MM_TRANSPOSE2_PD(G
,H
);
309 Heps
= _mm_mul_pd(vfeps
,H
);
310 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
311 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
312 vvdw6
= _mm_mul_pd(c6_00
,VV
);
313 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
314 fvdw6
= _mm_mul_pd(c6_00
,FF
);
316 /* CUBIC SPLINE TABLE REPULSION */
317 vfitab
= _mm_add_epi32(vfitab
,ifour
);
318 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
319 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
320 GMX_MM_TRANSPOSE2_PD(Y
,F
);
321 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
322 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
323 GMX_MM_TRANSPOSE2_PD(G
,H
);
324 Heps
= _mm_mul_pd(vfeps
,H
);
325 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
326 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
327 vvdw12
= _mm_mul_pd(c12_00
,VV
);
328 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
329 fvdw12
= _mm_mul_pd(c12_00
,FF
);
330 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
331 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum
= _mm_add_pd(velecsum
,velec
);
335 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
337 fscal
= _mm_add_pd(felec
,fvdw
);
339 /* Calculate temporary vectorial force */
340 tx
= _mm_mul_pd(fscal
,dx00
);
341 ty
= _mm_mul_pd(fscal
,dy00
);
342 tz
= _mm_mul_pd(fscal
,dz00
);
344 /* Update vectorial force */
345 fix0
= _mm_add_pd(fix0
,tx
);
346 fiy0
= _mm_add_pd(fiy0
,ty
);
347 fiz0
= _mm_add_pd(fiz0
,tz
);
349 fjx0
= _mm_add_pd(fjx0
,tx
);
350 fjy0
= _mm_add_pd(fjy0
,ty
);
351 fjz0
= _mm_add_pd(fjz0
,tz
);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 r01
= _mm_mul_pd(rsq01
,rinv01
);
359 /* Calculate table index by multiplying r with table scale and truncate to integer */
360 rt
= _mm_mul_pd(r01
,vftabscale
);
361 vfitab
= _mm_cvttpd_epi32(rt
);
362 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
363 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
365 /* CUBIC SPLINE TABLE ELECTROSTATICS */
366 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
367 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
368 GMX_MM_TRANSPOSE2_PD(Y
,F
);
369 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
370 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
371 GMX_MM_TRANSPOSE2_PD(G
,H
);
372 Heps
= _mm_mul_pd(vfeps
,H
);
373 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
374 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
375 velec
= _mm_mul_pd(qq01
,VV
);
376 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
377 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum
= _mm_add_pd(velecsum
,velec
);
384 /* Calculate temporary vectorial force */
385 tx
= _mm_mul_pd(fscal
,dx01
);
386 ty
= _mm_mul_pd(fscal
,dy01
);
387 tz
= _mm_mul_pd(fscal
,dz01
);
389 /* Update vectorial force */
390 fix0
= _mm_add_pd(fix0
,tx
);
391 fiy0
= _mm_add_pd(fiy0
,ty
);
392 fiz0
= _mm_add_pd(fiz0
,tz
);
394 fjx1
= _mm_add_pd(fjx1
,tx
);
395 fjy1
= _mm_add_pd(fjy1
,ty
);
396 fjz1
= _mm_add_pd(fjz1
,tz
);
398 /**************************
399 * CALCULATE INTERACTIONS *
400 **************************/
402 r02
= _mm_mul_pd(rsq02
,rinv02
);
404 /* Calculate table index by multiplying r with table scale and truncate to integer */
405 rt
= _mm_mul_pd(r02
,vftabscale
);
406 vfitab
= _mm_cvttpd_epi32(rt
);
407 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
408 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
410 /* CUBIC SPLINE TABLE ELECTROSTATICS */
411 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
412 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
413 GMX_MM_TRANSPOSE2_PD(Y
,F
);
414 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
415 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
416 GMX_MM_TRANSPOSE2_PD(G
,H
);
417 Heps
= _mm_mul_pd(vfeps
,H
);
418 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
419 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
420 velec
= _mm_mul_pd(qq02
,VV
);
421 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
422 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
424 /* Update potential sum for this i atom from the interaction with this j atom. */
425 velecsum
= _mm_add_pd(velecsum
,velec
);
429 /* Calculate temporary vectorial force */
430 tx
= _mm_mul_pd(fscal
,dx02
);
431 ty
= _mm_mul_pd(fscal
,dy02
);
432 tz
= _mm_mul_pd(fscal
,dz02
);
434 /* Update vectorial force */
435 fix0
= _mm_add_pd(fix0
,tx
);
436 fiy0
= _mm_add_pd(fiy0
,ty
);
437 fiz0
= _mm_add_pd(fiz0
,tz
);
439 fjx2
= _mm_add_pd(fjx2
,tx
);
440 fjy2
= _mm_add_pd(fjy2
,ty
);
441 fjz2
= _mm_add_pd(fjz2
,tz
);
443 /**************************
444 * CALCULATE INTERACTIONS *
445 **************************/
447 r10
= _mm_mul_pd(rsq10
,rinv10
);
449 /* Calculate table index by multiplying r with table scale and truncate to integer */
450 rt
= _mm_mul_pd(r10
,vftabscale
);
451 vfitab
= _mm_cvttpd_epi32(rt
);
452 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
453 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
455 /* CUBIC SPLINE TABLE ELECTROSTATICS */
456 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
457 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
458 GMX_MM_TRANSPOSE2_PD(Y
,F
);
459 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
460 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
461 GMX_MM_TRANSPOSE2_PD(G
,H
);
462 Heps
= _mm_mul_pd(vfeps
,H
);
463 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
464 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
465 velec
= _mm_mul_pd(qq10
,VV
);
466 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
467 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
469 /* Update potential sum for this i atom from the interaction with this j atom. */
470 velecsum
= _mm_add_pd(velecsum
,velec
);
474 /* Calculate temporary vectorial force */
475 tx
= _mm_mul_pd(fscal
,dx10
);
476 ty
= _mm_mul_pd(fscal
,dy10
);
477 tz
= _mm_mul_pd(fscal
,dz10
);
479 /* Update vectorial force */
480 fix1
= _mm_add_pd(fix1
,tx
);
481 fiy1
= _mm_add_pd(fiy1
,ty
);
482 fiz1
= _mm_add_pd(fiz1
,tz
);
484 fjx0
= _mm_add_pd(fjx0
,tx
);
485 fjy0
= _mm_add_pd(fjy0
,ty
);
486 fjz0
= _mm_add_pd(fjz0
,tz
);
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 r11
= _mm_mul_pd(rsq11
,rinv11
);
494 /* Calculate table index by multiplying r with table scale and truncate to integer */
495 rt
= _mm_mul_pd(r11
,vftabscale
);
496 vfitab
= _mm_cvttpd_epi32(rt
);
497 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
498 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
500 /* CUBIC SPLINE TABLE ELECTROSTATICS */
501 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
502 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
503 GMX_MM_TRANSPOSE2_PD(Y
,F
);
504 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
505 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
506 GMX_MM_TRANSPOSE2_PD(G
,H
);
507 Heps
= _mm_mul_pd(vfeps
,H
);
508 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
509 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
510 velec
= _mm_mul_pd(qq11
,VV
);
511 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
512 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velecsum
= _mm_add_pd(velecsum
,velec
);
519 /* Calculate temporary vectorial force */
520 tx
= _mm_mul_pd(fscal
,dx11
);
521 ty
= _mm_mul_pd(fscal
,dy11
);
522 tz
= _mm_mul_pd(fscal
,dz11
);
524 /* Update vectorial force */
525 fix1
= _mm_add_pd(fix1
,tx
);
526 fiy1
= _mm_add_pd(fiy1
,ty
);
527 fiz1
= _mm_add_pd(fiz1
,tz
);
529 fjx1
= _mm_add_pd(fjx1
,tx
);
530 fjy1
= _mm_add_pd(fjy1
,ty
);
531 fjz1
= _mm_add_pd(fjz1
,tz
);
533 /**************************
534 * CALCULATE INTERACTIONS *
535 **************************/
537 r12
= _mm_mul_pd(rsq12
,rinv12
);
539 /* Calculate table index by multiplying r with table scale and truncate to integer */
540 rt
= _mm_mul_pd(r12
,vftabscale
);
541 vfitab
= _mm_cvttpd_epi32(rt
);
542 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
543 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
545 /* CUBIC SPLINE TABLE ELECTROSTATICS */
546 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
547 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
548 GMX_MM_TRANSPOSE2_PD(Y
,F
);
549 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
550 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
551 GMX_MM_TRANSPOSE2_PD(G
,H
);
552 Heps
= _mm_mul_pd(vfeps
,H
);
553 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
554 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
555 velec
= _mm_mul_pd(qq12
,VV
);
556 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
557 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velecsum
= _mm_add_pd(velecsum
,velec
);
564 /* Calculate temporary vectorial force */
565 tx
= _mm_mul_pd(fscal
,dx12
);
566 ty
= _mm_mul_pd(fscal
,dy12
);
567 tz
= _mm_mul_pd(fscal
,dz12
);
569 /* Update vectorial force */
570 fix1
= _mm_add_pd(fix1
,tx
);
571 fiy1
= _mm_add_pd(fiy1
,ty
);
572 fiz1
= _mm_add_pd(fiz1
,tz
);
574 fjx2
= _mm_add_pd(fjx2
,tx
);
575 fjy2
= _mm_add_pd(fjy2
,ty
);
576 fjz2
= _mm_add_pd(fjz2
,tz
);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 r20
= _mm_mul_pd(rsq20
,rinv20
);
584 /* Calculate table index by multiplying r with table scale and truncate to integer */
585 rt
= _mm_mul_pd(r20
,vftabscale
);
586 vfitab
= _mm_cvttpd_epi32(rt
);
587 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
588 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
590 /* CUBIC SPLINE TABLE ELECTROSTATICS */
591 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
592 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
593 GMX_MM_TRANSPOSE2_PD(Y
,F
);
594 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
595 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
596 GMX_MM_TRANSPOSE2_PD(G
,H
);
597 Heps
= _mm_mul_pd(vfeps
,H
);
598 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
599 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
600 velec
= _mm_mul_pd(qq20
,VV
);
601 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
602 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
604 /* Update potential sum for this i atom from the interaction with this j atom. */
605 velecsum
= _mm_add_pd(velecsum
,velec
);
609 /* Calculate temporary vectorial force */
610 tx
= _mm_mul_pd(fscal
,dx20
);
611 ty
= _mm_mul_pd(fscal
,dy20
);
612 tz
= _mm_mul_pd(fscal
,dz20
);
614 /* Update vectorial force */
615 fix2
= _mm_add_pd(fix2
,tx
);
616 fiy2
= _mm_add_pd(fiy2
,ty
);
617 fiz2
= _mm_add_pd(fiz2
,tz
);
619 fjx0
= _mm_add_pd(fjx0
,tx
);
620 fjy0
= _mm_add_pd(fjy0
,ty
);
621 fjz0
= _mm_add_pd(fjz0
,tz
);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 r21
= _mm_mul_pd(rsq21
,rinv21
);
629 /* Calculate table index by multiplying r with table scale and truncate to integer */
630 rt
= _mm_mul_pd(r21
,vftabscale
);
631 vfitab
= _mm_cvttpd_epi32(rt
);
632 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
633 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
635 /* CUBIC SPLINE TABLE ELECTROSTATICS */
636 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
637 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
638 GMX_MM_TRANSPOSE2_PD(Y
,F
);
639 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
640 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
641 GMX_MM_TRANSPOSE2_PD(G
,H
);
642 Heps
= _mm_mul_pd(vfeps
,H
);
643 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
644 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
645 velec
= _mm_mul_pd(qq21
,VV
);
646 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
647 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velecsum
= _mm_add_pd(velecsum
,velec
);
654 /* Calculate temporary vectorial force */
655 tx
= _mm_mul_pd(fscal
,dx21
);
656 ty
= _mm_mul_pd(fscal
,dy21
);
657 tz
= _mm_mul_pd(fscal
,dz21
);
659 /* Update vectorial force */
660 fix2
= _mm_add_pd(fix2
,tx
);
661 fiy2
= _mm_add_pd(fiy2
,ty
);
662 fiz2
= _mm_add_pd(fiz2
,tz
);
664 fjx1
= _mm_add_pd(fjx1
,tx
);
665 fjy1
= _mm_add_pd(fjy1
,ty
);
666 fjz1
= _mm_add_pd(fjz1
,tz
);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 r22
= _mm_mul_pd(rsq22
,rinv22
);
674 /* Calculate table index by multiplying r with table scale and truncate to integer */
675 rt
= _mm_mul_pd(r22
,vftabscale
);
676 vfitab
= _mm_cvttpd_epi32(rt
);
677 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
678 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
680 /* CUBIC SPLINE TABLE ELECTROSTATICS */
681 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
682 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
683 GMX_MM_TRANSPOSE2_PD(Y
,F
);
684 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
685 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
686 GMX_MM_TRANSPOSE2_PD(G
,H
);
687 Heps
= _mm_mul_pd(vfeps
,H
);
688 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
689 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
690 velec
= _mm_mul_pd(qq22
,VV
);
691 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
692 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum
= _mm_add_pd(velecsum
,velec
);
699 /* Calculate temporary vectorial force */
700 tx
= _mm_mul_pd(fscal
,dx22
);
701 ty
= _mm_mul_pd(fscal
,dy22
);
702 tz
= _mm_mul_pd(fscal
,dz22
);
704 /* Update vectorial force */
705 fix2
= _mm_add_pd(fix2
,tx
);
706 fiy2
= _mm_add_pd(fiy2
,ty
);
707 fiz2
= _mm_add_pd(fiz2
,tz
);
709 fjx2
= _mm_add_pd(fjx2
,tx
);
710 fjy2
= _mm_add_pd(fjy2
,ty
);
711 fjz2
= _mm_add_pd(fjz2
,tz
);
713 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
715 /* Inner loop uses 417 flops */
722 j_coord_offsetA
= DIM
*jnrA
;
724 /* load j atom coordinates */
725 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
726 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
728 /* Calculate displacement vector */
729 dx00
= _mm_sub_pd(ix0
,jx0
);
730 dy00
= _mm_sub_pd(iy0
,jy0
);
731 dz00
= _mm_sub_pd(iz0
,jz0
);
732 dx01
= _mm_sub_pd(ix0
,jx1
);
733 dy01
= _mm_sub_pd(iy0
,jy1
);
734 dz01
= _mm_sub_pd(iz0
,jz1
);
735 dx02
= _mm_sub_pd(ix0
,jx2
);
736 dy02
= _mm_sub_pd(iy0
,jy2
);
737 dz02
= _mm_sub_pd(iz0
,jz2
);
738 dx10
= _mm_sub_pd(ix1
,jx0
);
739 dy10
= _mm_sub_pd(iy1
,jy0
);
740 dz10
= _mm_sub_pd(iz1
,jz0
);
741 dx11
= _mm_sub_pd(ix1
,jx1
);
742 dy11
= _mm_sub_pd(iy1
,jy1
);
743 dz11
= _mm_sub_pd(iz1
,jz1
);
744 dx12
= _mm_sub_pd(ix1
,jx2
);
745 dy12
= _mm_sub_pd(iy1
,jy2
);
746 dz12
= _mm_sub_pd(iz1
,jz2
);
747 dx20
= _mm_sub_pd(ix2
,jx0
);
748 dy20
= _mm_sub_pd(iy2
,jy0
);
749 dz20
= _mm_sub_pd(iz2
,jz0
);
750 dx21
= _mm_sub_pd(ix2
,jx1
);
751 dy21
= _mm_sub_pd(iy2
,jy1
);
752 dz21
= _mm_sub_pd(iz2
,jz1
);
753 dx22
= _mm_sub_pd(ix2
,jx2
);
754 dy22
= _mm_sub_pd(iy2
,jy2
);
755 dz22
= _mm_sub_pd(iz2
,jz2
);
757 /* Calculate squared distance and things based on it */
758 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
759 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
760 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
761 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
762 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
763 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
764 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
765 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
766 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
768 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
769 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
770 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
771 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
772 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
773 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
774 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
775 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
776 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
778 fjx0
= _mm_setzero_pd();
779 fjy0
= _mm_setzero_pd();
780 fjz0
= _mm_setzero_pd();
781 fjx1
= _mm_setzero_pd();
782 fjy1
= _mm_setzero_pd();
783 fjz1
= _mm_setzero_pd();
784 fjx2
= _mm_setzero_pd();
785 fjy2
= _mm_setzero_pd();
786 fjz2
= _mm_setzero_pd();
788 /**************************
789 * CALCULATE INTERACTIONS *
790 **************************/
792 r00
= _mm_mul_pd(rsq00
,rinv00
);
794 /* Calculate table index by multiplying r with table scale and truncate to integer */
795 rt
= _mm_mul_pd(r00
,vftabscale
);
796 vfitab
= _mm_cvttpd_epi32(rt
);
797 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
798 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
800 /* CUBIC SPLINE TABLE ELECTROSTATICS */
801 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
802 F
= _mm_setzero_pd();
803 GMX_MM_TRANSPOSE2_PD(Y
,F
);
804 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
805 H
= _mm_setzero_pd();
806 GMX_MM_TRANSPOSE2_PD(G
,H
);
807 Heps
= _mm_mul_pd(vfeps
,H
);
808 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
809 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
810 velec
= _mm_mul_pd(qq00
,VV
);
811 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
812 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
814 /* CUBIC SPLINE TABLE DISPERSION */
815 vfitab
= _mm_add_epi32(vfitab
,ifour
);
816 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
817 F
= _mm_setzero_pd();
818 GMX_MM_TRANSPOSE2_PD(Y
,F
);
819 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
820 H
= _mm_setzero_pd();
821 GMX_MM_TRANSPOSE2_PD(G
,H
);
822 Heps
= _mm_mul_pd(vfeps
,H
);
823 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
824 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
825 vvdw6
= _mm_mul_pd(c6_00
,VV
);
826 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
827 fvdw6
= _mm_mul_pd(c6_00
,FF
);
829 /* CUBIC SPLINE TABLE REPULSION */
830 vfitab
= _mm_add_epi32(vfitab
,ifour
);
831 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
832 F
= _mm_setzero_pd();
833 GMX_MM_TRANSPOSE2_PD(Y
,F
);
834 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
835 H
= _mm_setzero_pd();
836 GMX_MM_TRANSPOSE2_PD(G
,H
);
837 Heps
= _mm_mul_pd(vfeps
,H
);
838 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
839 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
840 vvdw12
= _mm_mul_pd(c12_00
,VV
);
841 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
842 fvdw12
= _mm_mul_pd(c12_00
,FF
);
843 vvdw
= _mm_add_pd(vvdw12
,vvdw6
);
844 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
846 /* Update potential sum for this i atom from the interaction with this j atom. */
847 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
848 velecsum
= _mm_add_pd(velecsum
,velec
);
849 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
850 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
852 fscal
= _mm_add_pd(felec
,fvdw
);
854 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
856 /* Calculate temporary vectorial force */
857 tx
= _mm_mul_pd(fscal
,dx00
);
858 ty
= _mm_mul_pd(fscal
,dy00
);
859 tz
= _mm_mul_pd(fscal
,dz00
);
861 /* Update vectorial force */
862 fix0
= _mm_add_pd(fix0
,tx
);
863 fiy0
= _mm_add_pd(fiy0
,ty
);
864 fiz0
= _mm_add_pd(fiz0
,tz
);
866 fjx0
= _mm_add_pd(fjx0
,tx
);
867 fjy0
= _mm_add_pd(fjy0
,ty
);
868 fjz0
= _mm_add_pd(fjz0
,tz
);
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r01
= _mm_mul_pd(rsq01
,rinv01
);
876 /* Calculate table index by multiplying r with table scale and truncate to integer */
877 rt
= _mm_mul_pd(r01
,vftabscale
);
878 vfitab
= _mm_cvttpd_epi32(rt
);
879 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
880 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
883 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
884 F
= _mm_setzero_pd();
885 GMX_MM_TRANSPOSE2_PD(Y
,F
);
886 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
887 H
= _mm_setzero_pd();
888 GMX_MM_TRANSPOSE2_PD(G
,H
);
889 Heps
= _mm_mul_pd(vfeps
,H
);
890 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
891 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
892 velec
= _mm_mul_pd(qq01
,VV
);
893 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
894 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
896 /* Update potential sum for this i atom from the interaction with this j atom. */
897 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
898 velecsum
= _mm_add_pd(velecsum
,velec
);
902 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
904 /* Calculate temporary vectorial force */
905 tx
= _mm_mul_pd(fscal
,dx01
);
906 ty
= _mm_mul_pd(fscal
,dy01
);
907 tz
= _mm_mul_pd(fscal
,dz01
);
909 /* Update vectorial force */
910 fix0
= _mm_add_pd(fix0
,tx
);
911 fiy0
= _mm_add_pd(fiy0
,ty
);
912 fiz0
= _mm_add_pd(fiz0
,tz
);
914 fjx1
= _mm_add_pd(fjx1
,tx
);
915 fjy1
= _mm_add_pd(fjy1
,ty
);
916 fjz1
= _mm_add_pd(fjz1
,tz
);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 r02
= _mm_mul_pd(rsq02
,rinv02
);
924 /* Calculate table index by multiplying r with table scale and truncate to integer */
925 rt
= _mm_mul_pd(r02
,vftabscale
);
926 vfitab
= _mm_cvttpd_epi32(rt
);
927 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
928 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
930 /* CUBIC SPLINE TABLE ELECTROSTATICS */
931 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
932 F
= _mm_setzero_pd();
933 GMX_MM_TRANSPOSE2_PD(Y
,F
);
934 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
935 H
= _mm_setzero_pd();
936 GMX_MM_TRANSPOSE2_PD(G
,H
);
937 Heps
= _mm_mul_pd(vfeps
,H
);
938 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
939 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
940 velec
= _mm_mul_pd(qq02
,VV
);
941 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
942 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
944 /* Update potential sum for this i atom from the interaction with this j atom. */
945 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
946 velecsum
= _mm_add_pd(velecsum
,velec
);
950 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
952 /* Calculate temporary vectorial force */
953 tx
= _mm_mul_pd(fscal
,dx02
);
954 ty
= _mm_mul_pd(fscal
,dy02
);
955 tz
= _mm_mul_pd(fscal
,dz02
);
957 /* Update vectorial force */
958 fix0
= _mm_add_pd(fix0
,tx
);
959 fiy0
= _mm_add_pd(fiy0
,ty
);
960 fiz0
= _mm_add_pd(fiz0
,tz
);
962 fjx2
= _mm_add_pd(fjx2
,tx
);
963 fjy2
= _mm_add_pd(fjy2
,ty
);
964 fjz2
= _mm_add_pd(fjz2
,tz
);
966 /**************************
967 * CALCULATE INTERACTIONS *
968 **************************/
970 r10
= _mm_mul_pd(rsq10
,rinv10
);
972 /* Calculate table index by multiplying r with table scale and truncate to integer */
973 rt
= _mm_mul_pd(r10
,vftabscale
);
974 vfitab
= _mm_cvttpd_epi32(rt
);
975 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
976 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
979 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
980 F
= _mm_setzero_pd();
981 GMX_MM_TRANSPOSE2_PD(Y
,F
);
982 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
983 H
= _mm_setzero_pd();
984 GMX_MM_TRANSPOSE2_PD(G
,H
);
985 Heps
= _mm_mul_pd(vfeps
,H
);
986 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
987 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
988 velec
= _mm_mul_pd(qq10
,VV
);
989 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
990 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
994 velecsum
= _mm_add_pd(velecsum
,velec
);
998 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1000 /* Calculate temporary vectorial force */
1001 tx
= _mm_mul_pd(fscal
,dx10
);
1002 ty
= _mm_mul_pd(fscal
,dy10
);
1003 tz
= _mm_mul_pd(fscal
,dz10
);
1005 /* Update vectorial force */
1006 fix1
= _mm_add_pd(fix1
,tx
);
1007 fiy1
= _mm_add_pd(fiy1
,ty
);
1008 fiz1
= _mm_add_pd(fiz1
,tz
);
1010 fjx0
= _mm_add_pd(fjx0
,tx
);
1011 fjy0
= _mm_add_pd(fjy0
,ty
);
1012 fjz0
= _mm_add_pd(fjz0
,tz
);
1014 /**************************
1015 * CALCULATE INTERACTIONS *
1016 **************************/
1018 r11
= _mm_mul_pd(rsq11
,rinv11
);
1020 /* Calculate table index by multiplying r with table scale and truncate to integer */
1021 rt
= _mm_mul_pd(r11
,vftabscale
);
1022 vfitab
= _mm_cvttpd_epi32(rt
);
1023 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1024 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1026 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1027 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1028 F
= _mm_setzero_pd();
1029 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1030 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1031 H
= _mm_setzero_pd();
1032 GMX_MM_TRANSPOSE2_PD(G
,H
);
1033 Heps
= _mm_mul_pd(vfeps
,H
);
1034 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1035 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1036 velec
= _mm_mul_pd(qq11
,VV
);
1037 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1038 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1040 /* Update potential sum for this i atom from the interaction with this j atom. */
1041 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1042 velecsum
= _mm_add_pd(velecsum
,velec
);
1046 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1048 /* Calculate temporary vectorial force */
1049 tx
= _mm_mul_pd(fscal
,dx11
);
1050 ty
= _mm_mul_pd(fscal
,dy11
);
1051 tz
= _mm_mul_pd(fscal
,dz11
);
1053 /* Update vectorial force */
1054 fix1
= _mm_add_pd(fix1
,tx
);
1055 fiy1
= _mm_add_pd(fiy1
,ty
);
1056 fiz1
= _mm_add_pd(fiz1
,tz
);
1058 fjx1
= _mm_add_pd(fjx1
,tx
);
1059 fjy1
= _mm_add_pd(fjy1
,ty
);
1060 fjz1
= _mm_add_pd(fjz1
,tz
);
1062 /**************************
1063 * CALCULATE INTERACTIONS *
1064 **************************/
1066 r12
= _mm_mul_pd(rsq12
,rinv12
);
1068 /* Calculate table index by multiplying r with table scale and truncate to integer */
1069 rt
= _mm_mul_pd(r12
,vftabscale
);
1070 vfitab
= _mm_cvttpd_epi32(rt
);
1071 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1072 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1074 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1075 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1076 F
= _mm_setzero_pd();
1077 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1078 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1079 H
= _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(G
,H
);
1081 Heps
= _mm_mul_pd(vfeps
,H
);
1082 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1083 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1084 velec
= _mm_mul_pd(qq12
,VV
);
1085 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1086 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1088 /* Update potential sum for this i atom from the interaction with this j atom. */
1089 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1090 velecsum
= _mm_add_pd(velecsum
,velec
);
1094 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1096 /* Calculate temporary vectorial force */
1097 tx
= _mm_mul_pd(fscal
,dx12
);
1098 ty
= _mm_mul_pd(fscal
,dy12
);
1099 tz
= _mm_mul_pd(fscal
,dz12
);
1101 /* Update vectorial force */
1102 fix1
= _mm_add_pd(fix1
,tx
);
1103 fiy1
= _mm_add_pd(fiy1
,ty
);
1104 fiz1
= _mm_add_pd(fiz1
,tz
);
1106 fjx2
= _mm_add_pd(fjx2
,tx
);
1107 fjy2
= _mm_add_pd(fjy2
,ty
);
1108 fjz2
= _mm_add_pd(fjz2
,tz
);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 r20
= _mm_mul_pd(rsq20
,rinv20
);
1116 /* Calculate table index by multiplying r with table scale and truncate to integer */
1117 rt
= _mm_mul_pd(r20
,vftabscale
);
1118 vfitab
= _mm_cvttpd_epi32(rt
);
1119 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1120 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1122 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1123 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1124 F
= _mm_setzero_pd();
1125 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1126 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1127 H
= _mm_setzero_pd();
1128 GMX_MM_TRANSPOSE2_PD(G
,H
);
1129 Heps
= _mm_mul_pd(vfeps
,H
);
1130 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1131 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1132 velec
= _mm_mul_pd(qq20
,VV
);
1133 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1134 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1136 /* Update potential sum for this i atom from the interaction with this j atom. */
1137 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1138 velecsum
= _mm_add_pd(velecsum
,velec
);
1142 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1144 /* Calculate temporary vectorial force */
1145 tx
= _mm_mul_pd(fscal
,dx20
);
1146 ty
= _mm_mul_pd(fscal
,dy20
);
1147 tz
= _mm_mul_pd(fscal
,dz20
);
1149 /* Update vectorial force */
1150 fix2
= _mm_add_pd(fix2
,tx
);
1151 fiy2
= _mm_add_pd(fiy2
,ty
);
1152 fiz2
= _mm_add_pd(fiz2
,tz
);
1154 fjx0
= _mm_add_pd(fjx0
,tx
);
1155 fjy0
= _mm_add_pd(fjy0
,ty
);
1156 fjz0
= _mm_add_pd(fjz0
,tz
);
1158 /**************************
1159 * CALCULATE INTERACTIONS *
1160 **************************/
1162 r21
= _mm_mul_pd(rsq21
,rinv21
);
1164 /* Calculate table index by multiplying r with table scale and truncate to integer */
1165 rt
= _mm_mul_pd(r21
,vftabscale
);
1166 vfitab
= _mm_cvttpd_epi32(rt
);
1167 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1168 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1170 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1171 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1172 F
= _mm_setzero_pd();
1173 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1174 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1175 H
= _mm_setzero_pd();
1176 GMX_MM_TRANSPOSE2_PD(G
,H
);
1177 Heps
= _mm_mul_pd(vfeps
,H
);
1178 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1179 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1180 velec
= _mm_mul_pd(qq21
,VV
);
1181 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1182 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1184 /* Update potential sum for this i atom from the interaction with this j atom. */
1185 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1186 velecsum
= _mm_add_pd(velecsum
,velec
);
1190 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1192 /* Calculate temporary vectorial force */
1193 tx
= _mm_mul_pd(fscal
,dx21
);
1194 ty
= _mm_mul_pd(fscal
,dy21
);
1195 tz
= _mm_mul_pd(fscal
,dz21
);
1197 /* Update vectorial force */
1198 fix2
= _mm_add_pd(fix2
,tx
);
1199 fiy2
= _mm_add_pd(fiy2
,ty
);
1200 fiz2
= _mm_add_pd(fiz2
,tz
);
1202 fjx1
= _mm_add_pd(fjx1
,tx
);
1203 fjy1
= _mm_add_pd(fjy1
,ty
);
1204 fjz1
= _mm_add_pd(fjz1
,tz
);
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 r22
= _mm_mul_pd(rsq22
,rinv22
);
1212 /* Calculate table index by multiplying r with table scale and truncate to integer */
1213 rt
= _mm_mul_pd(r22
,vftabscale
);
1214 vfitab
= _mm_cvttpd_epi32(rt
);
1215 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1216 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1218 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1219 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1220 F
= _mm_setzero_pd();
1221 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1222 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1223 H
= _mm_setzero_pd();
1224 GMX_MM_TRANSPOSE2_PD(G
,H
);
1225 Heps
= _mm_mul_pd(vfeps
,H
);
1226 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1227 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1228 velec
= _mm_mul_pd(qq22
,VV
);
1229 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1230 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1232 /* Update potential sum for this i atom from the interaction with this j atom. */
1233 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1234 velecsum
= _mm_add_pd(velecsum
,velec
);
1238 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1240 /* Calculate temporary vectorial force */
1241 tx
= _mm_mul_pd(fscal
,dx22
);
1242 ty
= _mm_mul_pd(fscal
,dy22
);
1243 tz
= _mm_mul_pd(fscal
,dz22
);
1245 /* Update vectorial force */
1246 fix2
= _mm_add_pd(fix2
,tx
);
1247 fiy2
= _mm_add_pd(fiy2
,ty
);
1248 fiz2
= _mm_add_pd(fiz2
,tz
);
1250 fjx2
= _mm_add_pd(fjx2
,tx
);
1251 fjy2
= _mm_add_pd(fjy2
,ty
);
1252 fjz2
= _mm_add_pd(fjz2
,tz
);
1254 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1256 /* Inner loop uses 417 flops */
1259 /* End of innermost loop */
1261 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1262 f
+i_coord_offset
,fshift
+i_shift_offset
);
1265 /* Update potential energies */
1266 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1267 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1269 /* Increment number of inner iterations */
1270 inneriter
+= j_index_end
- j_index_start
;
1272 /* Outer loop uses 20 flops */
1275 /* Increment number of outer iterations */
1278 /* Update outer/inner flops */
1280 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*417);
1283 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1284 * Electrostatics interaction: CubicSplineTable
1285 * VdW interaction: CubicSplineTable
1286 * Geometry: Water3-Water3
1287 * Calculate force/pot: Force
1290 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1291 (t_nblist
* gmx_restrict nlist
,
1292 rvec
* gmx_restrict xx
,
1293 rvec
* gmx_restrict ff
,
1294 t_forcerec
* gmx_restrict fr
,
1295 t_mdatoms
* gmx_restrict mdatoms
,
1296 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1297 t_nrnb
* gmx_restrict nrnb
)
1299 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1300 * just 0 for non-waters.
1301 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1302 * jnr indices corresponding to data put in the two positions in the SIMD register.
1304 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1305 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1307 int j_coord_offsetA
,j_coord_offsetB
;
1308 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1309 real rcutoff_scalar
;
1310 real
*shiftvec
,*fshift
,*x
,*f
;
1311 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1313 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1315 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1317 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1318 int vdwjidx0A
,vdwjidx0B
;
1319 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1320 int vdwjidx1A
,vdwjidx1B
;
1321 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1322 int vdwjidx2A
,vdwjidx2B
;
1323 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1324 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1325 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1326 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1327 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1328 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1329 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1330 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1331 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1332 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1333 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1336 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1339 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1340 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1342 __m128i ifour
= _mm_set1_epi32(4);
1343 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1345 __m128d dummy_mask
,cutoff_mask
;
1346 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1347 __m128d one
= _mm_set1_pd(1.0);
1348 __m128d two
= _mm_set1_pd(2.0);
1354 jindex
= nlist
->jindex
;
1356 shiftidx
= nlist
->shift
;
1358 shiftvec
= fr
->shift_vec
[0];
1359 fshift
= fr
->fshift
[0];
1360 facel
= _mm_set1_pd(fr
->epsfac
);
1361 charge
= mdatoms
->chargeA
;
1362 nvdwtype
= fr
->ntype
;
1363 vdwparam
= fr
->nbfp
;
1364 vdwtype
= mdatoms
->typeA
;
1366 vftab
= kernel_data
->table_elec_vdw
->data
;
1367 vftabscale
= _mm_set1_pd(kernel_data
->table_elec_vdw
->scale
);
1369 /* Setup water-specific parameters */
1370 inr
= nlist
->iinr
[0];
1371 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1372 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1373 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1374 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1376 jq0
= _mm_set1_pd(charge
[inr
+0]);
1377 jq1
= _mm_set1_pd(charge
[inr
+1]);
1378 jq2
= _mm_set1_pd(charge
[inr
+2]);
1379 vdwjidx0A
= 2*vdwtype
[inr
+0];
1380 qq00
= _mm_mul_pd(iq0
,jq0
);
1381 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1382 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1383 qq01
= _mm_mul_pd(iq0
,jq1
);
1384 qq02
= _mm_mul_pd(iq0
,jq2
);
1385 qq10
= _mm_mul_pd(iq1
,jq0
);
1386 qq11
= _mm_mul_pd(iq1
,jq1
);
1387 qq12
= _mm_mul_pd(iq1
,jq2
);
1388 qq20
= _mm_mul_pd(iq2
,jq0
);
1389 qq21
= _mm_mul_pd(iq2
,jq1
);
1390 qq22
= _mm_mul_pd(iq2
,jq2
);
1392 /* Avoid stupid compiler warnings */
1394 j_coord_offsetA
= 0;
1395 j_coord_offsetB
= 0;
1400 /* Start outer loop over neighborlists */
1401 for(iidx
=0; iidx
<nri
; iidx
++)
1403 /* Load shift vector for this list */
1404 i_shift_offset
= DIM
*shiftidx
[iidx
];
1406 /* Load limits for loop over neighbors */
1407 j_index_start
= jindex
[iidx
];
1408 j_index_end
= jindex
[iidx
+1];
1410 /* Get outer coordinate index */
1412 i_coord_offset
= DIM
*inr
;
1414 /* Load i particle coords and add shift vector */
1415 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1416 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1418 fix0
= _mm_setzero_pd();
1419 fiy0
= _mm_setzero_pd();
1420 fiz0
= _mm_setzero_pd();
1421 fix1
= _mm_setzero_pd();
1422 fiy1
= _mm_setzero_pd();
1423 fiz1
= _mm_setzero_pd();
1424 fix2
= _mm_setzero_pd();
1425 fiy2
= _mm_setzero_pd();
1426 fiz2
= _mm_setzero_pd();
1428 /* Start inner kernel loop */
1429 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1432 /* Get j neighbor index, and coordinate index */
1434 jnrB
= jjnr
[jidx
+1];
1435 j_coord_offsetA
= DIM
*jnrA
;
1436 j_coord_offsetB
= DIM
*jnrB
;
1438 /* load j atom coordinates */
1439 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1440 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1442 /* Calculate displacement vector */
1443 dx00
= _mm_sub_pd(ix0
,jx0
);
1444 dy00
= _mm_sub_pd(iy0
,jy0
);
1445 dz00
= _mm_sub_pd(iz0
,jz0
);
1446 dx01
= _mm_sub_pd(ix0
,jx1
);
1447 dy01
= _mm_sub_pd(iy0
,jy1
);
1448 dz01
= _mm_sub_pd(iz0
,jz1
);
1449 dx02
= _mm_sub_pd(ix0
,jx2
);
1450 dy02
= _mm_sub_pd(iy0
,jy2
);
1451 dz02
= _mm_sub_pd(iz0
,jz2
);
1452 dx10
= _mm_sub_pd(ix1
,jx0
);
1453 dy10
= _mm_sub_pd(iy1
,jy0
);
1454 dz10
= _mm_sub_pd(iz1
,jz0
);
1455 dx11
= _mm_sub_pd(ix1
,jx1
);
1456 dy11
= _mm_sub_pd(iy1
,jy1
);
1457 dz11
= _mm_sub_pd(iz1
,jz1
);
1458 dx12
= _mm_sub_pd(ix1
,jx2
);
1459 dy12
= _mm_sub_pd(iy1
,jy2
);
1460 dz12
= _mm_sub_pd(iz1
,jz2
);
1461 dx20
= _mm_sub_pd(ix2
,jx0
);
1462 dy20
= _mm_sub_pd(iy2
,jy0
);
1463 dz20
= _mm_sub_pd(iz2
,jz0
);
1464 dx21
= _mm_sub_pd(ix2
,jx1
);
1465 dy21
= _mm_sub_pd(iy2
,jy1
);
1466 dz21
= _mm_sub_pd(iz2
,jz1
);
1467 dx22
= _mm_sub_pd(ix2
,jx2
);
1468 dy22
= _mm_sub_pd(iy2
,jy2
);
1469 dz22
= _mm_sub_pd(iz2
,jz2
);
1471 /* Calculate squared distance and things based on it */
1472 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1473 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1474 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1475 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1476 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1477 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1478 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1479 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1480 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1482 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1483 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1484 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1485 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1486 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1487 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1488 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1489 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1490 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1492 fjx0
= _mm_setzero_pd();
1493 fjy0
= _mm_setzero_pd();
1494 fjz0
= _mm_setzero_pd();
1495 fjx1
= _mm_setzero_pd();
1496 fjy1
= _mm_setzero_pd();
1497 fjz1
= _mm_setzero_pd();
1498 fjx2
= _mm_setzero_pd();
1499 fjy2
= _mm_setzero_pd();
1500 fjz2
= _mm_setzero_pd();
1502 /**************************
1503 * CALCULATE INTERACTIONS *
1504 **************************/
1506 r00
= _mm_mul_pd(rsq00
,rinv00
);
1508 /* Calculate table index by multiplying r with table scale and truncate to integer */
1509 rt
= _mm_mul_pd(r00
,vftabscale
);
1510 vfitab
= _mm_cvttpd_epi32(rt
);
1511 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1512 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1514 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1515 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1516 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1517 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1518 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1519 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1520 GMX_MM_TRANSPOSE2_PD(G
,H
);
1521 Heps
= _mm_mul_pd(vfeps
,H
);
1522 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1523 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1524 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1526 /* CUBIC SPLINE TABLE DISPERSION */
1527 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1528 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1529 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1530 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1531 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1532 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1533 GMX_MM_TRANSPOSE2_PD(G
,H
);
1534 Heps
= _mm_mul_pd(vfeps
,H
);
1535 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1536 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1537 fvdw6
= _mm_mul_pd(c6_00
,FF
);
1539 /* CUBIC SPLINE TABLE REPULSION */
1540 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1541 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1542 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1543 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1544 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1545 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1546 GMX_MM_TRANSPOSE2_PD(G
,H
);
1547 Heps
= _mm_mul_pd(vfeps
,H
);
1548 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1549 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1550 fvdw12
= _mm_mul_pd(c12_00
,FF
);
1551 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
1553 fscal
= _mm_add_pd(felec
,fvdw
);
1555 /* Calculate temporary vectorial force */
1556 tx
= _mm_mul_pd(fscal
,dx00
);
1557 ty
= _mm_mul_pd(fscal
,dy00
);
1558 tz
= _mm_mul_pd(fscal
,dz00
);
1560 /* Update vectorial force */
1561 fix0
= _mm_add_pd(fix0
,tx
);
1562 fiy0
= _mm_add_pd(fiy0
,ty
);
1563 fiz0
= _mm_add_pd(fiz0
,tz
);
1565 fjx0
= _mm_add_pd(fjx0
,tx
);
1566 fjy0
= _mm_add_pd(fjy0
,ty
);
1567 fjz0
= _mm_add_pd(fjz0
,tz
);
1569 /**************************
1570 * CALCULATE INTERACTIONS *
1571 **************************/
1573 r01
= _mm_mul_pd(rsq01
,rinv01
);
1575 /* Calculate table index by multiplying r with table scale and truncate to integer */
1576 rt
= _mm_mul_pd(r01
,vftabscale
);
1577 vfitab
= _mm_cvttpd_epi32(rt
);
1578 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1579 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1581 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1582 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1583 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1584 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1585 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1586 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1587 GMX_MM_TRANSPOSE2_PD(G
,H
);
1588 Heps
= _mm_mul_pd(vfeps
,H
);
1589 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1590 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1591 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
1595 /* Calculate temporary vectorial force */
1596 tx
= _mm_mul_pd(fscal
,dx01
);
1597 ty
= _mm_mul_pd(fscal
,dy01
);
1598 tz
= _mm_mul_pd(fscal
,dz01
);
1600 /* Update vectorial force */
1601 fix0
= _mm_add_pd(fix0
,tx
);
1602 fiy0
= _mm_add_pd(fiy0
,ty
);
1603 fiz0
= _mm_add_pd(fiz0
,tz
);
1605 fjx1
= _mm_add_pd(fjx1
,tx
);
1606 fjy1
= _mm_add_pd(fjy1
,ty
);
1607 fjz1
= _mm_add_pd(fjz1
,tz
);
1609 /**************************
1610 * CALCULATE INTERACTIONS *
1611 **************************/
1613 r02
= _mm_mul_pd(rsq02
,rinv02
);
1615 /* Calculate table index by multiplying r with table scale and truncate to integer */
1616 rt
= _mm_mul_pd(r02
,vftabscale
);
1617 vfitab
= _mm_cvttpd_epi32(rt
);
1618 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1619 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1622 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1623 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1624 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1625 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1626 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1627 GMX_MM_TRANSPOSE2_PD(G
,H
);
1628 Heps
= _mm_mul_pd(vfeps
,H
);
1629 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1630 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1631 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
1635 /* Calculate temporary vectorial force */
1636 tx
= _mm_mul_pd(fscal
,dx02
);
1637 ty
= _mm_mul_pd(fscal
,dy02
);
1638 tz
= _mm_mul_pd(fscal
,dz02
);
1640 /* Update vectorial force */
1641 fix0
= _mm_add_pd(fix0
,tx
);
1642 fiy0
= _mm_add_pd(fiy0
,ty
);
1643 fiz0
= _mm_add_pd(fiz0
,tz
);
1645 fjx2
= _mm_add_pd(fjx2
,tx
);
1646 fjy2
= _mm_add_pd(fjy2
,ty
);
1647 fjz2
= _mm_add_pd(fjz2
,tz
);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 r10
= _mm_mul_pd(rsq10
,rinv10
);
1655 /* Calculate table index by multiplying r with table scale and truncate to integer */
1656 rt
= _mm_mul_pd(r10
,vftabscale
);
1657 vfitab
= _mm_cvttpd_epi32(rt
);
1658 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1659 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1661 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1662 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1663 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1664 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1665 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1666 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1667 GMX_MM_TRANSPOSE2_PD(G
,H
);
1668 Heps
= _mm_mul_pd(vfeps
,H
);
1669 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1670 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1671 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1675 /* Calculate temporary vectorial force */
1676 tx
= _mm_mul_pd(fscal
,dx10
);
1677 ty
= _mm_mul_pd(fscal
,dy10
);
1678 tz
= _mm_mul_pd(fscal
,dz10
);
1680 /* Update vectorial force */
1681 fix1
= _mm_add_pd(fix1
,tx
);
1682 fiy1
= _mm_add_pd(fiy1
,ty
);
1683 fiz1
= _mm_add_pd(fiz1
,tz
);
1685 fjx0
= _mm_add_pd(fjx0
,tx
);
1686 fjy0
= _mm_add_pd(fjy0
,ty
);
1687 fjz0
= _mm_add_pd(fjz0
,tz
);
1689 /**************************
1690 * CALCULATE INTERACTIONS *
1691 **************************/
1693 r11
= _mm_mul_pd(rsq11
,rinv11
);
1695 /* Calculate table index by multiplying r with table scale and truncate to integer */
1696 rt
= _mm_mul_pd(r11
,vftabscale
);
1697 vfitab
= _mm_cvttpd_epi32(rt
);
1698 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1699 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1701 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1703 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1704 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1705 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1706 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1707 GMX_MM_TRANSPOSE2_PD(G
,H
);
1708 Heps
= _mm_mul_pd(vfeps
,H
);
1709 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1710 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1711 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1715 /* Calculate temporary vectorial force */
1716 tx
= _mm_mul_pd(fscal
,dx11
);
1717 ty
= _mm_mul_pd(fscal
,dy11
);
1718 tz
= _mm_mul_pd(fscal
,dz11
);
1720 /* Update vectorial force */
1721 fix1
= _mm_add_pd(fix1
,tx
);
1722 fiy1
= _mm_add_pd(fiy1
,ty
);
1723 fiz1
= _mm_add_pd(fiz1
,tz
);
1725 fjx1
= _mm_add_pd(fjx1
,tx
);
1726 fjy1
= _mm_add_pd(fjy1
,ty
);
1727 fjz1
= _mm_add_pd(fjz1
,tz
);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r12
= _mm_mul_pd(rsq12
,rinv12
);
1735 /* Calculate table index by multiplying r with table scale and truncate to integer */
1736 rt
= _mm_mul_pd(r12
,vftabscale
);
1737 vfitab
= _mm_cvttpd_epi32(rt
);
1738 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1739 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1743 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1744 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1745 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1746 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1747 GMX_MM_TRANSPOSE2_PD(G
,H
);
1748 Heps
= _mm_mul_pd(vfeps
,H
);
1749 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1750 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1751 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1755 /* Calculate temporary vectorial force */
1756 tx
= _mm_mul_pd(fscal
,dx12
);
1757 ty
= _mm_mul_pd(fscal
,dy12
);
1758 tz
= _mm_mul_pd(fscal
,dz12
);
1760 /* Update vectorial force */
1761 fix1
= _mm_add_pd(fix1
,tx
);
1762 fiy1
= _mm_add_pd(fiy1
,ty
);
1763 fiz1
= _mm_add_pd(fiz1
,tz
);
1765 fjx2
= _mm_add_pd(fjx2
,tx
);
1766 fjy2
= _mm_add_pd(fjy2
,ty
);
1767 fjz2
= _mm_add_pd(fjz2
,tz
);
1769 /**************************
1770 * CALCULATE INTERACTIONS *
1771 **************************/
1773 r20
= _mm_mul_pd(rsq20
,rinv20
);
1775 /* Calculate table index by multiplying r with table scale and truncate to integer */
1776 rt
= _mm_mul_pd(r20
,vftabscale
);
1777 vfitab
= _mm_cvttpd_epi32(rt
);
1778 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1779 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1781 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1782 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1783 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1784 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1785 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1786 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1787 GMX_MM_TRANSPOSE2_PD(G
,H
);
1788 Heps
= _mm_mul_pd(vfeps
,H
);
1789 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1790 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1791 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1795 /* Calculate temporary vectorial force */
1796 tx
= _mm_mul_pd(fscal
,dx20
);
1797 ty
= _mm_mul_pd(fscal
,dy20
);
1798 tz
= _mm_mul_pd(fscal
,dz20
);
1800 /* Update vectorial force */
1801 fix2
= _mm_add_pd(fix2
,tx
);
1802 fiy2
= _mm_add_pd(fiy2
,ty
);
1803 fiz2
= _mm_add_pd(fiz2
,tz
);
1805 fjx0
= _mm_add_pd(fjx0
,tx
);
1806 fjy0
= _mm_add_pd(fjy0
,ty
);
1807 fjz0
= _mm_add_pd(fjz0
,tz
);
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 r21
= _mm_mul_pd(rsq21
,rinv21
);
1815 /* Calculate table index by multiplying r with table scale and truncate to integer */
1816 rt
= _mm_mul_pd(r21
,vftabscale
);
1817 vfitab
= _mm_cvttpd_epi32(rt
);
1818 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1819 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1821 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1822 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1823 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1824 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1825 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1826 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1827 GMX_MM_TRANSPOSE2_PD(G
,H
);
1828 Heps
= _mm_mul_pd(vfeps
,H
);
1829 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1830 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1831 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1835 /* Calculate temporary vectorial force */
1836 tx
= _mm_mul_pd(fscal
,dx21
);
1837 ty
= _mm_mul_pd(fscal
,dy21
);
1838 tz
= _mm_mul_pd(fscal
,dz21
);
1840 /* Update vectorial force */
1841 fix2
= _mm_add_pd(fix2
,tx
);
1842 fiy2
= _mm_add_pd(fiy2
,ty
);
1843 fiz2
= _mm_add_pd(fiz2
,tz
);
1845 fjx1
= _mm_add_pd(fjx1
,tx
);
1846 fjy1
= _mm_add_pd(fjy1
,ty
);
1847 fjz1
= _mm_add_pd(fjz1
,tz
);
1849 /**************************
1850 * CALCULATE INTERACTIONS *
1851 **************************/
1853 r22
= _mm_mul_pd(rsq22
,rinv22
);
1855 /* Calculate table index by multiplying r with table scale and truncate to integer */
1856 rt
= _mm_mul_pd(r22
,vftabscale
);
1857 vfitab
= _mm_cvttpd_epi32(rt
);
1858 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1859 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1861 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1862 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1863 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1864 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1865 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1866 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1867 GMX_MM_TRANSPOSE2_PD(G
,H
);
1868 Heps
= _mm_mul_pd(vfeps
,H
);
1869 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1870 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1871 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1875 /* Calculate temporary vectorial force */
1876 tx
= _mm_mul_pd(fscal
,dx22
);
1877 ty
= _mm_mul_pd(fscal
,dy22
);
1878 tz
= _mm_mul_pd(fscal
,dz22
);
1880 /* Update vectorial force */
1881 fix2
= _mm_add_pd(fix2
,tx
);
1882 fiy2
= _mm_add_pd(fiy2
,ty
);
1883 fiz2
= _mm_add_pd(fiz2
,tz
);
1885 fjx2
= _mm_add_pd(fjx2
,tx
);
1886 fjy2
= _mm_add_pd(fjy2
,ty
);
1887 fjz2
= _mm_add_pd(fjz2
,tz
);
1889 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1891 /* Inner loop uses 373 flops */
1894 if(jidx
<j_index_end
)
1898 j_coord_offsetA
= DIM
*jnrA
;
1900 /* load j atom coordinates */
1901 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1902 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1904 /* Calculate displacement vector */
1905 dx00
= _mm_sub_pd(ix0
,jx0
);
1906 dy00
= _mm_sub_pd(iy0
,jy0
);
1907 dz00
= _mm_sub_pd(iz0
,jz0
);
1908 dx01
= _mm_sub_pd(ix0
,jx1
);
1909 dy01
= _mm_sub_pd(iy0
,jy1
);
1910 dz01
= _mm_sub_pd(iz0
,jz1
);
1911 dx02
= _mm_sub_pd(ix0
,jx2
);
1912 dy02
= _mm_sub_pd(iy0
,jy2
);
1913 dz02
= _mm_sub_pd(iz0
,jz2
);
1914 dx10
= _mm_sub_pd(ix1
,jx0
);
1915 dy10
= _mm_sub_pd(iy1
,jy0
);
1916 dz10
= _mm_sub_pd(iz1
,jz0
);
1917 dx11
= _mm_sub_pd(ix1
,jx1
);
1918 dy11
= _mm_sub_pd(iy1
,jy1
);
1919 dz11
= _mm_sub_pd(iz1
,jz1
);
1920 dx12
= _mm_sub_pd(ix1
,jx2
);
1921 dy12
= _mm_sub_pd(iy1
,jy2
);
1922 dz12
= _mm_sub_pd(iz1
,jz2
);
1923 dx20
= _mm_sub_pd(ix2
,jx0
);
1924 dy20
= _mm_sub_pd(iy2
,jy0
);
1925 dz20
= _mm_sub_pd(iz2
,jz0
);
1926 dx21
= _mm_sub_pd(ix2
,jx1
);
1927 dy21
= _mm_sub_pd(iy2
,jy1
);
1928 dz21
= _mm_sub_pd(iz2
,jz1
);
1929 dx22
= _mm_sub_pd(ix2
,jx2
);
1930 dy22
= _mm_sub_pd(iy2
,jy2
);
1931 dz22
= _mm_sub_pd(iz2
,jz2
);
1933 /* Calculate squared distance and things based on it */
1934 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1935 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1936 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1937 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1938 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1939 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1940 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1941 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1942 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1944 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1945 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1946 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1947 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1948 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1949 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1950 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1951 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1952 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1954 fjx0
= _mm_setzero_pd();
1955 fjy0
= _mm_setzero_pd();
1956 fjz0
= _mm_setzero_pd();
1957 fjx1
= _mm_setzero_pd();
1958 fjy1
= _mm_setzero_pd();
1959 fjz1
= _mm_setzero_pd();
1960 fjx2
= _mm_setzero_pd();
1961 fjy2
= _mm_setzero_pd();
1962 fjz2
= _mm_setzero_pd();
1964 /**************************
1965 * CALCULATE INTERACTIONS *
1966 **************************/
1968 r00
= _mm_mul_pd(rsq00
,rinv00
);
1970 /* Calculate table index by multiplying r with table scale and truncate to integer */
1971 rt
= _mm_mul_pd(r00
,vftabscale
);
1972 vfitab
= _mm_cvttpd_epi32(rt
);
1973 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1974 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
1976 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1977 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1978 F
= _mm_setzero_pd();
1979 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1980 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1981 H
= _mm_setzero_pd();
1982 GMX_MM_TRANSPOSE2_PD(G
,H
);
1983 Heps
= _mm_mul_pd(vfeps
,H
);
1984 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1985 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1986 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1988 /* CUBIC SPLINE TABLE DISPERSION */
1989 vfitab
= _mm_add_epi32(vfitab
,ifour
);
1990 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1991 F
= _mm_setzero_pd();
1992 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1993 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1994 H
= _mm_setzero_pd();
1995 GMX_MM_TRANSPOSE2_PD(G
,H
);
1996 Heps
= _mm_mul_pd(vfeps
,H
);
1997 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1998 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1999 fvdw6
= _mm_mul_pd(c6_00
,FF
);
2001 /* CUBIC SPLINE TABLE REPULSION */
2002 vfitab
= _mm_add_epi32(vfitab
,ifour
);
2003 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2004 F
= _mm_setzero_pd();
2005 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2006 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2007 H
= _mm_setzero_pd();
2008 GMX_MM_TRANSPOSE2_PD(G
,H
);
2009 Heps
= _mm_mul_pd(vfeps
,H
);
2010 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2011 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2012 fvdw12
= _mm_mul_pd(c12_00
,FF
);
2013 fvdw
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_add_pd(fvdw6
,fvdw12
),_mm_mul_pd(vftabscale
,rinv00
)));
2015 fscal
= _mm_add_pd(felec
,fvdw
);
2017 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2019 /* Calculate temporary vectorial force */
2020 tx
= _mm_mul_pd(fscal
,dx00
);
2021 ty
= _mm_mul_pd(fscal
,dy00
);
2022 tz
= _mm_mul_pd(fscal
,dz00
);
2024 /* Update vectorial force */
2025 fix0
= _mm_add_pd(fix0
,tx
);
2026 fiy0
= _mm_add_pd(fiy0
,ty
);
2027 fiz0
= _mm_add_pd(fiz0
,tz
);
2029 fjx0
= _mm_add_pd(fjx0
,tx
);
2030 fjy0
= _mm_add_pd(fjy0
,ty
);
2031 fjz0
= _mm_add_pd(fjz0
,tz
);
2033 /**************************
2034 * CALCULATE INTERACTIONS *
2035 **************************/
2037 r01
= _mm_mul_pd(rsq01
,rinv01
);
2039 /* Calculate table index by multiplying r with table scale and truncate to integer */
2040 rt
= _mm_mul_pd(r01
,vftabscale
);
2041 vfitab
= _mm_cvttpd_epi32(rt
);
2042 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2043 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2045 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2046 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2047 F
= _mm_setzero_pd();
2048 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2049 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2050 H
= _mm_setzero_pd();
2051 GMX_MM_TRANSPOSE2_PD(G
,H
);
2052 Heps
= _mm_mul_pd(vfeps
,H
);
2053 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2054 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2055 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
2059 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2061 /* Calculate temporary vectorial force */
2062 tx
= _mm_mul_pd(fscal
,dx01
);
2063 ty
= _mm_mul_pd(fscal
,dy01
);
2064 tz
= _mm_mul_pd(fscal
,dz01
);
2066 /* Update vectorial force */
2067 fix0
= _mm_add_pd(fix0
,tx
);
2068 fiy0
= _mm_add_pd(fiy0
,ty
);
2069 fiz0
= _mm_add_pd(fiz0
,tz
);
2071 fjx1
= _mm_add_pd(fjx1
,tx
);
2072 fjy1
= _mm_add_pd(fjy1
,ty
);
2073 fjz1
= _mm_add_pd(fjz1
,tz
);
2075 /**************************
2076 * CALCULATE INTERACTIONS *
2077 **************************/
2079 r02
= _mm_mul_pd(rsq02
,rinv02
);
2081 /* Calculate table index by multiplying r with table scale and truncate to integer */
2082 rt
= _mm_mul_pd(r02
,vftabscale
);
2083 vfitab
= _mm_cvttpd_epi32(rt
);
2084 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2085 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2089 F
= _mm_setzero_pd();
2090 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2091 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2092 H
= _mm_setzero_pd();
2093 GMX_MM_TRANSPOSE2_PD(G
,H
);
2094 Heps
= _mm_mul_pd(vfeps
,H
);
2095 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2096 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2097 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
2101 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2103 /* Calculate temporary vectorial force */
2104 tx
= _mm_mul_pd(fscal
,dx02
);
2105 ty
= _mm_mul_pd(fscal
,dy02
);
2106 tz
= _mm_mul_pd(fscal
,dz02
);
2108 /* Update vectorial force */
2109 fix0
= _mm_add_pd(fix0
,tx
);
2110 fiy0
= _mm_add_pd(fiy0
,ty
);
2111 fiz0
= _mm_add_pd(fiz0
,tz
);
2113 fjx2
= _mm_add_pd(fjx2
,tx
);
2114 fjy2
= _mm_add_pd(fjy2
,ty
);
2115 fjz2
= _mm_add_pd(fjz2
,tz
);
2117 /**************************
2118 * CALCULATE INTERACTIONS *
2119 **************************/
2121 r10
= _mm_mul_pd(rsq10
,rinv10
);
2123 /* Calculate table index by multiplying r with table scale and truncate to integer */
2124 rt
= _mm_mul_pd(r10
,vftabscale
);
2125 vfitab
= _mm_cvttpd_epi32(rt
);
2126 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2127 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2129 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2130 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2131 F
= _mm_setzero_pd();
2132 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2133 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2134 H
= _mm_setzero_pd();
2135 GMX_MM_TRANSPOSE2_PD(G
,H
);
2136 Heps
= _mm_mul_pd(vfeps
,H
);
2137 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2138 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2139 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
2143 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2145 /* Calculate temporary vectorial force */
2146 tx
= _mm_mul_pd(fscal
,dx10
);
2147 ty
= _mm_mul_pd(fscal
,dy10
);
2148 tz
= _mm_mul_pd(fscal
,dz10
);
2150 /* Update vectorial force */
2151 fix1
= _mm_add_pd(fix1
,tx
);
2152 fiy1
= _mm_add_pd(fiy1
,ty
);
2153 fiz1
= _mm_add_pd(fiz1
,tz
);
2155 fjx0
= _mm_add_pd(fjx0
,tx
);
2156 fjy0
= _mm_add_pd(fjy0
,ty
);
2157 fjz0
= _mm_add_pd(fjz0
,tz
);
2159 /**************************
2160 * CALCULATE INTERACTIONS *
2161 **************************/
2163 r11
= _mm_mul_pd(rsq11
,rinv11
);
2165 /* Calculate table index by multiplying r with table scale and truncate to integer */
2166 rt
= _mm_mul_pd(r11
,vftabscale
);
2167 vfitab
= _mm_cvttpd_epi32(rt
);
2168 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2169 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2171 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2172 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2173 F
= _mm_setzero_pd();
2174 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2175 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2176 H
= _mm_setzero_pd();
2177 GMX_MM_TRANSPOSE2_PD(G
,H
);
2178 Heps
= _mm_mul_pd(vfeps
,H
);
2179 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2180 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2181 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
2185 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2187 /* Calculate temporary vectorial force */
2188 tx
= _mm_mul_pd(fscal
,dx11
);
2189 ty
= _mm_mul_pd(fscal
,dy11
);
2190 tz
= _mm_mul_pd(fscal
,dz11
);
2192 /* Update vectorial force */
2193 fix1
= _mm_add_pd(fix1
,tx
);
2194 fiy1
= _mm_add_pd(fiy1
,ty
);
2195 fiz1
= _mm_add_pd(fiz1
,tz
);
2197 fjx1
= _mm_add_pd(fjx1
,tx
);
2198 fjy1
= _mm_add_pd(fjy1
,ty
);
2199 fjz1
= _mm_add_pd(fjz1
,tz
);
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2205 r12
= _mm_mul_pd(rsq12
,rinv12
);
2207 /* Calculate table index by multiplying r with table scale and truncate to integer */
2208 rt
= _mm_mul_pd(r12
,vftabscale
);
2209 vfitab
= _mm_cvttpd_epi32(rt
);
2210 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2211 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2213 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2214 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2215 F
= _mm_setzero_pd();
2216 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2217 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2218 H
= _mm_setzero_pd();
2219 GMX_MM_TRANSPOSE2_PD(G
,H
);
2220 Heps
= _mm_mul_pd(vfeps
,H
);
2221 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2222 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2223 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
2227 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2229 /* Calculate temporary vectorial force */
2230 tx
= _mm_mul_pd(fscal
,dx12
);
2231 ty
= _mm_mul_pd(fscal
,dy12
);
2232 tz
= _mm_mul_pd(fscal
,dz12
);
2234 /* Update vectorial force */
2235 fix1
= _mm_add_pd(fix1
,tx
);
2236 fiy1
= _mm_add_pd(fiy1
,ty
);
2237 fiz1
= _mm_add_pd(fiz1
,tz
);
2239 fjx2
= _mm_add_pd(fjx2
,tx
);
2240 fjy2
= _mm_add_pd(fjy2
,ty
);
2241 fjz2
= _mm_add_pd(fjz2
,tz
);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 r20
= _mm_mul_pd(rsq20
,rinv20
);
2249 /* Calculate table index by multiplying r with table scale and truncate to integer */
2250 rt
= _mm_mul_pd(r20
,vftabscale
);
2251 vfitab
= _mm_cvttpd_epi32(rt
);
2252 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2253 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2255 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2256 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2257 F
= _mm_setzero_pd();
2258 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2259 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2260 H
= _mm_setzero_pd();
2261 GMX_MM_TRANSPOSE2_PD(G
,H
);
2262 Heps
= _mm_mul_pd(vfeps
,H
);
2263 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2264 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2265 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
2269 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2271 /* Calculate temporary vectorial force */
2272 tx
= _mm_mul_pd(fscal
,dx20
);
2273 ty
= _mm_mul_pd(fscal
,dy20
);
2274 tz
= _mm_mul_pd(fscal
,dz20
);
2276 /* Update vectorial force */
2277 fix2
= _mm_add_pd(fix2
,tx
);
2278 fiy2
= _mm_add_pd(fiy2
,ty
);
2279 fiz2
= _mm_add_pd(fiz2
,tz
);
2281 fjx0
= _mm_add_pd(fjx0
,tx
);
2282 fjy0
= _mm_add_pd(fjy0
,ty
);
2283 fjz0
= _mm_add_pd(fjz0
,tz
);
2285 /**************************
2286 * CALCULATE INTERACTIONS *
2287 **************************/
2289 r21
= _mm_mul_pd(rsq21
,rinv21
);
2291 /* Calculate table index by multiplying r with table scale and truncate to integer */
2292 rt
= _mm_mul_pd(r21
,vftabscale
);
2293 vfitab
= _mm_cvttpd_epi32(rt
);
2294 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2295 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2297 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2298 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2299 F
= _mm_setzero_pd();
2300 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2301 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2302 H
= _mm_setzero_pd();
2303 GMX_MM_TRANSPOSE2_PD(G
,H
);
2304 Heps
= _mm_mul_pd(vfeps
,H
);
2305 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2306 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2307 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
2311 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2313 /* Calculate temporary vectorial force */
2314 tx
= _mm_mul_pd(fscal
,dx21
);
2315 ty
= _mm_mul_pd(fscal
,dy21
);
2316 tz
= _mm_mul_pd(fscal
,dz21
);
2318 /* Update vectorial force */
2319 fix2
= _mm_add_pd(fix2
,tx
);
2320 fiy2
= _mm_add_pd(fiy2
,ty
);
2321 fiz2
= _mm_add_pd(fiz2
,tz
);
2323 fjx1
= _mm_add_pd(fjx1
,tx
);
2324 fjy1
= _mm_add_pd(fjy1
,ty
);
2325 fjz1
= _mm_add_pd(fjz1
,tz
);
2327 /**************************
2328 * CALCULATE INTERACTIONS *
2329 **************************/
2331 r22
= _mm_mul_pd(rsq22
,rinv22
);
2333 /* Calculate table index by multiplying r with table scale and truncate to integer */
2334 rt
= _mm_mul_pd(r22
,vftabscale
);
2335 vfitab
= _mm_cvttpd_epi32(rt
);
2336 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2337 vfitab
= _mm_slli_epi32(_mm_add_epi32(vfitab
,_mm_slli_epi32(vfitab
,1)),2);
2339 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2340 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2341 F
= _mm_setzero_pd();
2342 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2343 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2344 H
= _mm_setzero_pd();
2345 GMX_MM_TRANSPOSE2_PD(G
,H
);
2346 Heps
= _mm_mul_pd(vfeps
,H
);
2347 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2348 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2349 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
2353 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2355 /* Calculate temporary vectorial force */
2356 tx
= _mm_mul_pd(fscal
,dx22
);
2357 ty
= _mm_mul_pd(fscal
,dy22
);
2358 tz
= _mm_mul_pd(fscal
,dz22
);
2360 /* Update vectorial force */
2361 fix2
= _mm_add_pd(fix2
,tx
);
2362 fiy2
= _mm_add_pd(fiy2
,ty
);
2363 fiz2
= _mm_add_pd(fiz2
,tz
);
2365 fjx2
= _mm_add_pd(fjx2
,tx
);
2366 fjy2
= _mm_add_pd(fjy2
,ty
);
2367 fjz2
= _mm_add_pd(fjz2
,tz
);
2369 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2371 /* Inner loop uses 373 flops */
2374 /* End of innermost loop */
2376 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2377 f
+i_coord_offset
,fshift
+i_shift_offset
);
2379 /* Increment number of inner iterations */
2380 inneriter
+= j_index_end
- j_index_start
;
2382 /* Outer loop uses 18 flops */
2385 /* Increment number of outer iterations */
2388 /* Update outer/inner flops */
2390 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*373);