2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/math/vec.h"
46 #include "gromacs/legacyheaders/nrnb.h"
48 #include "gromacs/simd/math_x86_sse2_double.h"
49 #include "kernelutil_x86_sse2_double.h"
52 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double
53 * Electrostatics interaction: CubicSplineTable
54 * VdW interaction: LennardJones
55 * Geometry: Water3-Water3
56 * Calculate force/pot: PotentialAndForce
59 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_double
60 (t_nblist
* gmx_restrict nlist
,
61 rvec
* gmx_restrict xx
,
62 rvec
* gmx_restrict ff
,
63 t_forcerec
* gmx_restrict fr
,
64 t_mdatoms
* gmx_restrict mdatoms
,
65 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
66 t_nrnb
* gmx_restrict nrnb
)
68 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
69 * just 0 for non-waters.
70 * Suffixes A,B refer to j loop unrolling done with SSE double precision, i.e. for the two different
71 * jnr indices corresponding to data put in the two positions in the SIMD register.
73 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
74 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
76 int j_coord_offsetA
,j_coord_offsetB
;
77 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
79 real
*shiftvec
,*fshift
,*x
,*f
;
80 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
82 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
84 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
86 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
87 int vdwjidx0A
,vdwjidx0B
;
88 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
89 int vdwjidx1A
,vdwjidx1B
;
90 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
91 int vdwjidx2A
,vdwjidx2B
;
92 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
93 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
94 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
95 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
96 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
97 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
98 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
99 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
100 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
101 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
102 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
105 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
108 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
109 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
111 __m128i ifour
= _mm_set1_epi32(4);
112 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
114 __m128d dummy_mask
,cutoff_mask
;
115 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
116 __m128d one
= _mm_set1_pd(1.0);
117 __m128d two
= _mm_set1_pd(2.0);
123 jindex
= nlist
->jindex
;
125 shiftidx
= nlist
->shift
;
127 shiftvec
= fr
->shift_vec
[0];
128 fshift
= fr
->fshift
[0];
129 facel
= _mm_set1_pd(fr
->epsfac
);
130 charge
= mdatoms
->chargeA
;
131 nvdwtype
= fr
->ntype
;
133 vdwtype
= mdatoms
->typeA
;
135 vftab
= kernel_data
->table_elec
->data
;
136 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
138 /* Setup water-specific parameters */
139 inr
= nlist
->iinr
[0];
140 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
141 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
142 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
143 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
145 jq0
= _mm_set1_pd(charge
[inr
+0]);
146 jq1
= _mm_set1_pd(charge
[inr
+1]);
147 jq2
= _mm_set1_pd(charge
[inr
+2]);
148 vdwjidx0A
= 2*vdwtype
[inr
+0];
149 qq00
= _mm_mul_pd(iq0
,jq0
);
150 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
151 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
152 qq01
= _mm_mul_pd(iq0
,jq1
);
153 qq02
= _mm_mul_pd(iq0
,jq2
);
154 qq10
= _mm_mul_pd(iq1
,jq0
);
155 qq11
= _mm_mul_pd(iq1
,jq1
);
156 qq12
= _mm_mul_pd(iq1
,jq2
);
157 qq20
= _mm_mul_pd(iq2
,jq0
);
158 qq21
= _mm_mul_pd(iq2
,jq1
);
159 qq22
= _mm_mul_pd(iq2
,jq2
);
161 /* Avoid stupid compiler warnings */
169 /* Start outer loop over neighborlists */
170 for(iidx
=0; iidx
<nri
; iidx
++)
172 /* Load shift vector for this list */
173 i_shift_offset
= DIM
*shiftidx
[iidx
];
175 /* Load limits for loop over neighbors */
176 j_index_start
= jindex
[iidx
];
177 j_index_end
= jindex
[iidx
+1];
179 /* Get outer coordinate index */
181 i_coord_offset
= DIM
*inr
;
183 /* Load i particle coords and add shift vector */
184 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
185 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
187 fix0
= _mm_setzero_pd();
188 fiy0
= _mm_setzero_pd();
189 fiz0
= _mm_setzero_pd();
190 fix1
= _mm_setzero_pd();
191 fiy1
= _mm_setzero_pd();
192 fiz1
= _mm_setzero_pd();
193 fix2
= _mm_setzero_pd();
194 fiy2
= _mm_setzero_pd();
195 fiz2
= _mm_setzero_pd();
197 /* Reset potential sums */
198 velecsum
= _mm_setzero_pd();
199 vvdwsum
= _mm_setzero_pd();
201 /* Start inner kernel loop */
202 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
205 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA
= DIM
*jnrA
;
209 j_coord_offsetB
= DIM
*jnrB
;
211 /* load j atom coordinates */
212 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
213 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
215 /* Calculate displacement vector */
216 dx00
= _mm_sub_pd(ix0
,jx0
);
217 dy00
= _mm_sub_pd(iy0
,jy0
);
218 dz00
= _mm_sub_pd(iz0
,jz0
);
219 dx01
= _mm_sub_pd(ix0
,jx1
);
220 dy01
= _mm_sub_pd(iy0
,jy1
);
221 dz01
= _mm_sub_pd(iz0
,jz1
);
222 dx02
= _mm_sub_pd(ix0
,jx2
);
223 dy02
= _mm_sub_pd(iy0
,jy2
);
224 dz02
= _mm_sub_pd(iz0
,jz2
);
225 dx10
= _mm_sub_pd(ix1
,jx0
);
226 dy10
= _mm_sub_pd(iy1
,jy0
);
227 dz10
= _mm_sub_pd(iz1
,jz0
);
228 dx11
= _mm_sub_pd(ix1
,jx1
);
229 dy11
= _mm_sub_pd(iy1
,jy1
);
230 dz11
= _mm_sub_pd(iz1
,jz1
);
231 dx12
= _mm_sub_pd(ix1
,jx2
);
232 dy12
= _mm_sub_pd(iy1
,jy2
);
233 dz12
= _mm_sub_pd(iz1
,jz2
);
234 dx20
= _mm_sub_pd(ix2
,jx0
);
235 dy20
= _mm_sub_pd(iy2
,jy0
);
236 dz20
= _mm_sub_pd(iz2
,jz0
);
237 dx21
= _mm_sub_pd(ix2
,jx1
);
238 dy21
= _mm_sub_pd(iy2
,jy1
);
239 dz21
= _mm_sub_pd(iz2
,jz1
);
240 dx22
= _mm_sub_pd(ix2
,jx2
);
241 dy22
= _mm_sub_pd(iy2
,jy2
);
242 dz22
= _mm_sub_pd(iz2
,jz2
);
244 /* Calculate squared distance and things based on it */
245 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
246 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
247 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
248 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
249 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
250 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
251 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
252 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
253 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
255 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
256 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
257 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
258 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
259 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
260 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
261 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
262 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
263 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
265 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
267 fjx0
= _mm_setzero_pd();
268 fjy0
= _mm_setzero_pd();
269 fjz0
= _mm_setzero_pd();
270 fjx1
= _mm_setzero_pd();
271 fjy1
= _mm_setzero_pd();
272 fjz1
= _mm_setzero_pd();
273 fjx2
= _mm_setzero_pd();
274 fjy2
= _mm_setzero_pd();
275 fjz2
= _mm_setzero_pd();
277 /**************************
278 * CALCULATE INTERACTIONS *
279 **************************/
281 r00
= _mm_mul_pd(rsq00
,rinv00
);
283 /* Calculate table index by multiplying r with table scale and truncate to integer */
284 rt
= _mm_mul_pd(r00
,vftabscale
);
285 vfitab
= _mm_cvttpd_epi32(rt
);
286 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
287 vfitab
= _mm_slli_epi32(vfitab
,2);
289 /* CUBIC SPLINE TABLE ELECTROSTATICS */
290 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
291 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
292 GMX_MM_TRANSPOSE2_PD(Y
,F
);
293 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
294 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
295 GMX_MM_TRANSPOSE2_PD(G
,H
);
296 Heps
= _mm_mul_pd(vfeps
,H
);
297 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
298 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
299 velec
= _mm_mul_pd(qq00
,VV
);
300 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
301 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
303 /* LENNARD-JONES DISPERSION/REPULSION */
305 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
306 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
307 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
308 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
309 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
311 /* Update potential sum for this i atom from the interaction with this j atom. */
312 velecsum
= _mm_add_pd(velecsum
,velec
);
313 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
315 fscal
= _mm_add_pd(felec
,fvdw
);
317 /* Calculate temporary vectorial force */
318 tx
= _mm_mul_pd(fscal
,dx00
);
319 ty
= _mm_mul_pd(fscal
,dy00
);
320 tz
= _mm_mul_pd(fscal
,dz00
);
322 /* Update vectorial force */
323 fix0
= _mm_add_pd(fix0
,tx
);
324 fiy0
= _mm_add_pd(fiy0
,ty
);
325 fiz0
= _mm_add_pd(fiz0
,tz
);
327 fjx0
= _mm_add_pd(fjx0
,tx
);
328 fjy0
= _mm_add_pd(fjy0
,ty
);
329 fjz0
= _mm_add_pd(fjz0
,tz
);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 r01
= _mm_mul_pd(rsq01
,rinv01
);
337 /* Calculate table index by multiplying r with table scale and truncate to integer */
338 rt
= _mm_mul_pd(r01
,vftabscale
);
339 vfitab
= _mm_cvttpd_epi32(rt
);
340 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
341 vfitab
= _mm_slli_epi32(vfitab
,2);
343 /* CUBIC SPLINE TABLE ELECTROSTATICS */
344 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
345 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
346 GMX_MM_TRANSPOSE2_PD(Y
,F
);
347 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
348 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
349 GMX_MM_TRANSPOSE2_PD(G
,H
);
350 Heps
= _mm_mul_pd(vfeps
,H
);
351 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
352 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
353 velec
= _mm_mul_pd(qq01
,VV
);
354 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
355 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velecsum
= _mm_add_pd(velecsum
,velec
);
362 /* Calculate temporary vectorial force */
363 tx
= _mm_mul_pd(fscal
,dx01
);
364 ty
= _mm_mul_pd(fscal
,dy01
);
365 tz
= _mm_mul_pd(fscal
,dz01
);
367 /* Update vectorial force */
368 fix0
= _mm_add_pd(fix0
,tx
);
369 fiy0
= _mm_add_pd(fiy0
,ty
);
370 fiz0
= _mm_add_pd(fiz0
,tz
);
372 fjx1
= _mm_add_pd(fjx1
,tx
);
373 fjy1
= _mm_add_pd(fjy1
,ty
);
374 fjz1
= _mm_add_pd(fjz1
,tz
);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r02
= _mm_mul_pd(rsq02
,rinv02
);
382 /* Calculate table index by multiplying r with table scale and truncate to integer */
383 rt
= _mm_mul_pd(r02
,vftabscale
);
384 vfitab
= _mm_cvttpd_epi32(rt
);
385 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
386 vfitab
= _mm_slli_epi32(vfitab
,2);
388 /* CUBIC SPLINE TABLE ELECTROSTATICS */
389 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
390 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
391 GMX_MM_TRANSPOSE2_PD(Y
,F
);
392 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
393 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
394 GMX_MM_TRANSPOSE2_PD(G
,H
);
395 Heps
= _mm_mul_pd(vfeps
,H
);
396 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
397 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
398 velec
= _mm_mul_pd(qq02
,VV
);
399 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
400 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
402 /* Update potential sum for this i atom from the interaction with this j atom. */
403 velecsum
= _mm_add_pd(velecsum
,velec
);
407 /* Calculate temporary vectorial force */
408 tx
= _mm_mul_pd(fscal
,dx02
);
409 ty
= _mm_mul_pd(fscal
,dy02
);
410 tz
= _mm_mul_pd(fscal
,dz02
);
412 /* Update vectorial force */
413 fix0
= _mm_add_pd(fix0
,tx
);
414 fiy0
= _mm_add_pd(fiy0
,ty
);
415 fiz0
= _mm_add_pd(fiz0
,tz
);
417 fjx2
= _mm_add_pd(fjx2
,tx
);
418 fjy2
= _mm_add_pd(fjy2
,ty
);
419 fjz2
= _mm_add_pd(fjz2
,tz
);
421 /**************************
422 * CALCULATE INTERACTIONS *
423 **************************/
425 r10
= _mm_mul_pd(rsq10
,rinv10
);
427 /* Calculate table index by multiplying r with table scale and truncate to integer */
428 rt
= _mm_mul_pd(r10
,vftabscale
);
429 vfitab
= _mm_cvttpd_epi32(rt
);
430 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
431 vfitab
= _mm_slli_epi32(vfitab
,2);
433 /* CUBIC SPLINE TABLE ELECTROSTATICS */
434 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
435 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
436 GMX_MM_TRANSPOSE2_PD(Y
,F
);
437 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
438 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
439 GMX_MM_TRANSPOSE2_PD(G
,H
);
440 Heps
= _mm_mul_pd(vfeps
,H
);
441 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
442 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
443 velec
= _mm_mul_pd(qq10
,VV
);
444 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
445 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
447 /* Update potential sum for this i atom from the interaction with this j atom. */
448 velecsum
= _mm_add_pd(velecsum
,velec
);
452 /* Calculate temporary vectorial force */
453 tx
= _mm_mul_pd(fscal
,dx10
);
454 ty
= _mm_mul_pd(fscal
,dy10
);
455 tz
= _mm_mul_pd(fscal
,dz10
);
457 /* Update vectorial force */
458 fix1
= _mm_add_pd(fix1
,tx
);
459 fiy1
= _mm_add_pd(fiy1
,ty
);
460 fiz1
= _mm_add_pd(fiz1
,tz
);
462 fjx0
= _mm_add_pd(fjx0
,tx
);
463 fjy0
= _mm_add_pd(fjy0
,ty
);
464 fjz0
= _mm_add_pd(fjz0
,tz
);
466 /**************************
467 * CALCULATE INTERACTIONS *
468 **************************/
470 r11
= _mm_mul_pd(rsq11
,rinv11
);
472 /* Calculate table index by multiplying r with table scale and truncate to integer */
473 rt
= _mm_mul_pd(r11
,vftabscale
);
474 vfitab
= _mm_cvttpd_epi32(rt
);
475 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
476 vfitab
= _mm_slli_epi32(vfitab
,2);
478 /* CUBIC SPLINE TABLE ELECTROSTATICS */
479 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
480 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
481 GMX_MM_TRANSPOSE2_PD(Y
,F
);
482 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
483 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
484 GMX_MM_TRANSPOSE2_PD(G
,H
);
485 Heps
= _mm_mul_pd(vfeps
,H
);
486 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
487 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
488 velec
= _mm_mul_pd(qq11
,VV
);
489 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
490 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
492 /* Update potential sum for this i atom from the interaction with this j atom. */
493 velecsum
= _mm_add_pd(velecsum
,velec
);
497 /* Calculate temporary vectorial force */
498 tx
= _mm_mul_pd(fscal
,dx11
);
499 ty
= _mm_mul_pd(fscal
,dy11
);
500 tz
= _mm_mul_pd(fscal
,dz11
);
502 /* Update vectorial force */
503 fix1
= _mm_add_pd(fix1
,tx
);
504 fiy1
= _mm_add_pd(fiy1
,ty
);
505 fiz1
= _mm_add_pd(fiz1
,tz
);
507 fjx1
= _mm_add_pd(fjx1
,tx
);
508 fjy1
= _mm_add_pd(fjy1
,ty
);
509 fjz1
= _mm_add_pd(fjz1
,tz
);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 r12
= _mm_mul_pd(rsq12
,rinv12
);
517 /* Calculate table index by multiplying r with table scale and truncate to integer */
518 rt
= _mm_mul_pd(r12
,vftabscale
);
519 vfitab
= _mm_cvttpd_epi32(rt
);
520 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
521 vfitab
= _mm_slli_epi32(vfitab
,2);
523 /* CUBIC SPLINE TABLE ELECTROSTATICS */
524 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
525 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
526 GMX_MM_TRANSPOSE2_PD(Y
,F
);
527 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
528 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
529 GMX_MM_TRANSPOSE2_PD(G
,H
);
530 Heps
= _mm_mul_pd(vfeps
,H
);
531 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
532 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
533 velec
= _mm_mul_pd(qq12
,VV
);
534 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
535 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velecsum
= _mm_add_pd(velecsum
,velec
);
542 /* Calculate temporary vectorial force */
543 tx
= _mm_mul_pd(fscal
,dx12
);
544 ty
= _mm_mul_pd(fscal
,dy12
);
545 tz
= _mm_mul_pd(fscal
,dz12
);
547 /* Update vectorial force */
548 fix1
= _mm_add_pd(fix1
,tx
);
549 fiy1
= _mm_add_pd(fiy1
,ty
);
550 fiz1
= _mm_add_pd(fiz1
,tz
);
552 fjx2
= _mm_add_pd(fjx2
,tx
);
553 fjy2
= _mm_add_pd(fjy2
,ty
);
554 fjz2
= _mm_add_pd(fjz2
,tz
);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 r20
= _mm_mul_pd(rsq20
,rinv20
);
562 /* Calculate table index by multiplying r with table scale and truncate to integer */
563 rt
= _mm_mul_pd(r20
,vftabscale
);
564 vfitab
= _mm_cvttpd_epi32(rt
);
565 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
566 vfitab
= _mm_slli_epi32(vfitab
,2);
568 /* CUBIC SPLINE TABLE ELECTROSTATICS */
569 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
570 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
571 GMX_MM_TRANSPOSE2_PD(Y
,F
);
572 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
573 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
574 GMX_MM_TRANSPOSE2_PD(G
,H
);
575 Heps
= _mm_mul_pd(vfeps
,H
);
576 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
577 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
578 velec
= _mm_mul_pd(qq20
,VV
);
579 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
580 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
582 /* Update potential sum for this i atom from the interaction with this j atom. */
583 velecsum
= _mm_add_pd(velecsum
,velec
);
587 /* Calculate temporary vectorial force */
588 tx
= _mm_mul_pd(fscal
,dx20
);
589 ty
= _mm_mul_pd(fscal
,dy20
);
590 tz
= _mm_mul_pd(fscal
,dz20
);
592 /* Update vectorial force */
593 fix2
= _mm_add_pd(fix2
,tx
);
594 fiy2
= _mm_add_pd(fiy2
,ty
);
595 fiz2
= _mm_add_pd(fiz2
,tz
);
597 fjx0
= _mm_add_pd(fjx0
,tx
);
598 fjy0
= _mm_add_pd(fjy0
,ty
);
599 fjz0
= _mm_add_pd(fjz0
,tz
);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
605 r21
= _mm_mul_pd(rsq21
,rinv21
);
607 /* Calculate table index by multiplying r with table scale and truncate to integer */
608 rt
= _mm_mul_pd(r21
,vftabscale
);
609 vfitab
= _mm_cvttpd_epi32(rt
);
610 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
611 vfitab
= _mm_slli_epi32(vfitab
,2);
613 /* CUBIC SPLINE TABLE ELECTROSTATICS */
614 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
615 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
616 GMX_MM_TRANSPOSE2_PD(Y
,F
);
617 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
618 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
619 GMX_MM_TRANSPOSE2_PD(G
,H
);
620 Heps
= _mm_mul_pd(vfeps
,H
);
621 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
622 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
623 velec
= _mm_mul_pd(qq21
,VV
);
624 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
625 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
627 /* Update potential sum for this i atom from the interaction with this j atom. */
628 velecsum
= _mm_add_pd(velecsum
,velec
);
632 /* Calculate temporary vectorial force */
633 tx
= _mm_mul_pd(fscal
,dx21
);
634 ty
= _mm_mul_pd(fscal
,dy21
);
635 tz
= _mm_mul_pd(fscal
,dz21
);
637 /* Update vectorial force */
638 fix2
= _mm_add_pd(fix2
,tx
);
639 fiy2
= _mm_add_pd(fiy2
,ty
);
640 fiz2
= _mm_add_pd(fiz2
,tz
);
642 fjx1
= _mm_add_pd(fjx1
,tx
);
643 fjy1
= _mm_add_pd(fjy1
,ty
);
644 fjz1
= _mm_add_pd(fjz1
,tz
);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 r22
= _mm_mul_pd(rsq22
,rinv22
);
652 /* Calculate table index by multiplying r with table scale and truncate to integer */
653 rt
= _mm_mul_pd(r22
,vftabscale
);
654 vfitab
= _mm_cvttpd_epi32(rt
);
655 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
656 vfitab
= _mm_slli_epi32(vfitab
,2);
658 /* CUBIC SPLINE TABLE ELECTROSTATICS */
659 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
660 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
661 GMX_MM_TRANSPOSE2_PD(Y
,F
);
662 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
663 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
664 GMX_MM_TRANSPOSE2_PD(G
,H
);
665 Heps
= _mm_mul_pd(vfeps
,H
);
666 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
667 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
668 velec
= _mm_mul_pd(qq22
,VV
);
669 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
670 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velecsum
= _mm_add_pd(velecsum
,velec
);
677 /* Calculate temporary vectorial force */
678 tx
= _mm_mul_pd(fscal
,dx22
);
679 ty
= _mm_mul_pd(fscal
,dy22
);
680 tz
= _mm_mul_pd(fscal
,dz22
);
682 /* Update vectorial force */
683 fix2
= _mm_add_pd(fix2
,tx
);
684 fiy2
= _mm_add_pd(fiy2
,ty
);
685 fiz2
= _mm_add_pd(fiz2
,tz
);
687 fjx2
= _mm_add_pd(fjx2
,tx
);
688 fjy2
= _mm_add_pd(fjy2
,ty
);
689 fjz2
= _mm_add_pd(fjz2
,tz
);
691 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
693 /* Inner loop uses 400 flops */
700 j_coord_offsetA
= DIM
*jnrA
;
702 /* load j atom coordinates */
703 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
704 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
706 /* Calculate displacement vector */
707 dx00
= _mm_sub_pd(ix0
,jx0
);
708 dy00
= _mm_sub_pd(iy0
,jy0
);
709 dz00
= _mm_sub_pd(iz0
,jz0
);
710 dx01
= _mm_sub_pd(ix0
,jx1
);
711 dy01
= _mm_sub_pd(iy0
,jy1
);
712 dz01
= _mm_sub_pd(iz0
,jz1
);
713 dx02
= _mm_sub_pd(ix0
,jx2
);
714 dy02
= _mm_sub_pd(iy0
,jy2
);
715 dz02
= _mm_sub_pd(iz0
,jz2
);
716 dx10
= _mm_sub_pd(ix1
,jx0
);
717 dy10
= _mm_sub_pd(iy1
,jy0
);
718 dz10
= _mm_sub_pd(iz1
,jz0
);
719 dx11
= _mm_sub_pd(ix1
,jx1
);
720 dy11
= _mm_sub_pd(iy1
,jy1
);
721 dz11
= _mm_sub_pd(iz1
,jz1
);
722 dx12
= _mm_sub_pd(ix1
,jx2
);
723 dy12
= _mm_sub_pd(iy1
,jy2
);
724 dz12
= _mm_sub_pd(iz1
,jz2
);
725 dx20
= _mm_sub_pd(ix2
,jx0
);
726 dy20
= _mm_sub_pd(iy2
,jy0
);
727 dz20
= _mm_sub_pd(iz2
,jz0
);
728 dx21
= _mm_sub_pd(ix2
,jx1
);
729 dy21
= _mm_sub_pd(iy2
,jy1
);
730 dz21
= _mm_sub_pd(iz2
,jz1
);
731 dx22
= _mm_sub_pd(ix2
,jx2
);
732 dy22
= _mm_sub_pd(iy2
,jy2
);
733 dz22
= _mm_sub_pd(iz2
,jz2
);
735 /* Calculate squared distance and things based on it */
736 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
737 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
738 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
739 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
740 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
741 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
742 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
743 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
744 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
746 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
747 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
748 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
749 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
750 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
751 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
752 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
753 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
754 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
756 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
758 fjx0
= _mm_setzero_pd();
759 fjy0
= _mm_setzero_pd();
760 fjz0
= _mm_setzero_pd();
761 fjx1
= _mm_setzero_pd();
762 fjy1
= _mm_setzero_pd();
763 fjz1
= _mm_setzero_pd();
764 fjx2
= _mm_setzero_pd();
765 fjy2
= _mm_setzero_pd();
766 fjz2
= _mm_setzero_pd();
768 /**************************
769 * CALCULATE INTERACTIONS *
770 **************************/
772 r00
= _mm_mul_pd(rsq00
,rinv00
);
774 /* Calculate table index by multiplying r with table scale and truncate to integer */
775 rt
= _mm_mul_pd(r00
,vftabscale
);
776 vfitab
= _mm_cvttpd_epi32(rt
);
777 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
778 vfitab
= _mm_slli_epi32(vfitab
,2);
780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
781 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
782 F
= _mm_setzero_pd();
783 GMX_MM_TRANSPOSE2_PD(Y
,F
);
784 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
785 H
= _mm_setzero_pd();
786 GMX_MM_TRANSPOSE2_PD(G
,H
);
787 Heps
= _mm_mul_pd(vfeps
,H
);
788 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
789 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
790 velec
= _mm_mul_pd(qq00
,VV
);
791 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
792 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
794 /* LENNARD-JONES DISPERSION/REPULSION */
796 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
797 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
798 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
799 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
800 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
802 /* Update potential sum for this i atom from the interaction with this j atom. */
803 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
804 velecsum
= _mm_add_pd(velecsum
,velec
);
805 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
806 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
808 fscal
= _mm_add_pd(felec
,fvdw
);
810 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
812 /* Calculate temporary vectorial force */
813 tx
= _mm_mul_pd(fscal
,dx00
);
814 ty
= _mm_mul_pd(fscal
,dy00
);
815 tz
= _mm_mul_pd(fscal
,dz00
);
817 /* Update vectorial force */
818 fix0
= _mm_add_pd(fix0
,tx
);
819 fiy0
= _mm_add_pd(fiy0
,ty
);
820 fiz0
= _mm_add_pd(fiz0
,tz
);
822 fjx0
= _mm_add_pd(fjx0
,tx
);
823 fjy0
= _mm_add_pd(fjy0
,ty
);
824 fjz0
= _mm_add_pd(fjz0
,tz
);
826 /**************************
827 * CALCULATE INTERACTIONS *
828 **************************/
830 r01
= _mm_mul_pd(rsq01
,rinv01
);
832 /* Calculate table index by multiplying r with table scale and truncate to integer */
833 rt
= _mm_mul_pd(r01
,vftabscale
);
834 vfitab
= _mm_cvttpd_epi32(rt
);
835 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
836 vfitab
= _mm_slli_epi32(vfitab
,2);
838 /* CUBIC SPLINE TABLE ELECTROSTATICS */
839 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
840 F
= _mm_setzero_pd();
841 GMX_MM_TRANSPOSE2_PD(Y
,F
);
842 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
843 H
= _mm_setzero_pd();
844 GMX_MM_TRANSPOSE2_PD(G
,H
);
845 Heps
= _mm_mul_pd(vfeps
,H
);
846 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
847 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
848 velec
= _mm_mul_pd(qq01
,VV
);
849 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
850 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
852 /* Update potential sum for this i atom from the interaction with this j atom. */
853 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
854 velecsum
= _mm_add_pd(velecsum
,velec
);
858 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
860 /* Calculate temporary vectorial force */
861 tx
= _mm_mul_pd(fscal
,dx01
);
862 ty
= _mm_mul_pd(fscal
,dy01
);
863 tz
= _mm_mul_pd(fscal
,dz01
);
865 /* Update vectorial force */
866 fix0
= _mm_add_pd(fix0
,tx
);
867 fiy0
= _mm_add_pd(fiy0
,ty
);
868 fiz0
= _mm_add_pd(fiz0
,tz
);
870 fjx1
= _mm_add_pd(fjx1
,tx
);
871 fjy1
= _mm_add_pd(fjy1
,ty
);
872 fjz1
= _mm_add_pd(fjz1
,tz
);
874 /**************************
875 * CALCULATE INTERACTIONS *
876 **************************/
878 r02
= _mm_mul_pd(rsq02
,rinv02
);
880 /* Calculate table index by multiplying r with table scale and truncate to integer */
881 rt
= _mm_mul_pd(r02
,vftabscale
);
882 vfitab
= _mm_cvttpd_epi32(rt
);
883 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
884 vfitab
= _mm_slli_epi32(vfitab
,2);
886 /* CUBIC SPLINE TABLE ELECTROSTATICS */
887 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
888 F
= _mm_setzero_pd();
889 GMX_MM_TRANSPOSE2_PD(Y
,F
);
890 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
891 H
= _mm_setzero_pd();
892 GMX_MM_TRANSPOSE2_PD(G
,H
);
893 Heps
= _mm_mul_pd(vfeps
,H
);
894 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
895 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
896 velec
= _mm_mul_pd(qq02
,VV
);
897 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
898 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
900 /* Update potential sum for this i atom from the interaction with this j atom. */
901 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
902 velecsum
= _mm_add_pd(velecsum
,velec
);
906 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
908 /* Calculate temporary vectorial force */
909 tx
= _mm_mul_pd(fscal
,dx02
);
910 ty
= _mm_mul_pd(fscal
,dy02
);
911 tz
= _mm_mul_pd(fscal
,dz02
);
913 /* Update vectorial force */
914 fix0
= _mm_add_pd(fix0
,tx
);
915 fiy0
= _mm_add_pd(fiy0
,ty
);
916 fiz0
= _mm_add_pd(fiz0
,tz
);
918 fjx2
= _mm_add_pd(fjx2
,tx
);
919 fjy2
= _mm_add_pd(fjy2
,ty
);
920 fjz2
= _mm_add_pd(fjz2
,tz
);
922 /**************************
923 * CALCULATE INTERACTIONS *
924 **************************/
926 r10
= _mm_mul_pd(rsq10
,rinv10
);
928 /* Calculate table index by multiplying r with table scale and truncate to integer */
929 rt
= _mm_mul_pd(r10
,vftabscale
);
930 vfitab
= _mm_cvttpd_epi32(rt
);
931 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
932 vfitab
= _mm_slli_epi32(vfitab
,2);
934 /* CUBIC SPLINE TABLE ELECTROSTATICS */
935 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
936 F
= _mm_setzero_pd();
937 GMX_MM_TRANSPOSE2_PD(Y
,F
);
938 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
939 H
= _mm_setzero_pd();
940 GMX_MM_TRANSPOSE2_PD(G
,H
);
941 Heps
= _mm_mul_pd(vfeps
,H
);
942 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
943 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
944 velec
= _mm_mul_pd(qq10
,VV
);
945 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
946 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
950 velecsum
= _mm_add_pd(velecsum
,velec
);
954 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
956 /* Calculate temporary vectorial force */
957 tx
= _mm_mul_pd(fscal
,dx10
);
958 ty
= _mm_mul_pd(fscal
,dy10
);
959 tz
= _mm_mul_pd(fscal
,dz10
);
961 /* Update vectorial force */
962 fix1
= _mm_add_pd(fix1
,tx
);
963 fiy1
= _mm_add_pd(fiy1
,ty
);
964 fiz1
= _mm_add_pd(fiz1
,tz
);
966 fjx0
= _mm_add_pd(fjx0
,tx
);
967 fjy0
= _mm_add_pd(fjy0
,ty
);
968 fjz0
= _mm_add_pd(fjz0
,tz
);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 r11
= _mm_mul_pd(rsq11
,rinv11
);
976 /* Calculate table index by multiplying r with table scale and truncate to integer */
977 rt
= _mm_mul_pd(r11
,vftabscale
);
978 vfitab
= _mm_cvttpd_epi32(rt
);
979 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
980 vfitab
= _mm_slli_epi32(vfitab
,2);
982 /* CUBIC SPLINE TABLE ELECTROSTATICS */
983 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
984 F
= _mm_setzero_pd();
985 GMX_MM_TRANSPOSE2_PD(Y
,F
);
986 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
987 H
= _mm_setzero_pd();
988 GMX_MM_TRANSPOSE2_PD(G
,H
);
989 Heps
= _mm_mul_pd(vfeps
,H
);
990 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
991 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
992 velec
= _mm_mul_pd(qq11
,VV
);
993 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
994 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
996 /* Update potential sum for this i atom from the interaction with this j atom. */
997 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
998 velecsum
= _mm_add_pd(velecsum
,velec
);
1002 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1004 /* Calculate temporary vectorial force */
1005 tx
= _mm_mul_pd(fscal
,dx11
);
1006 ty
= _mm_mul_pd(fscal
,dy11
);
1007 tz
= _mm_mul_pd(fscal
,dz11
);
1009 /* Update vectorial force */
1010 fix1
= _mm_add_pd(fix1
,tx
);
1011 fiy1
= _mm_add_pd(fiy1
,ty
);
1012 fiz1
= _mm_add_pd(fiz1
,tz
);
1014 fjx1
= _mm_add_pd(fjx1
,tx
);
1015 fjy1
= _mm_add_pd(fjy1
,ty
);
1016 fjz1
= _mm_add_pd(fjz1
,tz
);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 r12
= _mm_mul_pd(rsq12
,rinv12
);
1024 /* Calculate table index by multiplying r with table scale and truncate to integer */
1025 rt
= _mm_mul_pd(r12
,vftabscale
);
1026 vfitab
= _mm_cvttpd_epi32(rt
);
1027 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1028 vfitab
= _mm_slli_epi32(vfitab
,2);
1030 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1031 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1032 F
= _mm_setzero_pd();
1033 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1034 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1035 H
= _mm_setzero_pd();
1036 GMX_MM_TRANSPOSE2_PD(G
,H
);
1037 Heps
= _mm_mul_pd(vfeps
,H
);
1038 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1039 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1040 velec
= _mm_mul_pd(qq12
,VV
);
1041 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1042 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1044 /* Update potential sum for this i atom from the interaction with this j atom. */
1045 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1046 velecsum
= _mm_add_pd(velecsum
,velec
);
1050 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1052 /* Calculate temporary vectorial force */
1053 tx
= _mm_mul_pd(fscal
,dx12
);
1054 ty
= _mm_mul_pd(fscal
,dy12
);
1055 tz
= _mm_mul_pd(fscal
,dz12
);
1057 /* Update vectorial force */
1058 fix1
= _mm_add_pd(fix1
,tx
);
1059 fiy1
= _mm_add_pd(fiy1
,ty
);
1060 fiz1
= _mm_add_pd(fiz1
,tz
);
1062 fjx2
= _mm_add_pd(fjx2
,tx
);
1063 fjy2
= _mm_add_pd(fjy2
,ty
);
1064 fjz2
= _mm_add_pd(fjz2
,tz
);
1066 /**************************
1067 * CALCULATE INTERACTIONS *
1068 **************************/
1070 r20
= _mm_mul_pd(rsq20
,rinv20
);
1072 /* Calculate table index by multiplying r with table scale and truncate to integer */
1073 rt
= _mm_mul_pd(r20
,vftabscale
);
1074 vfitab
= _mm_cvttpd_epi32(rt
);
1075 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1076 vfitab
= _mm_slli_epi32(vfitab
,2);
1078 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1079 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1080 F
= _mm_setzero_pd();
1081 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1082 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1083 H
= _mm_setzero_pd();
1084 GMX_MM_TRANSPOSE2_PD(G
,H
);
1085 Heps
= _mm_mul_pd(vfeps
,H
);
1086 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1087 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1088 velec
= _mm_mul_pd(qq20
,VV
);
1089 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1090 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1092 /* Update potential sum for this i atom from the interaction with this j atom. */
1093 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1094 velecsum
= _mm_add_pd(velecsum
,velec
);
1098 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1100 /* Calculate temporary vectorial force */
1101 tx
= _mm_mul_pd(fscal
,dx20
);
1102 ty
= _mm_mul_pd(fscal
,dy20
);
1103 tz
= _mm_mul_pd(fscal
,dz20
);
1105 /* Update vectorial force */
1106 fix2
= _mm_add_pd(fix2
,tx
);
1107 fiy2
= _mm_add_pd(fiy2
,ty
);
1108 fiz2
= _mm_add_pd(fiz2
,tz
);
1110 fjx0
= _mm_add_pd(fjx0
,tx
);
1111 fjy0
= _mm_add_pd(fjy0
,ty
);
1112 fjz0
= _mm_add_pd(fjz0
,tz
);
1114 /**************************
1115 * CALCULATE INTERACTIONS *
1116 **************************/
1118 r21
= _mm_mul_pd(rsq21
,rinv21
);
1120 /* Calculate table index by multiplying r with table scale and truncate to integer */
1121 rt
= _mm_mul_pd(r21
,vftabscale
);
1122 vfitab
= _mm_cvttpd_epi32(rt
);
1123 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1124 vfitab
= _mm_slli_epi32(vfitab
,2);
1126 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1127 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1128 F
= _mm_setzero_pd();
1129 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1130 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1131 H
= _mm_setzero_pd();
1132 GMX_MM_TRANSPOSE2_PD(G
,H
);
1133 Heps
= _mm_mul_pd(vfeps
,H
);
1134 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1135 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1136 velec
= _mm_mul_pd(qq21
,VV
);
1137 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1138 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1142 velecsum
= _mm_add_pd(velecsum
,velec
);
1146 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1148 /* Calculate temporary vectorial force */
1149 tx
= _mm_mul_pd(fscal
,dx21
);
1150 ty
= _mm_mul_pd(fscal
,dy21
);
1151 tz
= _mm_mul_pd(fscal
,dz21
);
1153 /* Update vectorial force */
1154 fix2
= _mm_add_pd(fix2
,tx
);
1155 fiy2
= _mm_add_pd(fiy2
,ty
);
1156 fiz2
= _mm_add_pd(fiz2
,tz
);
1158 fjx1
= _mm_add_pd(fjx1
,tx
);
1159 fjy1
= _mm_add_pd(fjy1
,ty
);
1160 fjz1
= _mm_add_pd(fjz1
,tz
);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 r22
= _mm_mul_pd(rsq22
,rinv22
);
1168 /* Calculate table index by multiplying r with table scale and truncate to integer */
1169 rt
= _mm_mul_pd(r22
,vftabscale
);
1170 vfitab
= _mm_cvttpd_epi32(rt
);
1171 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1172 vfitab
= _mm_slli_epi32(vfitab
,2);
1174 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1175 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1176 F
= _mm_setzero_pd();
1177 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1178 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1179 H
= _mm_setzero_pd();
1180 GMX_MM_TRANSPOSE2_PD(G
,H
);
1181 Heps
= _mm_mul_pd(vfeps
,H
);
1182 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1183 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1184 velec
= _mm_mul_pd(qq22
,VV
);
1185 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1186 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1190 velecsum
= _mm_add_pd(velecsum
,velec
);
1194 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1196 /* Calculate temporary vectorial force */
1197 tx
= _mm_mul_pd(fscal
,dx22
);
1198 ty
= _mm_mul_pd(fscal
,dy22
);
1199 tz
= _mm_mul_pd(fscal
,dz22
);
1201 /* Update vectorial force */
1202 fix2
= _mm_add_pd(fix2
,tx
);
1203 fiy2
= _mm_add_pd(fiy2
,ty
);
1204 fiz2
= _mm_add_pd(fiz2
,tz
);
1206 fjx2
= _mm_add_pd(fjx2
,tx
);
1207 fjy2
= _mm_add_pd(fjy2
,ty
);
1208 fjz2
= _mm_add_pd(fjz2
,tz
);
1210 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1212 /* Inner loop uses 400 flops */
1215 /* End of innermost loop */
1217 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1218 f
+i_coord_offset
,fshift
+i_shift_offset
);
1221 /* Update potential energies */
1222 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1223 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1225 /* Increment number of inner iterations */
1226 inneriter
+= j_index_end
- j_index_start
;
1228 /* Outer loop uses 20 flops */
1231 /* Increment number of outer iterations */
1234 /* Update outer/inner flops */
1236 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*400);
1239 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double
1240 * Electrostatics interaction: CubicSplineTable
1241 * VdW interaction: LennardJones
1242 * Geometry: Water3-Water3
1243 * Calculate force/pot: Force
1246 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_double
1247 (t_nblist
* gmx_restrict nlist
,
1248 rvec
* gmx_restrict xx
,
1249 rvec
* gmx_restrict ff
,
1250 t_forcerec
* gmx_restrict fr
,
1251 t_mdatoms
* gmx_restrict mdatoms
,
1252 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1253 t_nrnb
* gmx_restrict nrnb
)
1255 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1256 * just 0 for non-waters.
1257 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1258 * jnr indices corresponding to data put in the two positions in the SIMD register.
1260 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1261 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1263 int j_coord_offsetA
,j_coord_offsetB
;
1264 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1265 real rcutoff_scalar
;
1266 real
*shiftvec
,*fshift
,*x
,*f
;
1267 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1269 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1271 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1273 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1274 int vdwjidx0A
,vdwjidx0B
;
1275 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1276 int vdwjidx1A
,vdwjidx1B
;
1277 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1278 int vdwjidx2A
,vdwjidx2B
;
1279 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1280 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1281 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1282 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1283 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1284 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1285 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1286 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1287 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1288 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1289 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1292 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1295 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1296 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1298 __m128i ifour
= _mm_set1_epi32(4);
1299 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1301 __m128d dummy_mask
,cutoff_mask
;
1302 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1303 __m128d one
= _mm_set1_pd(1.0);
1304 __m128d two
= _mm_set1_pd(2.0);
1310 jindex
= nlist
->jindex
;
1312 shiftidx
= nlist
->shift
;
1314 shiftvec
= fr
->shift_vec
[0];
1315 fshift
= fr
->fshift
[0];
1316 facel
= _mm_set1_pd(fr
->epsfac
);
1317 charge
= mdatoms
->chargeA
;
1318 nvdwtype
= fr
->ntype
;
1319 vdwparam
= fr
->nbfp
;
1320 vdwtype
= mdatoms
->typeA
;
1322 vftab
= kernel_data
->table_elec
->data
;
1323 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
1325 /* Setup water-specific parameters */
1326 inr
= nlist
->iinr
[0];
1327 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1328 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1329 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1330 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1332 jq0
= _mm_set1_pd(charge
[inr
+0]);
1333 jq1
= _mm_set1_pd(charge
[inr
+1]);
1334 jq2
= _mm_set1_pd(charge
[inr
+2]);
1335 vdwjidx0A
= 2*vdwtype
[inr
+0];
1336 qq00
= _mm_mul_pd(iq0
,jq0
);
1337 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1338 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1339 qq01
= _mm_mul_pd(iq0
,jq1
);
1340 qq02
= _mm_mul_pd(iq0
,jq2
);
1341 qq10
= _mm_mul_pd(iq1
,jq0
);
1342 qq11
= _mm_mul_pd(iq1
,jq1
);
1343 qq12
= _mm_mul_pd(iq1
,jq2
);
1344 qq20
= _mm_mul_pd(iq2
,jq0
);
1345 qq21
= _mm_mul_pd(iq2
,jq1
);
1346 qq22
= _mm_mul_pd(iq2
,jq2
);
1348 /* Avoid stupid compiler warnings */
1350 j_coord_offsetA
= 0;
1351 j_coord_offsetB
= 0;
1356 /* Start outer loop over neighborlists */
1357 for(iidx
=0; iidx
<nri
; iidx
++)
1359 /* Load shift vector for this list */
1360 i_shift_offset
= DIM
*shiftidx
[iidx
];
1362 /* Load limits for loop over neighbors */
1363 j_index_start
= jindex
[iidx
];
1364 j_index_end
= jindex
[iidx
+1];
1366 /* Get outer coordinate index */
1368 i_coord_offset
= DIM
*inr
;
1370 /* Load i particle coords and add shift vector */
1371 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1372 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1374 fix0
= _mm_setzero_pd();
1375 fiy0
= _mm_setzero_pd();
1376 fiz0
= _mm_setzero_pd();
1377 fix1
= _mm_setzero_pd();
1378 fiy1
= _mm_setzero_pd();
1379 fiz1
= _mm_setzero_pd();
1380 fix2
= _mm_setzero_pd();
1381 fiy2
= _mm_setzero_pd();
1382 fiz2
= _mm_setzero_pd();
1384 /* Start inner kernel loop */
1385 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1388 /* Get j neighbor index, and coordinate index */
1390 jnrB
= jjnr
[jidx
+1];
1391 j_coord_offsetA
= DIM
*jnrA
;
1392 j_coord_offsetB
= DIM
*jnrB
;
1394 /* load j atom coordinates */
1395 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1396 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1398 /* Calculate displacement vector */
1399 dx00
= _mm_sub_pd(ix0
,jx0
);
1400 dy00
= _mm_sub_pd(iy0
,jy0
);
1401 dz00
= _mm_sub_pd(iz0
,jz0
);
1402 dx01
= _mm_sub_pd(ix0
,jx1
);
1403 dy01
= _mm_sub_pd(iy0
,jy1
);
1404 dz01
= _mm_sub_pd(iz0
,jz1
);
1405 dx02
= _mm_sub_pd(ix0
,jx2
);
1406 dy02
= _mm_sub_pd(iy0
,jy2
);
1407 dz02
= _mm_sub_pd(iz0
,jz2
);
1408 dx10
= _mm_sub_pd(ix1
,jx0
);
1409 dy10
= _mm_sub_pd(iy1
,jy0
);
1410 dz10
= _mm_sub_pd(iz1
,jz0
);
1411 dx11
= _mm_sub_pd(ix1
,jx1
);
1412 dy11
= _mm_sub_pd(iy1
,jy1
);
1413 dz11
= _mm_sub_pd(iz1
,jz1
);
1414 dx12
= _mm_sub_pd(ix1
,jx2
);
1415 dy12
= _mm_sub_pd(iy1
,jy2
);
1416 dz12
= _mm_sub_pd(iz1
,jz2
);
1417 dx20
= _mm_sub_pd(ix2
,jx0
);
1418 dy20
= _mm_sub_pd(iy2
,jy0
);
1419 dz20
= _mm_sub_pd(iz2
,jz0
);
1420 dx21
= _mm_sub_pd(ix2
,jx1
);
1421 dy21
= _mm_sub_pd(iy2
,jy1
);
1422 dz21
= _mm_sub_pd(iz2
,jz1
);
1423 dx22
= _mm_sub_pd(ix2
,jx2
);
1424 dy22
= _mm_sub_pd(iy2
,jy2
);
1425 dz22
= _mm_sub_pd(iz2
,jz2
);
1427 /* Calculate squared distance and things based on it */
1428 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1429 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1430 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1431 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1432 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1433 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1434 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1435 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1436 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1438 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1439 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1440 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1441 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1442 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1443 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1444 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1445 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1446 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1448 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1450 fjx0
= _mm_setzero_pd();
1451 fjy0
= _mm_setzero_pd();
1452 fjz0
= _mm_setzero_pd();
1453 fjx1
= _mm_setzero_pd();
1454 fjy1
= _mm_setzero_pd();
1455 fjz1
= _mm_setzero_pd();
1456 fjx2
= _mm_setzero_pd();
1457 fjy2
= _mm_setzero_pd();
1458 fjz2
= _mm_setzero_pd();
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 r00
= _mm_mul_pd(rsq00
,rinv00
);
1466 /* Calculate table index by multiplying r with table scale and truncate to integer */
1467 rt
= _mm_mul_pd(r00
,vftabscale
);
1468 vfitab
= _mm_cvttpd_epi32(rt
);
1469 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1470 vfitab
= _mm_slli_epi32(vfitab
,2);
1472 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1473 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1474 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1475 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1476 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1477 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1478 GMX_MM_TRANSPOSE2_PD(G
,H
);
1479 Heps
= _mm_mul_pd(vfeps
,H
);
1480 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1481 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1482 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1484 /* LENNARD-JONES DISPERSION/REPULSION */
1486 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1487 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1489 fscal
= _mm_add_pd(felec
,fvdw
);
1491 /* Calculate temporary vectorial force */
1492 tx
= _mm_mul_pd(fscal
,dx00
);
1493 ty
= _mm_mul_pd(fscal
,dy00
);
1494 tz
= _mm_mul_pd(fscal
,dz00
);
1496 /* Update vectorial force */
1497 fix0
= _mm_add_pd(fix0
,tx
);
1498 fiy0
= _mm_add_pd(fiy0
,ty
);
1499 fiz0
= _mm_add_pd(fiz0
,tz
);
1501 fjx0
= _mm_add_pd(fjx0
,tx
);
1502 fjy0
= _mm_add_pd(fjy0
,ty
);
1503 fjz0
= _mm_add_pd(fjz0
,tz
);
1505 /**************************
1506 * CALCULATE INTERACTIONS *
1507 **************************/
1509 r01
= _mm_mul_pd(rsq01
,rinv01
);
1511 /* Calculate table index by multiplying r with table scale and truncate to integer */
1512 rt
= _mm_mul_pd(r01
,vftabscale
);
1513 vfitab
= _mm_cvttpd_epi32(rt
);
1514 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1515 vfitab
= _mm_slli_epi32(vfitab
,2);
1517 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1518 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1519 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1520 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1521 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1522 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1523 GMX_MM_TRANSPOSE2_PD(G
,H
);
1524 Heps
= _mm_mul_pd(vfeps
,H
);
1525 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1526 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1527 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
1531 /* Calculate temporary vectorial force */
1532 tx
= _mm_mul_pd(fscal
,dx01
);
1533 ty
= _mm_mul_pd(fscal
,dy01
);
1534 tz
= _mm_mul_pd(fscal
,dz01
);
1536 /* Update vectorial force */
1537 fix0
= _mm_add_pd(fix0
,tx
);
1538 fiy0
= _mm_add_pd(fiy0
,ty
);
1539 fiz0
= _mm_add_pd(fiz0
,tz
);
1541 fjx1
= _mm_add_pd(fjx1
,tx
);
1542 fjy1
= _mm_add_pd(fjy1
,ty
);
1543 fjz1
= _mm_add_pd(fjz1
,tz
);
1545 /**************************
1546 * CALCULATE INTERACTIONS *
1547 **************************/
1549 r02
= _mm_mul_pd(rsq02
,rinv02
);
1551 /* Calculate table index by multiplying r with table scale and truncate to integer */
1552 rt
= _mm_mul_pd(r02
,vftabscale
);
1553 vfitab
= _mm_cvttpd_epi32(rt
);
1554 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1555 vfitab
= _mm_slli_epi32(vfitab
,2);
1557 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1558 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1559 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1560 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1561 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1562 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1563 GMX_MM_TRANSPOSE2_PD(G
,H
);
1564 Heps
= _mm_mul_pd(vfeps
,H
);
1565 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1566 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1567 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
1571 /* Calculate temporary vectorial force */
1572 tx
= _mm_mul_pd(fscal
,dx02
);
1573 ty
= _mm_mul_pd(fscal
,dy02
);
1574 tz
= _mm_mul_pd(fscal
,dz02
);
1576 /* Update vectorial force */
1577 fix0
= _mm_add_pd(fix0
,tx
);
1578 fiy0
= _mm_add_pd(fiy0
,ty
);
1579 fiz0
= _mm_add_pd(fiz0
,tz
);
1581 fjx2
= _mm_add_pd(fjx2
,tx
);
1582 fjy2
= _mm_add_pd(fjy2
,ty
);
1583 fjz2
= _mm_add_pd(fjz2
,tz
);
1585 /**************************
1586 * CALCULATE INTERACTIONS *
1587 **************************/
1589 r10
= _mm_mul_pd(rsq10
,rinv10
);
1591 /* Calculate table index by multiplying r with table scale and truncate to integer */
1592 rt
= _mm_mul_pd(r10
,vftabscale
);
1593 vfitab
= _mm_cvttpd_epi32(rt
);
1594 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1595 vfitab
= _mm_slli_epi32(vfitab
,2);
1597 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1598 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1599 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1600 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1601 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1602 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1603 GMX_MM_TRANSPOSE2_PD(G
,H
);
1604 Heps
= _mm_mul_pd(vfeps
,H
);
1605 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1606 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1607 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1611 /* Calculate temporary vectorial force */
1612 tx
= _mm_mul_pd(fscal
,dx10
);
1613 ty
= _mm_mul_pd(fscal
,dy10
);
1614 tz
= _mm_mul_pd(fscal
,dz10
);
1616 /* Update vectorial force */
1617 fix1
= _mm_add_pd(fix1
,tx
);
1618 fiy1
= _mm_add_pd(fiy1
,ty
);
1619 fiz1
= _mm_add_pd(fiz1
,tz
);
1621 fjx0
= _mm_add_pd(fjx0
,tx
);
1622 fjy0
= _mm_add_pd(fjy0
,ty
);
1623 fjz0
= _mm_add_pd(fjz0
,tz
);
1625 /**************************
1626 * CALCULATE INTERACTIONS *
1627 **************************/
1629 r11
= _mm_mul_pd(rsq11
,rinv11
);
1631 /* Calculate table index by multiplying r with table scale and truncate to integer */
1632 rt
= _mm_mul_pd(r11
,vftabscale
);
1633 vfitab
= _mm_cvttpd_epi32(rt
);
1634 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1635 vfitab
= _mm_slli_epi32(vfitab
,2);
1637 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1638 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1639 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1640 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1641 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1642 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1643 GMX_MM_TRANSPOSE2_PD(G
,H
);
1644 Heps
= _mm_mul_pd(vfeps
,H
);
1645 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1646 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1647 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1651 /* Calculate temporary vectorial force */
1652 tx
= _mm_mul_pd(fscal
,dx11
);
1653 ty
= _mm_mul_pd(fscal
,dy11
);
1654 tz
= _mm_mul_pd(fscal
,dz11
);
1656 /* Update vectorial force */
1657 fix1
= _mm_add_pd(fix1
,tx
);
1658 fiy1
= _mm_add_pd(fiy1
,ty
);
1659 fiz1
= _mm_add_pd(fiz1
,tz
);
1661 fjx1
= _mm_add_pd(fjx1
,tx
);
1662 fjy1
= _mm_add_pd(fjy1
,ty
);
1663 fjz1
= _mm_add_pd(fjz1
,tz
);
1665 /**************************
1666 * CALCULATE INTERACTIONS *
1667 **************************/
1669 r12
= _mm_mul_pd(rsq12
,rinv12
);
1671 /* Calculate table index by multiplying r with table scale and truncate to integer */
1672 rt
= _mm_mul_pd(r12
,vftabscale
);
1673 vfitab
= _mm_cvttpd_epi32(rt
);
1674 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1675 vfitab
= _mm_slli_epi32(vfitab
,2);
1677 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1678 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1679 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1680 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1681 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1682 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1683 GMX_MM_TRANSPOSE2_PD(G
,H
);
1684 Heps
= _mm_mul_pd(vfeps
,H
);
1685 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1686 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1687 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1691 /* Calculate temporary vectorial force */
1692 tx
= _mm_mul_pd(fscal
,dx12
);
1693 ty
= _mm_mul_pd(fscal
,dy12
);
1694 tz
= _mm_mul_pd(fscal
,dz12
);
1696 /* Update vectorial force */
1697 fix1
= _mm_add_pd(fix1
,tx
);
1698 fiy1
= _mm_add_pd(fiy1
,ty
);
1699 fiz1
= _mm_add_pd(fiz1
,tz
);
1701 fjx2
= _mm_add_pd(fjx2
,tx
);
1702 fjy2
= _mm_add_pd(fjy2
,ty
);
1703 fjz2
= _mm_add_pd(fjz2
,tz
);
1705 /**************************
1706 * CALCULATE INTERACTIONS *
1707 **************************/
1709 r20
= _mm_mul_pd(rsq20
,rinv20
);
1711 /* Calculate table index by multiplying r with table scale and truncate to integer */
1712 rt
= _mm_mul_pd(r20
,vftabscale
);
1713 vfitab
= _mm_cvttpd_epi32(rt
);
1714 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1715 vfitab
= _mm_slli_epi32(vfitab
,2);
1717 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1718 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1719 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1720 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1721 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1722 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1723 GMX_MM_TRANSPOSE2_PD(G
,H
);
1724 Heps
= _mm_mul_pd(vfeps
,H
);
1725 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1726 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1727 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1731 /* Calculate temporary vectorial force */
1732 tx
= _mm_mul_pd(fscal
,dx20
);
1733 ty
= _mm_mul_pd(fscal
,dy20
);
1734 tz
= _mm_mul_pd(fscal
,dz20
);
1736 /* Update vectorial force */
1737 fix2
= _mm_add_pd(fix2
,tx
);
1738 fiy2
= _mm_add_pd(fiy2
,ty
);
1739 fiz2
= _mm_add_pd(fiz2
,tz
);
1741 fjx0
= _mm_add_pd(fjx0
,tx
);
1742 fjy0
= _mm_add_pd(fjy0
,ty
);
1743 fjz0
= _mm_add_pd(fjz0
,tz
);
1745 /**************************
1746 * CALCULATE INTERACTIONS *
1747 **************************/
1749 r21
= _mm_mul_pd(rsq21
,rinv21
);
1751 /* Calculate table index by multiplying r with table scale and truncate to integer */
1752 rt
= _mm_mul_pd(r21
,vftabscale
);
1753 vfitab
= _mm_cvttpd_epi32(rt
);
1754 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1755 vfitab
= _mm_slli_epi32(vfitab
,2);
1757 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1758 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1759 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1760 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1761 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1762 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1763 GMX_MM_TRANSPOSE2_PD(G
,H
);
1764 Heps
= _mm_mul_pd(vfeps
,H
);
1765 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1766 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1767 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1771 /* Calculate temporary vectorial force */
1772 tx
= _mm_mul_pd(fscal
,dx21
);
1773 ty
= _mm_mul_pd(fscal
,dy21
);
1774 tz
= _mm_mul_pd(fscal
,dz21
);
1776 /* Update vectorial force */
1777 fix2
= _mm_add_pd(fix2
,tx
);
1778 fiy2
= _mm_add_pd(fiy2
,ty
);
1779 fiz2
= _mm_add_pd(fiz2
,tz
);
1781 fjx1
= _mm_add_pd(fjx1
,tx
);
1782 fjy1
= _mm_add_pd(fjy1
,ty
);
1783 fjz1
= _mm_add_pd(fjz1
,tz
);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 r22
= _mm_mul_pd(rsq22
,rinv22
);
1791 /* Calculate table index by multiplying r with table scale and truncate to integer */
1792 rt
= _mm_mul_pd(r22
,vftabscale
);
1793 vfitab
= _mm_cvttpd_epi32(rt
);
1794 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1795 vfitab
= _mm_slli_epi32(vfitab
,2);
1797 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1798 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1799 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1800 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1801 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1802 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1803 GMX_MM_TRANSPOSE2_PD(G
,H
);
1804 Heps
= _mm_mul_pd(vfeps
,H
);
1805 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1806 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1807 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1811 /* Calculate temporary vectorial force */
1812 tx
= _mm_mul_pd(fscal
,dx22
);
1813 ty
= _mm_mul_pd(fscal
,dy22
);
1814 tz
= _mm_mul_pd(fscal
,dz22
);
1816 /* Update vectorial force */
1817 fix2
= _mm_add_pd(fix2
,tx
);
1818 fiy2
= _mm_add_pd(fiy2
,ty
);
1819 fiz2
= _mm_add_pd(fiz2
,tz
);
1821 fjx2
= _mm_add_pd(fjx2
,tx
);
1822 fjy2
= _mm_add_pd(fjy2
,ty
);
1823 fjz2
= _mm_add_pd(fjz2
,tz
);
1825 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1827 /* Inner loop uses 359 flops */
1830 if(jidx
<j_index_end
)
1834 j_coord_offsetA
= DIM
*jnrA
;
1836 /* load j atom coordinates */
1837 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1838 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1840 /* Calculate displacement vector */
1841 dx00
= _mm_sub_pd(ix0
,jx0
);
1842 dy00
= _mm_sub_pd(iy0
,jy0
);
1843 dz00
= _mm_sub_pd(iz0
,jz0
);
1844 dx01
= _mm_sub_pd(ix0
,jx1
);
1845 dy01
= _mm_sub_pd(iy0
,jy1
);
1846 dz01
= _mm_sub_pd(iz0
,jz1
);
1847 dx02
= _mm_sub_pd(ix0
,jx2
);
1848 dy02
= _mm_sub_pd(iy0
,jy2
);
1849 dz02
= _mm_sub_pd(iz0
,jz2
);
1850 dx10
= _mm_sub_pd(ix1
,jx0
);
1851 dy10
= _mm_sub_pd(iy1
,jy0
);
1852 dz10
= _mm_sub_pd(iz1
,jz0
);
1853 dx11
= _mm_sub_pd(ix1
,jx1
);
1854 dy11
= _mm_sub_pd(iy1
,jy1
);
1855 dz11
= _mm_sub_pd(iz1
,jz1
);
1856 dx12
= _mm_sub_pd(ix1
,jx2
);
1857 dy12
= _mm_sub_pd(iy1
,jy2
);
1858 dz12
= _mm_sub_pd(iz1
,jz2
);
1859 dx20
= _mm_sub_pd(ix2
,jx0
);
1860 dy20
= _mm_sub_pd(iy2
,jy0
);
1861 dz20
= _mm_sub_pd(iz2
,jz0
);
1862 dx21
= _mm_sub_pd(ix2
,jx1
);
1863 dy21
= _mm_sub_pd(iy2
,jy1
);
1864 dz21
= _mm_sub_pd(iz2
,jz1
);
1865 dx22
= _mm_sub_pd(ix2
,jx2
);
1866 dy22
= _mm_sub_pd(iy2
,jy2
);
1867 dz22
= _mm_sub_pd(iz2
,jz2
);
1869 /* Calculate squared distance and things based on it */
1870 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1871 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1872 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1873 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1874 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1875 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1876 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1877 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1878 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1880 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1881 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1882 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1883 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1884 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1885 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1886 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1887 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1888 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1890 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1892 fjx0
= _mm_setzero_pd();
1893 fjy0
= _mm_setzero_pd();
1894 fjz0
= _mm_setzero_pd();
1895 fjx1
= _mm_setzero_pd();
1896 fjy1
= _mm_setzero_pd();
1897 fjz1
= _mm_setzero_pd();
1898 fjx2
= _mm_setzero_pd();
1899 fjy2
= _mm_setzero_pd();
1900 fjz2
= _mm_setzero_pd();
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 r00
= _mm_mul_pd(rsq00
,rinv00
);
1908 /* Calculate table index by multiplying r with table scale and truncate to integer */
1909 rt
= _mm_mul_pd(r00
,vftabscale
);
1910 vfitab
= _mm_cvttpd_epi32(rt
);
1911 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1912 vfitab
= _mm_slli_epi32(vfitab
,2);
1914 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1915 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1916 F
= _mm_setzero_pd();
1917 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1918 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1919 H
= _mm_setzero_pd();
1920 GMX_MM_TRANSPOSE2_PD(G
,H
);
1921 Heps
= _mm_mul_pd(vfeps
,H
);
1922 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1923 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1924 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1926 /* LENNARD-JONES DISPERSION/REPULSION */
1928 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1929 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1931 fscal
= _mm_add_pd(felec
,fvdw
);
1933 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1935 /* Calculate temporary vectorial force */
1936 tx
= _mm_mul_pd(fscal
,dx00
);
1937 ty
= _mm_mul_pd(fscal
,dy00
);
1938 tz
= _mm_mul_pd(fscal
,dz00
);
1940 /* Update vectorial force */
1941 fix0
= _mm_add_pd(fix0
,tx
);
1942 fiy0
= _mm_add_pd(fiy0
,ty
);
1943 fiz0
= _mm_add_pd(fiz0
,tz
);
1945 fjx0
= _mm_add_pd(fjx0
,tx
);
1946 fjy0
= _mm_add_pd(fjy0
,ty
);
1947 fjz0
= _mm_add_pd(fjz0
,tz
);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r01
= _mm_mul_pd(rsq01
,rinv01
);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt
= _mm_mul_pd(r01
,vftabscale
);
1957 vfitab
= _mm_cvttpd_epi32(rt
);
1958 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
1959 vfitab
= _mm_slli_epi32(vfitab
,2);
1961 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1962 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1963 F
= _mm_setzero_pd();
1964 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1965 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1966 H
= _mm_setzero_pd();
1967 GMX_MM_TRANSPOSE2_PD(G
,H
);
1968 Heps
= _mm_mul_pd(vfeps
,H
);
1969 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1970 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1971 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
1975 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1977 /* Calculate temporary vectorial force */
1978 tx
= _mm_mul_pd(fscal
,dx01
);
1979 ty
= _mm_mul_pd(fscal
,dy01
);
1980 tz
= _mm_mul_pd(fscal
,dz01
);
1982 /* Update vectorial force */
1983 fix0
= _mm_add_pd(fix0
,tx
);
1984 fiy0
= _mm_add_pd(fiy0
,ty
);
1985 fiz0
= _mm_add_pd(fiz0
,tz
);
1987 fjx1
= _mm_add_pd(fjx1
,tx
);
1988 fjy1
= _mm_add_pd(fjy1
,ty
);
1989 fjz1
= _mm_add_pd(fjz1
,tz
);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 r02
= _mm_mul_pd(rsq02
,rinv02
);
1997 /* Calculate table index by multiplying r with table scale and truncate to integer */
1998 rt
= _mm_mul_pd(r02
,vftabscale
);
1999 vfitab
= _mm_cvttpd_epi32(rt
);
2000 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2001 vfitab
= _mm_slli_epi32(vfitab
,2);
2003 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2004 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2005 F
= _mm_setzero_pd();
2006 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2007 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2008 H
= _mm_setzero_pd();
2009 GMX_MM_TRANSPOSE2_PD(G
,H
);
2010 Heps
= _mm_mul_pd(vfeps
,H
);
2011 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2012 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2013 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
2017 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2019 /* Calculate temporary vectorial force */
2020 tx
= _mm_mul_pd(fscal
,dx02
);
2021 ty
= _mm_mul_pd(fscal
,dy02
);
2022 tz
= _mm_mul_pd(fscal
,dz02
);
2024 /* Update vectorial force */
2025 fix0
= _mm_add_pd(fix0
,tx
);
2026 fiy0
= _mm_add_pd(fiy0
,ty
);
2027 fiz0
= _mm_add_pd(fiz0
,tz
);
2029 fjx2
= _mm_add_pd(fjx2
,tx
);
2030 fjy2
= _mm_add_pd(fjy2
,ty
);
2031 fjz2
= _mm_add_pd(fjz2
,tz
);
2033 /**************************
2034 * CALCULATE INTERACTIONS *
2035 **************************/
2037 r10
= _mm_mul_pd(rsq10
,rinv10
);
2039 /* Calculate table index by multiplying r with table scale and truncate to integer */
2040 rt
= _mm_mul_pd(r10
,vftabscale
);
2041 vfitab
= _mm_cvttpd_epi32(rt
);
2042 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2043 vfitab
= _mm_slli_epi32(vfitab
,2);
2045 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2046 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2047 F
= _mm_setzero_pd();
2048 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2049 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2050 H
= _mm_setzero_pd();
2051 GMX_MM_TRANSPOSE2_PD(G
,H
);
2052 Heps
= _mm_mul_pd(vfeps
,H
);
2053 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2054 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2055 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
2059 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2061 /* Calculate temporary vectorial force */
2062 tx
= _mm_mul_pd(fscal
,dx10
);
2063 ty
= _mm_mul_pd(fscal
,dy10
);
2064 tz
= _mm_mul_pd(fscal
,dz10
);
2066 /* Update vectorial force */
2067 fix1
= _mm_add_pd(fix1
,tx
);
2068 fiy1
= _mm_add_pd(fiy1
,ty
);
2069 fiz1
= _mm_add_pd(fiz1
,tz
);
2071 fjx0
= _mm_add_pd(fjx0
,tx
);
2072 fjy0
= _mm_add_pd(fjy0
,ty
);
2073 fjz0
= _mm_add_pd(fjz0
,tz
);
2075 /**************************
2076 * CALCULATE INTERACTIONS *
2077 **************************/
2079 r11
= _mm_mul_pd(rsq11
,rinv11
);
2081 /* Calculate table index by multiplying r with table scale and truncate to integer */
2082 rt
= _mm_mul_pd(r11
,vftabscale
);
2083 vfitab
= _mm_cvttpd_epi32(rt
);
2084 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2085 vfitab
= _mm_slli_epi32(vfitab
,2);
2087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2089 F
= _mm_setzero_pd();
2090 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2091 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2092 H
= _mm_setzero_pd();
2093 GMX_MM_TRANSPOSE2_PD(G
,H
);
2094 Heps
= _mm_mul_pd(vfeps
,H
);
2095 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2096 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2097 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
2101 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2103 /* Calculate temporary vectorial force */
2104 tx
= _mm_mul_pd(fscal
,dx11
);
2105 ty
= _mm_mul_pd(fscal
,dy11
);
2106 tz
= _mm_mul_pd(fscal
,dz11
);
2108 /* Update vectorial force */
2109 fix1
= _mm_add_pd(fix1
,tx
);
2110 fiy1
= _mm_add_pd(fiy1
,ty
);
2111 fiz1
= _mm_add_pd(fiz1
,tz
);
2113 fjx1
= _mm_add_pd(fjx1
,tx
);
2114 fjy1
= _mm_add_pd(fjy1
,ty
);
2115 fjz1
= _mm_add_pd(fjz1
,tz
);
2117 /**************************
2118 * CALCULATE INTERACTIONS *
2119 **************************/
2121 r12
= _mm_mul_pd(rsq12
,rinv12
);
2123 /* Calculate table index by multiplying r with table scale and truncate to integer */
2124 rt
= _mm_mul_pd(r12
,vftabscale
);
2125 vfitab
= _mm_cvttpd_epi32(rt
);
2126 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2127 vfitab
= _mm_slli_epi32(vfitab
,2);
2129 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2130 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2131 F
= _mm_setzero_pd();
2132 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2133 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2134 H
= _mm_setzero_pd();
2135 GMX_MM_TRANSPOSE2_PD(G
,H
);
2136 Heps
= _mm_mul_pd(vfeps
,H
);
2137 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2138 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2139 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
2143 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2145 /* Calculate temporary vectorial force */
2146 tx
= _mm_mul_pd(fscal
,dx12
);
2147 ty
= _mm_mul_pd(fscal
,dy12
);
2148 tz
= _mm_mul_pd(fscal
,dz12
);
2150 /* Update vectorial force */
2151 fix1
= _mm_add_pd(fix1
,tx
);
2152 fiy1
= _mm_add_pd(fiy1
,ty
);
2153 fiz1
= _mm_add_pd(fiz1
,tz
);
2155 fjx2
= _mm_add_pd(fjx2
,tx
);
2156 fjy2
= _mm_add_pd(fjy2
,ty
);
2157 fjz2
= _mm_add_pd(fjz2
,tz
);
2159 /**************************
2160 * CALCULATE INTERACTIONS *
2161 **************************/
2163 r20
= _mm_mul_pd(rsq20
,rinv20
);
2165 /* Calculate table index by multiplying r with table scale and truncate to integer */
2166 rt
= _mm_mul_pd(r20
,vftabscale
);
2167 vfitab
= _mm_cvttpd_epi32(rt
);
2168 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2169 vfitab
= _mm_slli_epi32(vfitab
,2);
2171 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2172 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2173 F
= _mm_setzero_pd();
2174 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2175 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2176 H
= _mm_setzero_pd();
2177 GMX_MM_TRANSPOSE2_PD(G
,H
);
2178 Heps
= _mm_mul_pd(vfeps
,H
);
2179 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2180 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2181 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
2185 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2187 /* Calculate temporary vectorial force */
2188 tx
= _mm_mul_pd(fscal
,dx20
);
2189 ty
= _mm_mul_pd(fscal
,dy20
);
2190 tz
= _mm_mul_pd(fscal
,dz20
);
2192 /* Update vectorial force */
2193 fix2
= _mm_add_pd(fix2
,tx
);
2194 fiy2
= _mm_add_pd(fiy2
,ty
);
2195 fiz2
= _mm_add_pd(fiz2
,tz
);
2197 fjx0
= _mm_add_pd(fjx0
,tx
);
2198 fjy0
= _mm_add_pd(fjy0
,ty
);
2199 fjz0
= _mm_add_pd(fjz0
,tz
);
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2205 r21
= _mm_mul_pd(rsq21
,rinv21
);
2207 /* Calculate table index by multiplying r with table scale and truncate to integer */
2208 rt
= _mm_mul_pd(r21
,vftabscale
);
2209 vfitab
= _mm_cvttpd_epi32(rt
);
2210 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2211 vfitab
= _mm_slli_epi32(vfitab
,2);
2213 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2214 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2215 F
= _mm_setzero_pd();
2216 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2217 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2218 H
= _mm_setzero_pd();
2219 GMX_MM_TRANSPOSE2_PD(G
,H
);
2220 Heps
= _mm_mul_pd(vfeps
,H
);
2221 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2222 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2223 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
2227 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2229 /* Calculate temporary vectorial force */
2230 tx
= _mm_mul_pd(fscal
,dx21
);
2231 ty
= _mm_mul_pd(fscal
,dy21
);
2232 tz
= _mm_mul_pd(fscal
,dz21
);
2234 /* Update vectorial force */
2235 fix2
= _mm_add_pd(fix2
,tx
);
2236 fiy2
= _mm_add_pd(fiy2
,ty
);
2237 fiz2
= _mm_add_pd(fiz2
,tz
);
2239 fjx1
= _mm_add_pd(fjx1
,tx
);
2240 fjy1
= _mm_add_pd(fjy1
,ty
);
2241 fjz1
= _mm_add_pd(fjz1
,tz
);
2243 /**************************
2244 * CALCULATE INTERACTIONS *
2245 **************************/
2247 r22
= _mm_mul_pd(rsq22
,rinv22
);
2249 /* Calculate table index by multiplying r with table scale and truncate to integer */
2250 rt
= _mm_mul_pd(r22
,vftabscale
);
2251 vfitab
= _mm_cvttpd_epi32(rt
);
2252 vfeps
= _mm_sub_pd(rt
,_mm_cvtepi32_pd(vfitab
));
2253 vfitab
= _mm_slli_epi32(vfitab
,2);
2255 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2256 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2257 F
= _mm_setzero_pd();
2258 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2259 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2260 H
= _mm_setzero_pd();
2261 GMX_MM_TRANSPOSE2_PD(G
,H
);
2262 Heps
= _mm_mul_pd(vfeps
,H
);
2263 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2264 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2265 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
2269 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2271 /* Calculate temporary vectorial force */
2272 tx
= _mm_mul_pd(fscal
,dx22
);
2273 ty
= _mm_mul_pd(fscal
,dy22
);
2274 tz
= _mm_mul_pd(fscal
,dz22
);
2276 /* Update vectorial force */
2277 fix2
= _mm_add_pd(fix2
,tx
);
2278 fiy2
= _mm_add_pd(fiy2
,ty
);
2279 fiz2
= _mm_add_pd(fiz2
,tz
);
2281 fjx2
= _mm_add_pd(fjx2
,tx
);
2282 fjy2
= _mm_add_pd(fjy2
,ty
);
2283 fjz2
= _mm_add_pd(fjz2
,tz
);
2285 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2287 /* Inner loop uses 359 flops */
2290 /* End of innermost loop */
2292 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2293 f
+i_coord_offset
,fshift
+i_shift_offset
);
2295 /* Increment number of inner iterations */
2296 inneriter
+= j_index_end
- j_index_start
;
2298 /* Outer loop uses 18 flops */
2301 /* Increment number of outer iterations */
2304 /* Update outer/inner flops */
2306 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*359);