2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse4_1_double.h"
48 #include "kernelutil_x86_sse4_1_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_double
59 (t_nblist
* gmx_restrict nlist
,
60 rvec
* gmx_restrict xx
,
61 rvec
* gmx_restrict ff
,
62 t_forcerec
* gmx_restrict fr
,
63 t_mdatoms
* gmx_restrict mdatoms
,
64 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
65 t_nrnb
* gmx_restrict nrnb
)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70 * jnr indices corresponding to data put in the two positions in the SIMD register.
72 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
73 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
75 int j_coord_offsetA
,j_coord_offsetB
;
76 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
78 real
*shiftvec
,*fshift
,*x
,*f
;
79 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
81 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
83 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
85 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
86 int vdwjidx0A
,vdwjidx0B
;
87 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
88 int vdwjidx1A
,vdwjidx1B
;
89 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
90 int vdwjidx2A
,vdwjidx2B
;
91 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
92 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
93 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
94 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
95 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
96 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
97 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
98 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
99 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
100 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
101 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
104 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
107 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
108 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
110 __m128i ifour
= _mm_set1_epi32(4);
111 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
113 __m128d dummy_mask
,cutoff_mask
;
114 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
115 __m128d one
= _mm_set1_pd(1.0);
116 __m128d two
= _mm_set1_pd(2.0);
122 jindex
= nlist
->jindex
;
124 shiftidx
= nlist
->shift
;
126 shiftvec
= fr
->shift_vec
[0];
127 fshift
= fr
->fshift
[0];
128 facel
= _mm_set1_pd(fr
->epsfac
);
129 charge
= mdatoms
->chargeA
;
130 nvdwtype
= fr
->ntype
;
132 vdwtype
= mdatoms
->typeA
;
134 vftab
= kernel_data
->table_elec
->data
;
135 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
137 /* Setup water-specific parameters */
138 inr
= nlist
->iinr
[0];
139 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
140 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
141 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
142 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
144 jq0
= _mm_set1_pd(charge
[inr
+0]);
145 jq1
= _mm_set1_pd(charge
[inr
+1]);
146 jq2
= _mm_set1_pd(charge
[inr
+2]);
147 vdwjidx0A
= 2*vdwtype
[inr
+0];
148 qq00
= _mm_mul_pd(iq0
,jq0
);
149 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
150 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
151 qq01
= _mm_mul_pd(iq0
,jq1
);
152 qq02
= _mm_mul_pd(iq0
,jq2
);
153 qq10
= _mm_mul_pd(iq1
,jq0
);
154 qq11
= _mm_mul_pd(iq1
,jq1
);
155 qq12
= _mm_mul_pd(iq1
,jq2
);
156 qq20
= _mm_mul_pd(iq2
,jq0
);
157 qq21
= _mm_mul_pd(iq2
,jq1
);
158 qq22
= _mm_mul_pd(iq2
,jq2
);
160 /* Avoid stupid compiler warnings */
168 /* Start outer loop over neighborlists */
169 for(iidx
=0; iidx
<nri
; iidx
++)
171 /* Load shift vector for this list */
172 i_shift_offset
= DIM
*shiftidx
[iidx
];
174 /* Load limits for loop over neighbors */
175 j_index_start
= jindex
[iidx
];
176 j_index_end
= jindex
[iidx
+1];
178 /* Get outer coordinate index */
180 i_coord_offset
= DIM
*inr
;
182 /* Load i particle coords and add shift vector */
183 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
184 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
186 fix0
= _mm_setzero_pd();
187 fiy0
= _mm_setzero_pd();
188 fiz0
= _mm_setzero_pd();
189 fix1
= _mm_setzero_pd();
190 fiy1
= _mm_setzero_pd();
191 fiz1
= _mm_setzero_pd();
192 fix2
= _mm_setzero_pd();
193 fiy2
= _mm_setzero_pd();
194 fiz2
= _mm_setzero_pd();
196 /* Reset potential sums */
197 velecsum
= _mm_setzero_pd();
198 vvdwsum
= _mm_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
204 /* Get j neighbor index, and coordinate index */
207 j_coord_offsetA
= DIM
*jnrA
;
208 j_coord_offsetB
= DIM
*jnrB
;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
212 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
214 /* Calculate displacement vector */
215 dx00
= _mm_sub_pd(ix0
,jx0
);
216 dy00
= _mm_sub_pd(iy0
,jy0
);
217 dz00
= _mm_sub_pd(iz0
,jz0
);
218 dx01
= _mm_sub_pd(ix0
,jx1
);
219 dy01
= _mm_sub_pd(iy0
,jy1
);
220 dz01
= _mm_sub_pd(iz0
,jz1
);
221 dx02
= _mm_sub_pd(ix0
,jx2
);
222 dy02
= _mm_sub_pd(iy0
,jy2
);
223 dz02
= _mm_sub_pd(iz0
,jz2
);
224 dx10
= _mm_sub_pd(ix1
,jx0
);
225 dy10
= _mm_sub_pd(iy1
,jy0
);
226 dz10
= _mm_sub_pd(iz1
,jz0
);
227 dx11
= _mm_sub_pd(ix1
,jx1
);
228 dy11
= _mm_sub_pd(iy1
,jy1
);
229 dz11
= _mm_sub_pd(iz1
,jz1
);
230 dx12
= _mm_sub_pd(ix1
,jx2
);
231 dy12
= _mm_sub_pd(iy1
,jy2
);
232 dz12
= _mm_sub_pd(iz1
,jz2
);
233 dx20
= _mm_sub_pd(ix2
,jx0
);
234 dy20
= _mm_sub_pd(iy2
,jy0
);
235 dz20
= _mm_sub_pd(iz2
,jz0
);
236 dx21
= _mm_sub_pd(ix2
,jx1
);
237 dy21
= _mm_sub_pd(iy2
,jy1
);
238 dz21
= _mm_sub_pd(iz2
,jz1
);
239 dx22
= _mm_sub_pd(ix2
,jx2
);
240 dy22
= _mm_sub_pd(iy2
,jy2
);
241 dz22
= _mm_sub_pd(iz2
,jz2
);
243 /* Calculate squared distance and things based on it */
244 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
245 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
246 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
247 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
248 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
249 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
250 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
251 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
252 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
254 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
255 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
256 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
257 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
258 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
259 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
260 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
261 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
262 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
264 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
266 fjx0
= _mm_setzero_pd();
267 fjy0
= _mm_setzero_pd();
268 fjz0
= _mm_setzero_pd();
269 fjx1
= _mm_setzero_pd();
270 fjy1
= _mm_setzero_pd();
271 fjz1
= _mm_setzero_pd();
272 fjx2
= _mm_setzero_pd();
273 fjy2
= _mm_setzero_pd();
274 fjz2
= _mm_setzero_pd();
276 /**************************
277 * CALCULATE INTERACTIONS *
278 **************************/
280 r00
= _mm_mul_pd(rsq00
,rinv00
);
282 /* Calculate table index by multiplying r with table scale and truncate to integer */
283 rt
= _mm_mul_pd(r00
,vftabscale
);
284 vfitab
= _mm_cvttpd_epi32(rt
);
285 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
286 vfitab
= _mm_slli_epi32(vfitab
,2);
288 /* CUBIC SPLINE TABLE ELECTROSTATICS */
289 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
290 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
291 GMX_MM_TRANSPOSE2_PD(Y
,F
);
292 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
293 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
294 GMX_MM_TRANSPOSE2_PD(G
,H
);
295 Heps
= _mm_mul_pd(vfeps
,H
);
296 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
297 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
298 velec
= _mm_mul_pd(qq00
,VV
);
299 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
300 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
302 /* LENNARD-JONES DISPERSION/REPULSION */
304 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
305 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
306 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
307 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
308 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
310 /* Update potential sum for this i atom from the interaction with this j atom. */
311 velecsum
= _mm_add_pd(velecsum
,velec
);
312 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
314 fscal
= _mm_add_pd(felec
,fvdw
);
316 /* Calculate temporary vectorial force */
317 tx
= _mm_mul_pd(fscal
,dx00
);
318 ty
= _mm_mul_pd(fscal
,dy00
);
319 tz
= _mm_mul_pd(fscal
,dz00
);
321 /* Update vectorial force */
322 fix0
= _mm_add_pd(fix0
,tx
);
323 fiy0
= _mm_add_pd(fiy0
,ty
);
324 fiz0
= _mm_add_pd(fiz0
,tz
);
326 fjx0
= _mm_add_pd(fjx0
,tx
);
327 fjy0
= _mm_add_pd(fjy0
,ty
);
328 fjz0
= _mm_add_pd(fjz0
,tz
);
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 r01
= _mm_mul_pd(rsq01
,rinv01
);
336 /* Calculate table index by multiplying r with table scale and truncate to integer */
337 rt
= _mm_mul_pd(r01
,vftabscale
);
338 vfitab
= _mm_cvttpd_epi32(rt
);
339 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
340 vfitab
= _mm_slli_epi32(vfitab
,2);
342 /* CUBIC SPLINE TABLE ELECTROSTATICS */
343 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
344 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
345 GMX_MM_TRANSPOSE2_PD(Y
,F
);
346 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
347 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
348 GMX_MM_TRANSPOSE2_PD(G
,H
);
349 Heps
= _mm_mul_pd(vfeps
,H
);
350 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
351 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
352 velec
= _mm_mul_pd(qq01
,VV
);
353 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
354 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
356 /* Update potential sum for this i atom from the interaction with this j atom. */
357 velecsum
= _mm_add_pd(velecsum
,velec
);
361 /* Calculate temporary vectorial force */
362 tx
= _mm_mul_pd(fscal
,dx01
);
363 ty
= _mm_mul_pd(fscal
,dy01
);
364 tz
= _mm_mul_pd(fscal
,dz01
);
366 /* Update vectorial force */
367 fix0
= _mm_add_pd(fix0
,tx
);
368 fiy0
= _mm_add_pd(fiy0
,ty
);
369 fiz0
= _mm_add_pd(fiz0
,tz
);
371 fjx1
= _mm_add_pd(fjx1
,tx
);
372 fjy1
= _mm_add_pd(fjy1
,ty
);
373 fjz1
= _mm_add_pd(fjz1
,tz
);
375 /**************************
376 * CALCULATE INTERACTIONS *
377 **************************/
379 r02
= _mm_mul_pd(rsq02
,rinv02
);
381 /* Calculate table index by multiplying r with table scale and truncate to integer */
382 rt
= _mm_mul_pd(r02
,vftabscale
);
383 vfitab
= _mm_cvttpd_epi32(rt
);
384 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
385 vfitab
= _mm_slli_epi32(vfitab
,2);
387 /* CUBIC SPLINE TABLE ELECTROSTATICS */
388 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
389 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
390 GMX_MM_TRANSPOSE2_PD(Y
,F
);
391 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
392 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
393 GMX_MM_TRANSPOSE2_PD(G
,H
);
394 Heps
= _mm_mul_pd(vfeps
,H
);
395 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
396 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
397 velec
= _mm_mul_pd(qq02
,VV
);
398 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
399 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum
= _mm_add_pd(velecsum
,velec
);
406 /* Calculate temporary vectorial force */
407 tx
= _mm_mul_pd(fscal
,dx02
);
408 ty
= _mm_mul_pd(fscal
,dy02
);
409 tz
= _mm_mul_pd(fscal
,dz02
);
411 /* Update vectorial force */
412 fix0
= _mm_add_pd(fix0
,tx
);
413 fiy0
= _mm_add_pd(fiy0
,ty
);
414 fiz0
= _mm_add_pd(fiz0
,tz
);
416 fjx2
= _mm_add_pd(fjx2
,tx
);
417 fjy2
= _mm_add_pd(fjy2
,ty
);
418 fjz2
= _mm_add_pd(fjz2
,tz
);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 r10
= _mm_mul_pd(rsq10
,rinv10
);
426 /* Calculate table index by multiplying r with table scale and truncate to integer */
427 rt
= _mm_mul_pd(r10
,vftabscale
);
428 vfitab
= _mm_cvttpd_epi32(rt
);
429 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
430 vfitab
= _mm_slli_epi32(vfitab
,2);
432 /* CUBIC SPLINE TABLE ELECTROSTATICS */
433 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
434 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
435 GMX_MM_TRANSPOSE2_PD(Y
,F
);
436 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
437 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
438 GMX_MM_TRANSPOSE2_PD(G
,H
);
439 Heps
= _mm_mul_pd(vfeps
,H
);
440 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
441 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
442 velec
= _mm_mul_pd(qq10
,VV
);
443 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
444 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
446 /* Update potential sum for this i atom from the interaction with this j atom. */
447 velecsum
= _mm_add_pd(velecsum
,velec
);
451 /* Calculate temporary vectorial force */
452 tx
= _mm_mul_pd(fscal
,dx10
);
453 ty
= _mm_mul_pd(fscal
,dy10
);
454 tz
= _mm_mul_pd(fscal
,dz10
);
456 /* Update vectorial force */
457 fix1
= _mm_add_pd(fix1
,tx
);
458 fiy1
= _mm_add_pd(fiy1
,ty
);
459 fiz1
= _mm_add_pd(fiz1
,tz
);
461 fjx0
= _mm_add_pd(fjx0
,tx
);
462 fjy0
= _mm_add_pd(fjy0
,ty
);
463 fjz0
= _mm_add_pd(fjz0
,tz
);
465 /**************************
466 * CALCULATE INTERACTIONS *
467 **************************/
469 r11
= _mm_mul_pd(rsq11
,rinv11
);
471 /* Calculate table index by multiplying r with table scale and truncate to integer */
472 rt
= _mm_mul_pd(r11
,vftabscale
);
473 vfitab
= _mm_cvttpd_epi32(rt
);
474 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
475 vfitab
= _mm_slli_epi32(vfitab
,2);
477 /* CUBIC SPLINE TABLE ELECTROSTATICS */
478 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
479 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
480 GMX_MM_TRANSPOSE2_PD(Y
,F
);
481 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
482 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
483 GMX_MM_TRANSPOSE2_PD(G
,H
);
484 Heps
= _mm_mul_pd(vfeps
,H
);
485 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
486 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
487 velec
= _mm_mul_pd(qq11
,VV
);
488 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
489 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
491 /* Update potential sum for this i atom from the interaction with this j atom. */
492 velecsum
= _mm_add_pd(velecsum
,velec
);
496 /* Calculate temporary vectorial force */
497 tx
= _mm_mul_pd(fscal
,dx11
);
498 ty
= _mm_mul_pd(fscal
,dy11
);
499 tz
= _mm_mul_pd(fscal
,dz11
);
501 /* Update vectorial force */
502 fix1
= _mm_add_pd(fix1
,tx
);
503 fiy1
= _mm_add_pd(fiy1
,ty
);
504 fiz1
= _mm_add_pd(fiz1
,tz
);
506 fjx1
= _mm_add_pd(fjx1
,tx
);
507 fjy1
= _mm_add_pd(fjy1
,ty
);
508 fjz1
= _mm_add_pd(fjz1
,tz
);
510 /**************************
511 * CALCULATE INTERACTIONS *
512 **************************/
514 r12
= _mm_mul_pd(rsq12
,rinv12
);
516 /* Calculate table index by multiplying r with table scale and truncate to integer */
517 rt
= _mm_mul_pd(r12
,vftabscale
);
518 vfitab
= _mm_cvttpd_epi32(rt
);
519 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
520 vfitab
= _mm_slli_epi32(vfitab
,2);
522 /* CUBIC SPLINE TABLE ELECTROSTATICS */
523 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
524 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
525 GMX_MM_TRANSPOSE2_PD(Y
,F
);
526 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
527 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
528 GMX_MM_TRANSPOSE2_PD(G
,H
);
529 Heps
= _mm_mul_pd(vfeps
,H
);
530 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
531 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
532 velec
= _mm_mul_pd(qq12
,VV
);
533 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
534 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum
= _mm_add_pd(velecsum
,velec
);
541 /* Calculate temporary vectorial force */
542 tx
= _mm_mul_pd(fscal
,dx12
);
543 ty
= _mm_mul_pd(fscal
,dy12
);
544 tz
= _mm_mul_pd(fscal
,dz12
);
546 /* Update vectorial force */
547 fix1
= _mm_add_pd(fix1
,tx
);
548 fiy1
= _mm_add_pd(fiy1
,ty
);
549 fiz1
= _mm_add_pd(fiz1
,tz
);
551 fjx2
= _mm_add_pd(fjx2
,tx
);
552 fjy2
= _mm_add_pd(fjy2
,ty
);
553 fjz2
= _mm_add_pd(fjz2
,tz
);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r20
= _mm_mul_pd(rsq20
,rinv20
);
561 /* Calculate table index by multiplying r with table scale and truncate to integer */
562 rt
= _mm_mul_pd(r20
,vftabscale
);
563 vfitab
= _mm_cvttpd_epi32(rt
);
564 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
565 vfitab
= _mm_slli_epi32(vfitab
,2);
567 /* CUBIC SPLINE TABLE ELECTROSTATICS */
568 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
569 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
570 GMX_MM_TRANSPOSE2_PD(Y
,F
);
571 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
572 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
573 GMX_MM_TRANSPOSE2_PD(G
,H
);
574 Heps
= _mm_mul_pd(vfeps
,H
);
575 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
576 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
577 velec
= _mm_mul_pd(qq20
,VV
);
578 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
579 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
581 /* Update potential sum for this i atom from the interaction with this j atom. */
582 velecsum
= _mm_add_pd(velecsum
,velec
);
586 /* Calculate temporary vectorial force */
587 tx
= _mm_mul_pd(fscal
,dx20
);
588 ty
= _mm_mul_pd(fscal
,dy20
);
589 tz
= _mm_mul_pd(fscal
,dz20
);
591 /* Update vectorial force */
592 fix2
= _mm_add_pd(fix2
,tx
);
593 fiy2
= _mm_add_pd(fiy2
,ty
);
594 fiz2
= _mm_add_pd(fiz2
,tz
);
596 fjx0
= _mm_add_pd(fjx0
,tx
);
597 fjy0
= _mm_add_pd(fjy0
,ty
);
598 fjz0
= _mm_add_pd(fjz0
,tz
);
600 /**************************
601 * CALCULATE INTERACTIONS *
602 **************************/
604 r21
= _mm_mul_pd(rsq21
,rinv21
);
606 /* Calculate table index by multiplying r with table scale and truncate to integer */
607 rt
= _mm_mul_pd(r21
,vftabscale
);
608 vfitab
= _mm_cvttpd_epi32(rt
);
609 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
610 vfitab
= _mm_slli_epi32(vfitab
,2);
612 /* CUBIC SPLINE TABLE ELECTROSTATICS */
613 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
614 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
615 GMX_MM_TRANSPOSE2_PD(Y
,F
);
616 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
617 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
618 GMX_MM_TRANSPOSE2_PD(G
,H
);
619 Heps
= _mm_mul_pd(vfeps
,H
);
620 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
621 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
622 velec
= _mm_mul_pd(qq21
,VV
);
623 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
624 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velecsum
= _mm_add_pd(velecsum
,velec
);
631 /* Calculate temporary vectorial force */
632 tx
= _mm_mul_pd(fscal
,dx21
);
633 ty
= _mm_mul_pd(fscal
,dy21
);
634 tz
= _mm_mul_pd(fscal
,dz21
);
636 /* Update vectorial force */
637 fix2
= _mm_add_pd(fix2
,tx
);
638 fiy2
= _mm_add_pd(fiy2
,ty
);
639 fiz2
= _mm_add_pd(fiz2
,tz
);
641 fjx1
= _mm_add_pd(fjx1
,tx
);
642 fjy1
= _mm_add_pd(fjy1
,ty
);
643 fjz1
= _mm_add_pd(fjz1
,tz
);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 r22
= _mm_mul_pd(rsq22
,rinv22
);
651 /* Calculate table index by multiplying r with table scale and truncate to integer */
652 rt
= _mm_mul_pd(r22
,vftabscale
);
653 vfitab
= _mm_cvttpd_epi32(rt
);
654 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
655 vfitab
= _mm_slli_epi32(vfitab
,2);
657 /* CUBIC SPLINE TABLE ELECTROSTATICS */
658 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
659 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
660 GMX_MM_TRANSPOSE2_PD(Y
,F
);
661 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
662 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
663 GMX_MM_TRANSPOSE2_PD(G
,H
);
664 Heps
= _mm_mul_pd(vfeps
,H
);
665 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
666 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
667 velec
= _mm_mul_pd(qq22
,VV
);
668 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
669 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
671 /* Update potential sum for this i atom from the interaction with this j atom. */
672 velecsum
= _mm_add_pd(velecsum
,velec
);
676 /* Calculate temporary vectorial force */
677 tx
= _mm_mul_pd(fscal
,dx22
);
678 ty
= _mm_mul_pd(fscal
,dy22
);
679 tz
= _mm_mul_pd(fscal
,dz22
);
681 /* Update vectorial force */
682 fix2
= _mm_add_pd(fix2
,tx
);
683 fiy2
= _mm_add_pd(fiy2
,ty
);
684 fiz2
= _mm_add_pd(fiz2
,tz
);
686 fjx2
= _mm_add_pd(fjx2
,tx
);
687 fjy2
= _mm_add_pd(fjy2
,ty
);
688 fjz2
= _mm_add_pd(fjz2
,tz
);
690 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
692 /* Inner loop uses 400 flops */
699 j_coord_offsetA
= DIM
*jnrA
;
701 /* load j atom coordinates */
702 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
703 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
705 /* Calculate displacement vector */
706 dx00
= _mm_sub_pd(ix0
,jx0
);
707 dy00
= _mm_sub_pd(iy0
,jy0
);
708 dz00
= _mm_sub_pd(iz0
,jz0
);
709 dx01
= _mm_sub_pd(ix0
,jx1
);
710 dy01
= _mm_sub_pd(iy0
,jy1
);
711 dz01
= _mm_sub_pd(iz0
,jz1
);
712 dx02
= _mm_sub_pd(ix0
,jx2
);
713 dy02
= _mm_sub_pd(iy0
,jy2
);
714 dz02
= _mm_sub_pd(iz0
,jz2
);
715 dx10
= _mm_sub_pd(ix1
,jx0
);
716 dy10
= _mm_sub_pd(iy1
,jy0
);
717 dz10
= _mm_sub_pd(iz1
,jz0
);
718 dx11
= _mm_sub_pd(ix1
,jx1
);
719 dy11
= _mm_sub_pd(iy1
,jy1
);
720 dz11
= _mm_sub_pd(iz1
,jz1
);
721 dx12
= _mm_sub_pd(ix1
,jx2
);
722 dy12
= _mm_sub_pd(iy1
,jy2
);
723 dz12
= _mm_sub_pd(iz1
,jz2
);
724 dx20
= _mm_sub_pd(ix2
,jx0
);
725 dy20
= _mm_sub_pd(iy2
,jy0
);
726 dz20
= _mm_sub_pd(iz2
,jz0
);
727 dx21
= _mm_sub_pd(ix2
,jx1
);
728 dy21
= _mm_sub_pd(iy2
,jy1
);
729 dz21
= _mm_sub_pd(iz2
,jz1
);
730 dx22
= _mm_sub_pd(ix2
,jx2
);
731 dy22
= _mm_sub_pd(iy2
,jy2
);
732 dz22
= _mm_sub_pd(iz2
,jz2
);
734 /* Calculate squared distance and things based on it */
735 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
736 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
737 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
738 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
739 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
740 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
741 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
742 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
743 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
745 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
746 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
747 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
748 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
749 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
750 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
751 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
752 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
753 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
755 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
757 fjx0
= _mm_setzero_pd();
758 fjy0
= _mm_setzero_pd();
759 fjz0
= _mm_setzero_pd();
760 fjx1
= _mm_setzero_pd();
761 fjy1
= _mm_setzero_pd();
762 fjz1
= _mm_setzero_pd();
763 fjx2
= _mm_setzero_pd();
764 fjy2
= _mm_setzero_pd();
765 fjz2
= _mm_setzero_pd();
767 /**************************
768 * CALCULATE INTERACTIONS *
769 **************************/
771 r00
= _mm_mul_pd(rsq00
,rinv00
);
773 /* Calculate table index by multiplying r with table scale and truncate to integer */
774 rt
= _mm_mul_pd(r00
,vftabscale
);
775 vfitab
= _mm_cvttpd_epi32(rt
);
776 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
777 vfitab
= _mm_slli_epi32(vfitab
,2);
779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
780 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
781 F
= _mm_setzero_pd();
782 GMX_MM_TRANSPOSE2_PD(Y
,F
);
783 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
784 H
= _mm_setzero_pd();
785 GMX_MM_TRANSPOSE2_PD(G
,H
);
786 Heps
= _mm_mul_pd(vfeps
,H
);
787 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
788 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
789 velec
= _mm_mul_pd(qq00
,VV
);
790 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
791 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
793 /* LENNARD-JONES DISPERSION/REPULSION */
795 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
796 vvdw6
= _mm_mul_pd(c6_00
,rinvsix
);
797 vvdw12
= _mm_mul_pd(c12_00
,_mm_mul_pd(rinvsix
,rinvsix
));
798 vvdw
= _mm_sub_pd( _mm_mul_pd(vvdw12
,one_twelfth
) , _mm_mul_pd(vvdw6
,one_sixth
) );
799 fvdw
= _mm_mul_pd(_mm_sub_pd(vvdw12
,vvdw6
),rinvsq00
);
801 /* Update potential sum for this i atom from the interaction with this j atom. */
802 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
803 velecsum
= _mm_add_pd(velecsum
,velec
);
804 vvdw
= _mm_unpacklo_pd(vvdw
,_mm_setzero_pd());
805 vvdwsum
= _mm_add_pd(vvdwsum
,vvdw
);
807 fscal
= _mm_add_pd(felec
,fvdw
);
809 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
811 /* Calculate temporary vectorial force */
812 tx
= _mm_mul_pd(fscal
,dx00
);
813 ty
= _mm_mul_pd(fscal
,dy00
);
814 tz
= _mm_mul_pd(fscal
,dz00
);
816 /* Update vectorial force */
817 fix0
= _mm_add_pd(fix0
,tx
);
818 fiy0
= _mm_add_pd(fiy0
,ty
);
819 fiz0
= _mm_add_pd(fiz0
,tz
);
821 fjx0
= _mm_add_pd(fjx0
,tx
);
822 fjy0
= _mm_add_pd(fjy0
,ty
);
823 fjz0
= _mm_add_pd(fjz0
,tz
);
825 /**************************
826 * CALCULATE INTERACTIONS *
827 **************************/
829 r01
= _mm_mul_pd(rsq01
,rinv01
);
831 /* Calculate table index by multiplying r with table scale and truncate to integer */
832 rt
= _mm_mul_pd(r01
,vftabscale
);
833 vfitab
= _mm_cvttpd_epi32(rt
);
834 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
835 vfitab
= _mm_slli_epi32(vfitab
,2);
837 /* CUBIC SPLINE TABLE ELECTROSTATICS */
838 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
839 F
= _mm_setzero_pd();
840 GMX_MM_TRANSPOSE2_PD(Y
,F
);
841 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
842 H
= _mm_setzero_pd();
843 GMX_MM_TRANSPOSE2_PD(G
,H
);
844 Heps
= _mm_mul_pd(vfeps
,H
);
845 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
846 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
847 velec
= _mm_mul_pd(qq01
,VV
);
848 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
849 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
851 /* Update potential sum for this i atom from the interaction with this j atom. */
852 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
853 velecsum
= _mm_add_pd(velecsum
,velec
);
857 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
859 /* Calculate temporary vectorial force */
860 tx
= _mm_mul_pd(fscal
,dx01
);
861 ty
= _mm_mul_pd(fscal
,dy01
);
862 tz
= _mm_mul_pd(fscal
,dz01
);
864 /* Update vectorial force */
865 fix0
= _mm_add_pd(fix0
,tx
);
866 fiy0
= _mm_add_pd(fiy0
,ty
);
867 fiz0
= _mm_add_pd(fiz0
,tz
);
869 fjx1
= _mm_add_pd(fjx1
,tx
);
870 fjy1
= _mm_add_pd(fjy1
,ty
);
871 fjz1
= _mm_add_pd(fjz1
,tz
);
873 /**************************
874 * CALCULATE INTERACTIONS *
875 **************************/
877 r02
= _mm_mul_pd(rsq02
,rinv02
);
879 /* Calculate table index by multiplying r with table scale and truncate to integer */
880 rt
= _mm_mul_pd(r02
,vftabscale
);
881 vfitab
= _mm_cvttpd_epi32(rt
);
882 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
883 vfitab
= _mm_slli_epi32(vfitab
,2);
885 /* CUBIC SPLINE TABLE ELECTROSTATICS */
886 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
887 F
= _mm_setzero_pd();
888 GMX_MM_TRANSPOSE2_PD(Y
,F
);
889 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
890 H
= _mm_setzero_pd();
891 GMX_MM_TRANSPOSE2_PD(G
,H
);
892 Heps
= _mm_mul_pd(vfeps
,H
);
893 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
894 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
895 velec
= _mm_mul_pd(qq02
,VV
);
896 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
897 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
901 velecsum
= _mm_add_pd(velecsum
,velec
);
905 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
907 /* Calculate temporary vectorial force */
908 tx
= _mm_mul_pd(fscal
,dx02
);
909 ty
= _mm_mul_pd(fscal
,dy02
);
910 tz
= _mm_mul_pd(fscal
,dz02
);
912 /* Update vectorial force */
913 fix0
= _mm_add_pd(fix0
,tx
);
914 fiy0
= _mm_add_pd(fiy0
,ty
);
915 fiz0
= _mm_add_pd(fiz0
,tz
);
917 fjx2
= _mm_add_pd(fjx2
,tx
);
918 fjy2
= _mm_add_pd(fjy2
,ty
);
919 fjz2
= _mm_add_pd(fjz2
,tz
);
921 /**************************
922 * CALCULATE INTERACTIONS *
923 **************************/
925 r10
= _mm_mul_pd(rsq10
,rinv10
);
927 /* Calculate table index by multiplying r with table scale and truncate to integer */
928 rt
= _mm_mul_pd(r10
,vftabscale
);
929 vfitab
= _mm_cvttpd_epi32(rt
);
930 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
931 vfitab
= _mm_slli_epi32(vfitab
,2);
933 /* CUBIC SPLINE TABLE ELECTROSTATICS */
934 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
935 F
= _mm_setzero_pd();
936 GMX_MM_TRANSPOSE2_PD(Y
,F
);
937 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
938 H
= _mm_setzero_pd();
939 GMX_MM_TRANSPOSE2_PD(G
,H
);
940 Heps
= _mm_mul_pd(vfeps
,H
);
941 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
942 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
943 velec
= _mm_mul_pd(qq10
,VV
);
944 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
945 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
947 /* Update potential sum for this i atom from the interaction with this j atom. */
948 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
949 velecsum
= _mm_add_pd(velecsum
,velec
);
953 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
955 /* Calculate temporary vectorial force */
956 tx
= _mm_mul_pd(fscal
,dx10
);
957 ty
= _mm_mul_pd(fscal
,dy10
);
958 tz
= _mm_mul_pd(fscal
,dz10
);
960 /* Update vectorial force */
961 fix1
= _mm_add_pd(fix1
,tx
);
962 fiy1
= _mm_add_pd(fiy1
,ty
);
963 fiz1
= _mm_add_pd(fiz1
,tz
);
965 fjx0
= _mm_add_pd(fjx0
,tx
);
966 fjy0
= _mm_add_pd(fjy0
,ty
);
967 fjz0
= _mm_add_pd(fjz0
,tz
);
969 /**************************
970 * CALCULATE INTERACTIONS *
971 **************************/
973 r11
= _mm_mul_pd(rsq11
,rinv11
);
975 /* Calculate table index by multiplying r with table scale and truncate to integer */
976 rt
= _mm_mul_pd(r11
,vftabscale
);
977 vfitab
= _mm_cvttpd_epi32(rt
);
978 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
979 vfitab
= _mm_slli_epi32(vfitab
,2);
981 /* CUBIC SPLINE TABLE ELECTROSTATICS */
982 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
983 F
= _mm_setzero_pd();
984 GMX_MM_TRANSPOSE2_PD(Y
,F
);
985 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
986 H
= _mm_setzero_pd();
987 GMX_MM_TRANSPOSE2_PD(G
,H
);
988 Heps
= _mm_mul_pd(vfeps
,H
);
989 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
990 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
991 velec
= _mm_mul_pd(qq11
,VV
);
992 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
993 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
995 /* Update potential sum for this i atom from the interaction with this j atom. */
996 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
997 velecsum
= _mm_add_pd(velecsum
,velec
);
1001 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1003 /* Calculate temporary vectorial force */
1004 tx
= _mm_mul_pd(fscal
,dx11
);
1005 ty
= _mm_mul_pd(fscal
,dy11
);
1006 tz
= _mm_mul_pd(fscal
,dz11
);
1008 /* Update vectorial force */
1009 fix1
= _mm_add_pd(fix1
,tx
);
1010 fiy1
= _mm_add_pd(fiy1
,ty
);
1011 fiz1
= _mm_add_pd(fiz1
,tz
);
1013 fjx1
= _mm_add_pd(fjx1
,tx
);
1014 fjy1
= _mm_add_pd(fjy1
,ty
);
1015 fjz1
= _mm_add_pd(fjz1
,tz
);
1017 /**************************
1018 * CALCULATE INTERACTIONS *
1019 **************************/
1021 r12
= _mm_mul_pd(rsq12
,rinv12
);
1023 /* Calculate table index by multiplying r with table scale and truncate to integer */
1024 rt
= _mm_mul_pd(r12
,vftabscale
);
1025 vfitab
= _mm_cvttpd_epi32(rt
);
1026 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1027 vfitab
= _mm_slli_epi32(vfitab
,2);
1029 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1030 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1031 F
= _mm_setzero_pd();
1032 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1033 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1034 H
= _mm_setzero_pd();
1035 GMX_MM_TRANSPOSE2_PD(G
,H
);
1036 Heps
= _mm_mul_pd(vfeps
,H
);
1037 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1038 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1039 velec
= _mm_mul_pd(qq12
,VV
);
1040 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1041 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1043 /* Update potential sum for this i atom from the interaction with this j atom. */
1044 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1045 velecsum
= _mm_add_pd(velecsum
,velec
);
1049 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1051 /* Calculate temporary vectorial force */
1052 tx
= _mm_mul_pd(fscal
,dx12
);
1053 ty
= _mm_mul_pd(fscal
,dy12
);
1054 tz
= _mm_mul_pd(fscal
,dz12
);
1056 /* Update vectorial force */
1057 fix1
= _mm_add_pd(fix1
,tx
);
1058 fiy1
= _mm_add_pd(fiy1
,ty
);
1059 fiz1
= _mm_add_pd(fiz1
,tz
);
1061 fjx2
= _mm_add_pd(fjx2
,tx
);
1062 fjy2
= _mm_add_pd(fjy2
,ty
);
1063 fjz2
= _mm_add_pd(fjz2
,tz
);
1065 /**************************
1066 * CALCULATE INTERACTIONS *
1067 **************************/
1069 r20
= _mm_mul_pd(rsq20
,rinv20
);
1071 /* Calculate table index by multiplying r with table scale and truncate to integer */
1072 rt
= _mm_mul_pd(r20
,vftabscale
);
1073 vfitab
= _mm_cvttpd_epi32(rt
);
1074 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1075 vfitab
= _mm_slli_epi32(vfitab
,2);
1077 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1078 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1079 F
= _mm_setzero_pd();
1080 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1081 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1082 H
= _mm_setzero_pd();
1083 GMX_MM_TRANSPOSE2_PD(G
,H
);
1084 Heps
= _mm_mul_pd(vfeps
,H
);
1085 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1086 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1087 velec
= _mm_mul_pd(qq20
,VV
);
1088 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1089 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1093 velecsum
= _mm_add_pd(velecsum
,velec
);
1097 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1099 /* Calculate temporary vectorial force */
1100 tx
= _mm_mul_pd(fscal
,dx20
);
1101 ty
= _mm_mul_pd(fscal
,dy20
);
1102 tz
= _mm_mul_pd(fscal
,dz20
);
1104 /* Update vectorial force */
1105 fix2
= _mm_add_pd(fix2
,tx
);
1106 fiy2
= _mm_add_pd(fiy2
,ty
);
1107 fiz2
= _mm_add_pd(fiz2
,tz
);
1109 fjx0
= _mm_add_pd(fjx0
,tx
);
1110 fjy0
= _mm_add_pd(fjy0
,ty
);
1111 fjz0
= _mm_add_pd(fjz0
,tz
);
1113 /**************************
1114 * CALCULATE INTERACTIONS *
1115 **************************/
1117 r21
= _mm_mul_pd(rsq21
,rinv21
);
1119 /* Calculate table index by multiplying r with table scale and truncate to integer */
1120 rt
= _mm_mul_pd(r21
,vftabscale
);
1121 vfitab
= _mm_cvttpd_epi32(rt
);
1122 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1123 vfitab
= _mm_slli_epi32(vfitab
,2);
1125 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1126 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1127 F
= _mm_setzero_pd();
1128 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1129 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1130 H
= _mm_setzero_pd();
1131 GMX_MM_TRANSPOSE2_PD(G
,H
);
1132 Heps
= _mm_mul_pd(vfeps
,H
);
1133 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1134 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1135 velec
= _mm_mul_pd(qq21
,VV
);
1136 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1137 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1139 /* Update potential sum for this i atom from the interaction with this j atom. */
1140 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1141 velecsum
= _mm_add_pd(velecsum
,velec
);
1145 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1147 /* Calculate temporary vectorial force */
1148 tx
= _mm_mul_pd(fscal
,dx21
);
1149 ty
= _mm_mul_pd(fscal
,dy21
);
1150 tz
= _mm_mul_pd(fscal
,dz21
);
1152 /* Update vectorial force */
1153 fix2
= _mm_add_pd(fix2
,tx
);
1154 fiy2
= _mm_add_pd(fiy2
,ty
);
1155 fiz2
= _mm_add_pd(fiz2
,tz
);
1157 fjx1
= _mm_add_pd(fjx1
,tx
);
1158 fjy1
= _mm_add_pd(fjy1
,ty
);
1159 fjz1
= _mm_add_pd(fjz1
,tz
);
1161 /**************************
1162 * CALCULATE INTERACTIONS *
1163 **************************/
1165 r22
= _mm_mul_pd(rsq22
,rinv22
);
1167 /* Calculate table index by multiplying r with table scale and truncate to integer */
1168 rt
= _mm_mul_pd(r22
,vftabscale
);
1169 vfitab
= _mm_cvttpd_epi32(rt
);
1170 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1171 vfitab
= _mm_slli_epi32(vfitab
,2);
1173 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1174 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1175 F
= _mm_setzero_pd();
1176 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1177 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1178 H
= _mm_setzero_pd();
1179 GMX_MM_TRANSPOSE2_PD(G
,H
);
1180 Heps
= _mm_mul_pd(vfeps
,H
);
1181 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1182 VV
= _mm_add_pd(Y
,_mm_mul_pd(vfeps
,Fp
));
1183 velec
= _mm_mul_pd(qq22
,VV
);
1184 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1185 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1187 /* Update potential sum for this i atom from the interaction with this j atom. */
1188 velec
= _mm_unpacklo_pd(velec
,_mm_setzero_pd());
1189 velecsum
= _mm_add_pd(velecsum
,velec
);
1193 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1195 /* Calculate temporary vectorial force */
1196 tx
= _mm_mul_pd(fscal
,dx22
);
1197 ty
= _mm_mul_pd(fscal
,dy22
);
1198 tz
= _mm_mul_pd(fscal
,dz22
);
1200 /* Update vectorial force */
1201 fix2
= _mm_add_pd(fix2
,tx
);
1202 fiy2
= _mm_add_pd(fiy2
,ty
);
1203 fiz2
= _mm_add_pd(fiz2
,tz
);
1205 fjx2
= _mm_add_pd(fjx2
,tx
);
1206 fjy2
= _mm_add_pd(fjy2
,ty
);
1207 fjz2
= _mm_add_pd(fjz2
,tz
);
1209 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1211 /* Inner loop uses 400 flops */
1214 /* End of innermost loop */
1216 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
1217 f
+i_coord_offset
,fshift
+i_shift_offset
);
1220 /* Update potential energies */
1221 gmx_mm_update_1pot_pd(velecsum
,kernel_data
->energygrp_elec
+ggid
);
1222 gmx_mm_update_1pot_pd(vvdwsum
,kernel_data
->energygrp_vdw
+ggid
);
1224 /* Increment number of inner iterations */
1225 inneriter
+= j_index_end
- j_index_start
;
1227 /* Outer loop uses 20 flops */
1230 /* Increment number of outer iterations */
1233 /* Update outer/inner flops */
1235 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_VF
,outeriter
*20 + inneriter
*400);
1238 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1239 * Electrostatics interaction: CubicSplineTable
1240 * VdW interaction: LennardJones
1241 * Geometry: Water3-Water3
1242 * Calculate force/pot: Force
1245 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_double
1246 (t_nblist
* gmx_restrict nlist
,
1247 rvec
* gmx_restrict xx
,
1248 rvec
* gmx_restrict ff
,
1249 t_forcerec
* gmx_restrict fr
,
1250 t_mdatoms
* gmx_restrict mdatoms
,
1251 nb_kernel_data_t gmx_unused
* gmx_restrict kernel_data
,
1252 t_nrnb
* gmx_restrict nrnb
)
1254 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1255 * just 0 for non-waters.
1256 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1257 * jnr indices corresponding to data put in the two positions in the SIMD register.
1259 int i_shift_offset
,i_coord_offset
,outeriter
,inneriter
;
1260 int j_index_start
,j_index_end
,jidx
,nri
,inr
,ggid
,iidx
;
1262 int j_coord_offsetA
,j_coord_offsetB
;
1263 int *iinr
,*jindex
,*jjnr
,*shiftidx
,*gid
;
1264 real rcutoff_scalar
;
1265 real
*shiftvec
,*fshift
,*x
,*f
;
1266 __m128d tx
,ty
,tz
,fscal
,rcutoff
,rcutoff2
,jidxall
;
1268 __m128d ix0
,iy0
,iz0
,fix0
,fiy0
,fiz0
,iq0
,isai0
;
1270 __m128d ix1
,iy1
,iz1
,fix1
,fiy1
,fiz1
,iq1
,isai1
;
1272 __m128d ix2
,iy2
,iz2
,fix2
,fiy2
,fiz2
,iq2
,isai2
;
1273 int vdwjidx0A
,vdwjidx0B
;
1274 __m128d jx0
,jy0
,jz0
,fjx0
,fjy0
,fjz0
,jq0
,isaj0
;
1275 int vdwjidx1A
,vdwjidx1B
;
1276 __m128d jx1
,jy1
,jz1
,fjx1
,fjy1
,fjz1
,jq1
,isaj1
;
1277 int vdwjidx2A
,vdwjidx2B
;
1278 __m128d jx2
,jy2
,jz2
,fjx2
,fjy2
,fjz2
,jq2
,isaj2
;
1279 __m128d dx00
,dy00
,dz00
,rsq00
,rinv00
,rinvsq00
,r00
,qq00
,c6_00
,c12_00
;
1280 __m128d dx01
,dy01
,dz01
,rsq01
,rinv01
,rinvsq01
,r01
,qq01
,c6_01
,c12_01
;
1281 __m128d dx02
,dy02
,dz02
,rsq02
,rinv02
,rinvsq02
,r02
,qq02
,c6_02
,c12_02
;
1282 __m128d dx10
,dy10
,dz10
,rsq10
,rinv10
,rinvsq10
,r10
,qq10
,c6_10
,c12_10
;
1283 __m128d dx11
,dy11
,dz11
,rsq11
,rinv11
,rinvsq11
,r11
,qq11
,c6_11
,c12_11
;
1284 __m128d dx12
,dy12
,dz12
,rsq12
,rinv12
,rinvsq12
,r12
,qq12
,c6_12
,c12_12
;
1285 __m128d dx20
,dy20
,dz20
,rsq20
,rinv20
,rinvsq20
,r20
,qq20
,c6_20
,c12_20
;
1286 __m128d dx21
,dy21
,dz21
,rsq21
,rinv21
,rinvsq21
,r21
,qq21
,c6_21
,c12_21
;
1287 __m128d dx22
,dy22
,dz22
,rsq22
,rinv22
,rinvsq22
,r22
,qq22
,c6_22
,c12_22
;
1288 __m128d velec
,felec
,velecsum
,facel
,crf
,krf
,krf2
;
1291 __m128d rinvsix
,rvdw
,vvdw
,vvdw6
,vvdw12
,fvdw
,fvdw6
,fvdw12
,vvdwsum
,sh_vdw_invrcut6
;
1294 __m128d one_sixth
= _mm_set1_pd(1.0/6.0);
1295 __m128d one_twelfth
= _mm_set1_pd(1.0/12.0);
1297 __m128i ifour
= _mm_set1_epi32(4);
1298 __m128d rt
,vfeps
,vftabscale
,Y
,F
,G
,H
,Heps
,Fp
,VV
,FF
;
1300 __m128d dummy_mask
,cutoff_mask
;
1301 __m128d signbit
= gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1302 __m128d one
= _mm_set1_pd(1.0);
1303 __m128d two
= _mm_set1_pd(2.0);
1309 jindex
= nlist
->jindex
;
1311 shiftidx
= nlist
->shift
;
1313 shiftvec
= fr
->shift_vec
[0];
1314 fshift
= fr
->fshift
[0];
1315 facel
= _mm_set1_pd(fr
->epsfac
);
1316 charge
= mdatoms
->chargeA
;
1317 nvdwtype
= fr
->ntype
;
1318 vdwparam
= fr
->nbfp
;
1319 vdwtype
= mdatoms
->typeA
;
1321 vftab
= kernel_data
->table_elec
->data
;
1322 vftabscale
= _mm_set1_pd(kernel_data
->table_elec
->scale
);
1324 /* Setup water-specific parameters */
1325 inr
= nlist
->iinr
[0];
1326 iq0
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+0]));
1327 iq1
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+1]));
1328 iq2
= _mm_mul_pd(facel
,_mm_set1_pd(charge
[inr
+2]));
1329 vdwioffset0
= 2*nvdwtype
*vdwtype
[inr
+0];
1331 jq0
= _mm_set1_pd(charge
[inr
+0]);
1332 jq1
= _mm_set1_pd(charge
[inr
+1]);
1333 jq2
= _mm_set1_pd(charge
[inr
+2]);
1334 vdwjidx0A
= 2*vdwtype
[inr
+0];
1335 qq00
= _mm_mul_pd(iq0
,jq0
);
1336 c6_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
]);
1337 c12_00
= _mm_set1_pd(vdwparam
[vdwioffset0
+vdwjidx0A
+1]);
1338 qq01
= _mm_mul_pd(iq0
,jq1
);
1339 qq02
= _mm_mul_pd(iq0
,jq2
);
1340 qq10
= _mm_mul_pd(iq1
,jq0
);
1341 qq11
= _mm_mul_pd(iq1
,jq1
);
1342 qq12
= _mm_mul_pd(iq1
,jq2
);
1343 qq20
= _mm_mul_pd(iq2
,jq0
);
1344 qq21
= _mm_mul_pd(iq2
,jq1
);
1345 qq22
= _mm_mul_pd(iq2
,jq2
);
1347 /* Avoid stupid compiler warnings */
1349 j_coord_offsetA
= 0;
1350 j_coord_offsetB
= 0;
1355 /* Start outer loop over neighborlists */
1356 for(iidx
=0; iidx
<nri
; iidx
++)
1358 /* Load shift vector for this list */
1359 i_shift_offset
= DIM
*shiftidx
[iidx
];
1361 /* Load limits for loop over neighbors */
1362 j_index_start
= jindex
[iidx
];
1363 j_index_end
= jindex
[iidx
+1];
1365 /* Get outer coordinate index */
1367 i_coord_offset
= DIM
*inr
;
1369 /* Load i particle coords and add shift vector */
1370 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec
+i_shift_offset
,x
+i_coord_offset
,
1371 &ix0
,&iy0
,&iz0
,&ix1
,&iy1
,&iz1
,&ix2
,&iy2
,&iz2
);
1373 fix0
= _mm_setzero_pd();
1374 fiy0
= _mm_setzero_pd();
1375 fiz0
= _mm_setzero_pd();
1376 fix1
= _mm_setzero_pd();
1377 fiy1
= _mm_setzero_pd();
1378 fiz1
= _mm_setzero_pd();
1379 fix2
= _mm_setzero_pd();
1380 fiy2
= _mm_setzero_pd();
1381 fiz2
= _mm_setzero_pd();
1383 /* Start inner kernel loop */
1384 for(jidx
=j_index_start
; jidx
<j_index_end
-1; jidx
+=2)
1387 /* Get j neighbor index, and coordinate index */
1389 jnrB
= jjnr
[jidx
+1];
1390 j_coord_offsetA
= DIM
*jnrA
;
1391 j_coord_offsetB
= DIM
*jnrB
;
1393 /* load j atom coordinates */
1394 gmx_mm_load_3rvec_2ptr_swizzle_pd(x
+j_coord_offsetA
,x
+j_coord_offsetB
,
1395 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1397 /* Calculate displacement vector */
1398 dx00
= _mm_sub_pd(ix0
,jx0
);
1399 dy00
= _mm_sub_pd(iy0
,jy0
);
1400 dz00
= _mm_sub_pd(iz0
,jz0
);
1401 dx01
= _mm_sub_pd(ix0
,jx1
);
1402 dy01
= _mm_sub_pd(iy0
,jy1
);
1403 dz01
= _mm_sub_pd(iz0
,jz1
);
1404 dx02
= _mm_sub_pd(ix0
,jx2
);
1405 dy02
= _mm_sub_pd(iy0
,jy2
);
1406 dz02
= _mm_sub_pd(iz0
,jz2
);
1407 dx10
= _mm_sub_pd(ix1
,jx0
);
1408 dy10
= _mm_sub_pd(iy1
,jy0
);
1409 dz10
= _mm_sub_pd(iz1
,jz0
);
1410 dx11
= _mm_sub_pd(ix1
,jx1
);
1411 dy11
= _mm_sub_pd(iy1
,jy1
);
1412 dz11
= _mm_sub_pd(iz1
,jz1
);
1413 dx12
= _mm_sub_pd(ix1
,jx2
);
1414 dy12
= _mm_sub_pd(iy1
,jy2
);
1415 dz12
= _mm_sub_pd(iz1
,jz2
);
1416 dx20
= _mm_sub_pd(ix2
,jx0
);
1417 dy20
= _mm_sub_pd(iy2
,jy0
);
1418 dz20
= _mm_sub_pd(iz2
,jz0
);
1419 dx21
= _mm_sub_pd(ix2
,jx1
);
1420 dy21
= _mm_sub_pd(iy2
,jy1
);
1421 dz21
= _mm_sub_pd(iz2
,jz1
);
1422 dx22
= _mm_sub_pd(ix2
,jx2
);
1423 dy22
= _mm_sub_pd(iy2
,jy2
);
1424 dz22
= _mm_sub_pd(iz2
,jz2
);
1426 /* Calculate squared distance and things based on it */
1427 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1428 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1429 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1430 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1431 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1432 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1433 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1434 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1435 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1437 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1438 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1439 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1440 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1441 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1442 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1443 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1444 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1445 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1447 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1449 fjx0
= _mm_setzero_pd();
1450 fjy0
= _mm_setzero_pd();
1451 fjz0
= _mm_setzero_pd();
1452 fjx1
= _mm_setzero_pd();
1453 fjy1
= _mm_setzero_pd();
1454 fjz1
= _mm_setzero_pd();
1455 fjx2
= _mm_setzero_pd();
1456 fjy2
= _mm_setzero_pd();
1457 fjz2
= _mm_setzero_pd();
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 r00
= _mm_mul_pd(rsq00
,rinv00
);
1465 /* Calculate table index by multiplying r with table scale and truncate to integer */
1466 rt
= _mm_mul_pd(r00
,vftabscale
);
1467 vfitab
= _mm_cvttpd_epi32(rt
);
1468 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1469 vfitab
= _mm_slli_epi32(vfitab
,2);
1471 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1472 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1473 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1474 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1475 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1476 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1477 GMX_MM_TRANSPOSE2_PD(G
,H
);
1478 Heps
= _mm_mul_pd(vfeps
,H
);
1479 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1480 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1481 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1483 /* LENNARD-JONES DISPERSION/REPULSION */
1485 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1486 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1488 fscal
= _mm_add_pd(felec
,fvdw
);
1490 /* Calculate temporary vectorial force */
1491 tx
= _mm_mul_pd(fscal
,dx00
);
1492 ty
= _mm_mul_pd(fscal
,dy00
);
1493 tz
= _mm_mul_pd(fscal
,dz00
);
1495 /* Update vectorial force */
1496 fix0
= _mm_add_pd(fix0
,tx
);
1497 fiy0
= _mm_add_pd(fiy0
,ty
);
1498 fiz0
= _mm_add_pd(fiz0
,tz
);
1500 fjx0
= _mm_add_pd(fjx0
,tx
);
1501 fjy0
= _mm_add_pd(fjy0
,ty
);
1502 fjz0
= _mm_add_pd(fjz0
,tz
);
1504 /**************************
1505 * CALCULATE INTERACTIONS *
1506 **************************/
1508 r01
= _mm_mul_pd(rsq01
,rinv01
);
1510 /* Calculate table index by multiplying r with table scale and truncate to integer */
1511 rt
= _mm_mul_pd(r01
,vftabscale
);
1512 vfitab
= _mm_cvttpd_epi32(rt
);
1513 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1514 vfitab
= _mm_slli_epi32(vfitab
,2);
1516 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1517 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1518 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1519 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1520 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1521 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1522 GMX_MM_TRANSPOSE2_PD(G
,H
);
1523 Heps
= _mm_mul_pd(vfeps
,H
);
1524 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1525 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1526 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
1530 /* Calculate temporary vectorial force */
1531 tx
= _mm_mul_pd(fscal
,dx01
);
1532 ty
= _mm_mul_pd(fscal
,dy01
);
1533 tz
= _mm_mul_pd(fscal
,dz01
);
1535 /* Update vectorial force */
1536 fix0
= _mm_add_pd(fix0
,tx
);
1537 fiy0
= _mm_add_pd(fiy0
,ty
);
1538 fiz0
= _mm_add_pd(fiz0
,tz
);
1540 fjx1
= _mm_add_pd(fjx1
,tx
);
1541 fjy1
= _mm_add_pd(fjy1
,ty
);
1542 fjz1
= _mm_add_pd(fjz1
,tz
);
1544 /**************************
1545 * CALCULATE INTERACTIONS *
1546 **************************/
1548 r02
= _mm_mul_pd(rsq02
,rinv02
);
1550 /* Calculate table index by multiplying r with table scale and truncate to integer */
1551 rt
= _mm_mul_pd(r02
,vftabscale
);
1552 vfitab
= _mm_cvttpd_epi32(rt
);
1553 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1554 vfitab
= _mm_slli_epi32(vfitab
,2);
1556 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1557 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1558 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1559 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1560 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1561 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1562 GMX_MM_TRANSPOSE2_PD(G
,H
);
1563 Heps
= _mm_mul_pd(vfeps
,H
);
1564 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1565 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1566 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
1570 /* Calculate temporary vectorial force */
1571 tx
= _mm_mul_pd(fscal
,dx02
);
1572 ty
= _mm_mul_pd(fscal
,dy02
);
1573 tz
= _mm_mul_pd(fscal
,dz02
);
1575 /* Update vectorial force */
1576 fix0
= _mm_add_pd(fix0
,tx
);
1577 fiy0
= _mm_add_pd(fiy0
,ty
);
1578 fiz0
= _mm_add_pd(fiz0
,tz
);
1580 fjx2
= _mm_add_pd(fjx2
,tx
);
1581 fjy2
= _mm_add_pd(fjy2
,ty
);
1582 fjz2
= _mm_add_pd(fjz2
,tz
);
1584 /**************************
1585 * CALCULATE INTERACTIONS *
1586 **************************/
1588 r10
= _mm_mul_pd(rsq10
,rinv10
);
1590 /* Calculate table index by multiplying r with table scale and truncate to integer */
1591 rt
= _mm_mul_pd(r10
,vftabscale
);
1592 vfitab
= _mm_cvttpd_epi32(rt
);
1593 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1594 vfitab
= _mm_slli_epi32(vfitab
,2);
1596 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1597 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1598 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1599 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1600 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1601 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1602 GMX_MM_TRANSPOSE2_PD(G
,H
);
1603 Heps
= _mm_mul_pd(vfeps
,H
);
1604 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1605 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1606 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
1610 /* Calculate temporary vectorial force */
1611 tx
= _mm_mul_pd(fscal
,dx10
);
1612 ty
= _mm_mul_pd(fscal
,dy10
);
1613 tz
= _mm_mul_pd(fscal
,dz10
);
1615 /* Update vectorial force */
1616 fix1
= _mm_add_pd(fix1
,tx
);
1617 fiy1
= _mm_add_pd(fiy1
,ty
);
1618 fiz1
= _mm_add_pd(fiz1
,tz
);
1620 fjx0
= _mm_add_pd(fjx0
,tx
);
1621 fjy0
= _mm_add_pd(fjy0
,ty
);
1622 fjz0
= _mm_add_pd(fjz0
,tz
);
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1628 r11
= _mm_mul_pd(rsq11
,rinv11
);
1630 /* Calculate table index by multiplying r with table scale and truncate to integer */
1631 rt
= _mm_mul_pd(r11
,vftabscale
);
1632 vfitab
= _mm_cvttpd_epi32(rt
);
1633 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1634 vfitab
= _mm_slli_epi32(vfitab
,2);
1636 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1637 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1638 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1639 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1640 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1641 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1642 GMX_MM_TRANSPOSE2_PD(G
,H
);
1643 Heps
= _mm_mul_pd(vfeps
,H
);
1644 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1645 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1646 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
1650 /* Calculate temporary vectorial force */
1651 tx
= _mm_mul_pd(fscal
,dx11
);
1652 ty
= _mm_mul_pd(fscal
,dy11
);
1653 tz
= _mm_mul_pd(fscal
,dz11
);
1655 /* Update vectorial force */
1656 fix1
= _mm_add_pd(fix1
,tx
);
1657 fiy1
= _mm_add_pd(fiy1
,ty
);
1658 fiz1
= _mm_add_pd(fiz1
,tz
);
1660 fjx1
= _mm_add_pd(fjx1
,tx
);
1661 fjy1
= _mm_add_pd(fjy1
,ty
);
1662 fjz1
= _mm_add_pd(fjz1
,tz
);
1664 /**************************
1665 * CALCULATE INTERACTIONS *
1666 **************************/
1668 r12
= _mm_mul_pd(rsq12
,rinv12
);
1670 /* Calculate table index by multiplying r with table scale and truncate to integer */
1671 rt
= _mm_mul_pd(r12
,vftabscale
);
1672 vfitab
= _mm_cvttpd_epi32(rt
);
1673 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1674 vfitab
= _mm_slli_epi32(vfitab
,2);
1676 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1677 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1678 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1679 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1680 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1681 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1682 GMX_MM_TRANSPOSE2_PD(G
,H
);
1683 Heps
= _mm_mul_pd(vfeps
,H
);
1684 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1685 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1686 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
1690 /* Calculate temporary vectorial force */
1691 tx
= _mm_mul_pd(fscal
,dx12
);
1692 ty
= _mm_mul_pd(fscal
,dy12
);
1693 tz
= _mm_mul_pd(fscal
,dz12
);
1695 /* Update vectorial force */
1696 fix1
= _mm_add_pd(fix1
,tx
);
1697 fiy1
= _mm_add_pd(fiy1
,ty
);
1698 fiz1
= _mm_add_pd(fiz1
,tz
);
1700 fjx2
= _mm_add_pd(fjx2
,tx
);
1701 fjy2
= _mm_add_pd(fjy2
,ty
);
1702 fjz2
= _mm_add_pd(fjz2
,tz
);
1704 /**************************
1705 * CALCULATE INTERACTIONS *
1706 **************************/
1708 r20
= _mm_mul_pd(rsq20
,rinv20
);
1710 /* Calculate table index by multiplying r with table scale and truncate to integer */
1711 rt
= _mm_mul_pd(r20
,vftabscale
);
1712 vfitab
= _mm_cvttpd_epi32(rt
);
1713 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1714 vfitab
= _mm_slli_epi32(vfitab
,2);
1716 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1717 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1718 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1719 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1720 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1721 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1722 GMX_MM_TRANSPOSE2_PD(G
,H
);
1723 Heps
= _mm_mul_pd(vfeps
,H
);
1724 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1725 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1726 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
1730 /* Calculate temporary vectorial force */
1731 tx
= _mm_mul_pd(fscal
,dx20
);
1732 ty
= _mm_mul_pd(fscal
,dy20
);
1733 tz
= _mm_mul_pd(fscal
,dz20
);
1735 /* Update vectorial force */
1736 fix2
= _mm_add_pd(fix2
,tx
);
1737 fiy2
= _mm_add_pd(fiy2
,ty
);
1738 fiz2
= _mm_add_pd(fiz2
,tz
);
1740 fjx0
= _mm_add_pd(fjx0
,tx
);
1741 fjy0
= _mm_add_pd(fjy0
,ty
);
1742 fjz0
= _mm_add_pd(fjz0
,tz
);
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 r21
= _mm_mul_pd(rsq21
,rinv21
);
1750 /* Calculate table index by multiplying r with table scale and truncate to integer */
1751 rt
= _mm_mul_pd(r21
,vftabscale
);
1752 vfitab
= _mm_cvttpd_epi32(rt
);
1753 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1754 vfitab
= _mm_slli_epi32(vfitab
,2);
1756 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1757 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1758 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1759 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1760 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1761 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1762 GMX_MM_TRANSPOSE2_PD(G
,H
);
1763 Heps
= _mm_mul_pd(vfeps
,H
);
1764 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1765 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1766 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
1770 /* Calculate temporary vectorial force */
1771 tx
= _mm_mul_pd(fscal
,dx21
);
1772 ty
= _mm_mul_pd(fscal
,dy21
);
1773 tz
= _mm_mul_pd(fscal
,dz21
);
1775 /* Update vectorial force */
1776 fix2
= _mm_add_pd(fix2
,tx
);
1777 fiy2
= _mm_add_pd(fiy2
,ty
);
1778 fiz2
= _mm_add_pd(fiz2
,tz
);
1780 fjx1
= _mm_add_pd(fjx1
,tx
);
1781 fjy1
= _mm_add_pd(fjy1
,ty
);
1782 fjz1
= _mm_add_pd(fjz1
,tz
);
1784 /**************************
1785 * CALCULATE INTERACTIONS *
1786 **************************/
1788 r22
= _mm_mul_pd(rsq22
,rinv22
);
1790 /* Calculate table index by multiplying r with table scale and truncate to integer */
1791 rt
= _mm_mul_pd(r22
,vftabscale
);
1792 vfitab
= _mm_cvttpd_epi32(rt
);
1793 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1794 vfitab
= _mm_slli_epi32(vfitab
,2);
1796 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1797 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1798 F
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) );
1799 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1800 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1801 H
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,1) +2);
1802 GMX_MM_TRANSPOSE2_PD(G
,H
);
1803 Heps
= _mm_mul_pd(vfeps
,H
);
1804 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1805 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1806 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
1810 /* Calculate temporary vectorial force */
1811 tx
= _mm_mul_pd(fscal
,dx22
);
1812 ty
= _mm_mul_pd(fscal
,dy22
);
1813 tz
= _mm_mul_pd(fscal
,dz22
);
1815 /* Update vectorial force */
1816 fix2
= _mm_add_pd(fix2
,tx
);
1817 fiy2
= _mm_add_pd(fiy2
,ty
);
1818 fiz2
= _mm_add_pd(fiz2
,tz
);
1820 fjx2
= _mm_add_pd(fjx2
,tx
);
1821 fjy2
= _mm_add_pd(fjy2
,ty
);
1822 fjz2
= _mm_add_pd(fjz2
,tz
);
1824 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f
+j_coord_offsetA
,f
+j_coord_offsetB
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
1826 /* Inner loop uses 359 flops */
1829 if(jidx
<j_index_end
)
1833 j_coord_offsetA
= DIM
*jnrA
;
1835 /* load j atom coordinates */
1836 gmx_mm_load_3rvec_1ptr_swizzle_pd(x
+j_coord_offsetA
,
1837 &jx0
,&jy0
,&jz0
,&jx1
,&jy1
,&jz1
,&jx2
,&jy2
,&jz2
);
1839 /* Calculate displacement vector */
1840 dx00
= _mm_sub_pd(ix0
,jx0
);
1841 dy00
= _mm_sub_pd(iy0
,jy0
);
1842 dz00
= _mm_sub_pd(iz0
,jz0
);
1843 dx01
= _mm_sub_pd(ix0
,jx1
);
1844 dy01
= _mm_sub_pd(iy0
,jy1
);
1845 dz01
= _mm_sub_pd(iz0
,jz1
);
1846 dx02
= _mm_sub_pd(ix0
,jx2
);
1847 dy02
= _mm_sub_pd(iy0
,jy2
);
1848 dz02
= _mm_sub_pd(iz0
,jz2
);
1849 dx10
= _mm_sub_pd(ix1
,jx0
);
1850 dy10
= _mm_sub_pd(iy1
,jy0
);
1851 dz10
= _mm_sub_pd(iz1
,jz0
);
1852 dx11
= _mm_sub_pd(ix1
,jx1
);
1853 dy11
= _mm_sub_pd(iy1
,jy1
);
1854 dz11
= _mm_sub_pd(iz1
,jz1
);
1855 dx12
= _mm_sub_pd(ix1
,jx2
);
1856 dy12
= _mm_sub_pd(iy1
,jy2
);
1857 dz12
= _mm_sub_pd(iz1
,jz2
);
1858 dx20
= _mm_sub_pd(ix2
,jx0
);
1859 dy20
= _mm_sub_pd(iy2
,jy0
);
1860 dz20
= _mm_sub_pd(iz2
,jz0
);
1861 dx21
= _mm_sub_pd(ix2
,jx1
);
1862 dy21
= _mm_sub_pd(iy2
,jy1
);
1863 dz21
= _mm_sub_pd(iz2
,jz1
);
1864 dx22
= _mm_sub_pd(ix2
,jx2
);
1865 dy22
= _mm_sub_pd(iy2
,jy2
);
1866 dz22
= _mm_sub_pd(iz2
,jz2
);
1868 /* Calculate squared distance and things based on it */
1869 rsq00
= gmx_mm_calc_rsq_pd(dx00
,dy00
,dz00
);
1870 rsq01
= gmx_mm_calc_rsq_pd(dx01
,dy01
,dz01
);
1871 rsq02
= gmx_mm_calc_rsq_pd(dx02
,dy02
,dz02
);
1872 rsq10
= gmx_mm_calc_rsq_pd(dx10
,dy10
,dz10
);
1873 rsq11
= gmx_mm_calc_rsq_pd(dx11
,dy11
,dz11
);
1874 rsq12
= gmx_mm_calc_rsq_pd(dx12
,dy12
,dz12
);
1875 rsq20
= gmx_mm_calc_rsq_pd(dx20
,dy20
,dz20
);
1876 rsq21
= gmx_mm_calc_rsq_pd(dx21
,dy21
,dz21
);
1877 rsq22
= gmx_mm_calc_rsq_pd(dx22
,dy22
,dz22
);
1879 rinv00
= gmx_mm_invsqrt_pd(rsq00
);
1880 rinv01
= gmx_mm_invsqrt_pd(rsq01
);
1881 rinv02
= gmx_mm_invsqrt_pd(rsq02
);
1882 rinv10
= gmx_mm_invsqrt_pd(rsq10
);
1883 rinv11
= gmx_mm_invsqrt_pd(rsq11
);
1884 rinv12
= gmx_mm_invsqrt_pd(rsq12
);
1885 rinv20
= gmx_mm_invsqrt_pd(rsq20
);
1886 rinv21
= gmx_mm_invsqrt_pd(rsq21
);
1887 rinv22
= gmx_mm_invsqrt_pd(rsq22
);
1889 rinvsq00
= _mm_mul_pd(rinv00
,rinv00
);
1891 fjx0
= _mm_setzero_pd();
1892 fjy0
= _mm_setzero_pd();
1893 fjz0
= _mm_setzero_pd();
1894 fjx1
= _mm_setzero_pd();
1895 fjy1
= _mm_setzero_pd();
1896 fjz1
= _mm_setzero_pd();
1897 fjx2
= _mm_setzero_pd();
1898 fjy2
= _mm_setzero_pd();
1899 fjz2
= _mm_setzero_pd();
1901 /**************************
1902 * CALCULATE INTERACTIONS *
1903 **************************/
1905 r00
= _mm_mul_pd(rsq00
,rinv00
);
1907 /* Calculate table index by multiplying r with table scale and truncate to integer */
1908 rt
= _mm_mul_pd(r00
,vftabscale
);
1909 vfitab
= _mm_cvttpd_epi32(rt
);
1910 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1911 vfitab
= _mm_slli_epi32(vfitab
,2);
1913 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1914 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1915 F
= _mm_setzero_pd();
1916 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1917 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1918 H
= _mm_setzero_pd();
1919 GMX_MM_TRANSPOSE2_PD(G
,H
);
1920 Heps
= _mm_mul_pd(vfeps
,H
);
1921 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1922 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1923 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq00
,FF
),_mm_mul_pd(vftabscale
,rinv00
)));
1925 /* LENNARD-JONES DISPERSION/REPULSION */
1927 rinvsix
= _mm_mul_pd(_mm_mul_pd(rinvsq00
,rinvsq00
),rinvsq00
);
1928 fvdw
= _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00
,rinvsix
),c6_00
),_mm_mul_pd(rinvsix
,rinvsq00
));
1930 fscal
= _mm_add_pd(felec
,fvdw
);
1932 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1934 /* Calculate temporary vectorial force */
1935 tx
= _mm_mul_pd(fscal
,dx00
);
1936 ty
= _mm_mul_pd(fscal
,dy00
);
1937 tz
= _mm_mul_pd(fscal
,dz00
);
1939 /* Update vectorial force */
1940 fix0
= _mm_add_pd(fix0
,tx
);
1941 fiy0
= _mm_add_pd(fiy0
,ty
);
1942 fiz0
= _mm_add_pd(fiz0
,tz
);
1944 fjx0
= _mm_add_pd(fjx0
,tx
);
1945 fjy0
= _mm_add_pd(fjy0
,ty
);
1946 fjz0
= _mm_add_pd(fjz0
,tz
);
1948 /**************************
1949 * CALCULATE INTERACTIONS *
1950 **************************/
1952 r01
= _mm_mul_pd(rsq01
,rinv01
);
1954 /* Calculate table index by multiplying r with table scale and truncate to integer */
1955 rt
= _mm_mul_pd(r01
,vftabscale
);
1956 vfitab
= _mm_cvttpd_epi32(rt
);
1957 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
1958 vfitab
= _mm_slli_epi32(vfitab
,2);
1960 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1961 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
1962 F
= _mm_setzero_pd();
1963 GMX_MM_TRANSPOSE2_PD(Y
,F
);
1964 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
1965 H
= _mm_setzero_pd();
1966 GMX_MM_TRANSPOSE2_PD(G
,H
);
1967 Heps
= _mm_mul_pd(vfeps
,H
);
1968 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
1969 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
1970 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq01
,FF
),_mm_mul_pd(vftabscale
,rinv01
)));
1974 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
1976 /* Calculate temporary vectorial force */
1977 tx
= _mm_mul_pd(fscal
,dx01
);
1978 ty
= _mm_mul_pd(fscal
,dy01
);
1979 tz
= _mm_mul_pd(fscal
,dz01
);
1981 /* Update vectorial force */
1982 fix0
= _mm_add_pd(fix0
,tx
);
1983 fiy0
= _mm_add_pd(fiy0
,ty
);
1984 fiz0
= _mm_add_pd(fiz0
,tz
);
1986 fjx1
= _mm_add_pd(fjx1
,tx
);
1987 fjy1
= _mm_add_pd(fjy1
,ty
);
1988 fjz1
= _mm_add_pd(fjz1
,tz
);
1990 /**************************
1991 * CALCULATE INTERACTIONS *
1992 **************************/
1994 r02
= _mm_mul_pd(rsq02
,rinv02
);
1996 /* Calculate table index by multiplying r with table scale and truncate to integer */
1997 rt
= _mm_mul_pd(r02
,vftabscale
);
1998 vfitab
= _mm_cvttpd_epi32(rt
);
1999 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2000 vfitab
= _mm_slli_epi32(vfitab
,2);
2002 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2003 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2004 F
= _mm_setzero_pd();
2005 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2006 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2007 H
= _mm_setzero_pd();
2008 GMX_MM_TRANSPOSE2_PD(G
,H
);
2009 Heps
= _mm_mul_pd(vfeps
,H
);
2010 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2011 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2012 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq02
,FF
),_mm_mul_pd(vftabscale
,rinv02
)));
2016 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2018 /* Calculate temporary vectorial force */
2019 tx
= _mm_mul_pd(fscal
,dx02
);
2020 ty
= _mm_mul_pd(fscal
,dy02
);
2021 tz
= _mm_mul_pd(fscal
,dz02
);
2023 /* Update vectorial force */
2024 fix0
= _mm_add_pd(fix0
,tx
);
2025 fiy0
= _mm_add_pd(fiy0
,ty
);
2026 fiz0
= _mm_add_pd(fiz0
,tz
);
2028 fjx2
= _mm_add_pd(fjx2
,tx
);
2029 fjy2
= _mm_add_pd(fjy2
,ty
);
2030 fjz2
= _mm_add_pd(fjz2
,tz
);
2032 /**************************
2033 * CALCULATE INTERACTIONS *
2034 **************************/
2036 r10
= _mm_mul_pd(rsq10
,rinv10
);
2038 /* Calculate table index by multiplying r with table scale and truncate to integer */
2039 rt
= _mm_mul_pd(r10
,vftabscale
);
2040 vfitab
= _mm_cvttpd_epi32(rt
);
2041 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2042 vfitab
= _mm_slli_epi32(vfitab
,2);
2044 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2045 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2046 F
= _mm_setzero_pd();
2047 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2048 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2049 H
= _mm_setzero_pd();
2050 GMX_MM_TRANSPOSE2_PD(G
,H
);
2051 Heps
= _mm_mul_pd(vfeps
,H
);
2052 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2053 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2054 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq10
,FF
),_mm_mul_pd(vftabscale
,rinv10
)));
2058 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2060 /* Calculate temporary vectorial force */
2061 tx
= _mm_mul_pd(fscal
,dx10
);
2062 ty
= _mm_mul_pd(fscal
,dy10
);
2063 tz
= _mm_mul_pd(fscal
,dz10
);
2065 /* Update vectorial force */
2066 fix1
= _mm_add_pd(fix1
,tx
);
2067 fiy1
= _mm_add_pd(fiy1
,ty
);
2068 fiz1
= _mm_add_pd(fiz1
,tz
);
2070 fjx0
= _mm_add_pd(fjx0
,tx
);
2071 fjy0
= _mm_add_pd(fjy0
,ty
);
2072 fjz0
= _mm_add_pd(fjz0
,tz
);
2074 /**************************
2075 * CALCULATE INTERACTIONS *
2076 **************************/
2078 r11
= _mm_mul_pd(rsq11
,rinv11
);
2080 /* Calculate table index by multiplying r with table scale and truncate to integer */
2081 rt
= _mm_mul_pd(r11
,vftabscale
);
2082 vfitab
= _mm_cvttpd_epi32(rt
);
2083 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2084 vfitab
= _mm_slli_epi32(vfitab
,2);
2086 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2087 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2088 F
= _mm_setzero_pd();
2089 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2090 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2091 H
= _mm_setzero_pd();
2092 GMX_MM_TRANSPOSE2_PD(G
,H
);
2093 Heps
= _mm_mul_pd(vfeps
,H
);
2094 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2095 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2096 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq11
,FF
),_mm_mul_pd(vftabscale
,rinv11
)));
2100 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2102 /* Calculate temporary vectorial force */
2103 tx
= _mm_mul_pd(fscal
,dx11
);
2104 ty
= _mm_mul_pd(fscal
,dy11
);
2105 tz
= _mm_mul_pd(fscal
,dz11
);
2107 /* Update vectorial force */
2108 fix1
= _mm_add_pd(fix1
,tx
);
2109 fiy1
= _mm_add_pd(fiy1
,ty
);
2110 fiz1
= _mm_add_pd(fiz1
,tz
);
2112 fjx1
= _mm_add_pd(fjx1
,tx
);
2113 fjy1
= _mm_add_pd(fjy1
,ty
);
2114 fjz1
= _mm_add_pd(fjz1
,tz
);
2116 /**************************
2117 * CALCULATE INTERACTIONS *
2118 **************************/
2120 r12
= _mm_mul_pd(rsq12
,rinv12
);
2122 /* Calculate table index by multiplying r with table scale and truncate to integer */
2123 rt
= _mm_mul_pd(r12
,vftabscale
);
2124 vfitab
= _mm_cvttpd_epi32(rt
);
2125 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2126 vfitab
= _mm_slli_epi32(vfitab
,2);
2128 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2130 F
= _mm_setzero_pd();
2131 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2132 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2133 H
= _mm_setzero_pd();
2134 GMX_MM_TRANSPOSE2_PD(G
,H
);
2135 Heps
= _mm_mul_pd(vfeps
,H
);
2136 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2137 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2138 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq12
,FF
),_mm_mul_pd(vftabscale
,rinv12
)));
2142 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2144 /* Calculate temporary vectorial force */
2145 tx
= _mm_mul_pd(fscal
,dx12
);
2146 ty
= _mm_mul_pd(fscal
,dy12
);
2147 tz
= _mm_mul_pd(fscal
,dz12
);
2149 /* Update vectorial force */
2150 fix1
= _mm_add_pd(fix1
,tx
);
2151 fiy1
= _mm_add_pd(fiy1
,ty
);
2152 fiz1
= _mm_add_pd(fiz1
,tz
);
2154 fjx2
= _mm_add_pd(fjx2
,tx
);
2155 fjy2
= _mm_add_pd(fjy2
,ty
);
2156 fjz2
= _mm_add_pd(fjz2
,tz
);
2158 /**************************
2159 * CALCULATE INTERACTIONS *
2160 **************************/
2162 r20
= _mm_mul_pd(rsq20
,rinv20
);
2164 /* Calculate table index by multiplying r with table scale and truncate to integer */
2165 rt
= _mm_mul_pd(r20
,vftabscale
);
2166 vfitab
= _mm_cvttpd_epi32(rt
);
2167 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2168 vfitab
= _mm_slli_epi32(vfitab
,2);
2170 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2171 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2172 F
= _mm_setzero_pd();
2173 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2174 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2175 H
= _mm_setzero_pd();
2176 GMX_MM_TRANSPOSE2_PD(G
,H
);
2177 Heps
= _mm_mul_pd(vfeps
,H
);
2178 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2179 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2180 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq20
,FF
),_mm_mul_pd(vftabscale
,rinv20
)));
2184 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2186 /* Calculate temporary vectorial force */
2187 tx
= _mm_mul_pd(fscal
,dx20
);
2188 ty
= _mm_mul_pd(fscal
,dy20
);
2189 tz
= _mm_mul_pd(fscal
,dz20
);
2191 /* Update vectorial force */
2192 fix2
= _mm_add_pd(fix2
,tx
);
2193 fiy2
= _mm_add_pd(fiy2
,ty
);
2194 fiz2
= _mm_add_pd(fiz2
,tz
);
2196 fjx0
= _mm_add_pd(fjx0
,tx
);
2197 fjy0
= _mm_add_pd(fjy0
,ty
);
2198 fjz0
= _mm_add_pd(fjz0
,tz
);
2200 /**************************
2201 * CALCULATE INTERACTIONS *
2202 **************************/
2204 r21
= _mm_mul_pd(rsq21
,rinv21
);
2206 /* Calculate table index by multiplying r with table scale and truncate to integer */
2207 rt
= _mm_mul_pd(r21
,vftabscale
);
2208 vfitab
= _mm_cvttpd_epi32(rt
);
2209 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2210 vfitab
= _mm_slli_epi32(vfitab
,2);
2212 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2213 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2214 F
= _mm_setzero_pd();
2215 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2216 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2217 H
= _mm_setzero_pd();
2218 GMX_MM_TRANSPOSE2_PD(G
,H
);
2219 Heps
= _mm_mul_pd(vfeps
,H
);
2220 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2221 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2222 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq21
,FF
),_mm_mul_pd(vftabscale
,rinv21
)));
2226 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2228 /* Calculate temporary vectorial force */
2229 tx
= _mm_mul_pd(fscal
,dx21
);
2230 ty
= _mm_mul_pd(fscal
,dy21
);
2231 tz
= _mm_mul_pd(fscal
,dz21
);
2233 /* Update vectorial force */
2234 fix2
= _mm_add_pd(fix2
,tx
);
2235 fiy2
= _mm_add_pd(fiy2
,ty
);
2236 fiz2
= _mm_add_pd(fiz2
,tz
);
2238 fjx1
= _mm_add_pd(fjx1
,tx
);
2239 fjy1
= _mm_add_pd(fjy1
,ty
);
2240 fjz1
= _mm_add_pd(fjz1
,tz
);
2242 /**************************
2243 * CALCULATE INTERACTIONS *
2244 **************************/
2246 r22
= _mm_mul_pd(rsq22
,rinv22
);
2248 /* Calculate table index by multiplying r with table scale and truncate to integer */
2249 rt
= _mm_mul_pd(r22
,vftabscale
);
2250 vfitab
= _mm_cvttpd_epi32(rt
);
2251 vfeps
= _mm_sub_pd(rt
,_mm_round_pd(rt
, _MM_FROUND_FLOOR
));
2252 vfitab
= _mm_slli_epi32(vfitab
,2);
2254 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2255 Y
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) );
2256 F
= _mm_setzero_pd();
2257 GMX_MM_TRANSPOSE2_PD(Y
,F
);
2258 G
= _mm_load_pd( vftab
+ gmx_mm_extract_epi32(vfitab
,0) +2);
2259 H
= _mm_setzero_pd();
2260 GMX_MM_TRANSPOSE2_PD(G
,H
);
2261 Heps
= _mm_mul_pd(vfeps
,H
);
2262 Fp
= _mm_add_pd(F
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,Heps
)));
2263 FF
= _mm_add_pd(Fp
,_mm_mul_pd(vfeps
,_mm_add_pd(G
,_mm_add_pd(Heps
,Heps
))));
2264 felec
= _mm_xor_pd(signbit
,_mm_mul_pd(_mm_mul_pd(qq22
,FF
),_mm_mul_pd(vftabscale
,rinv22
)));
2268 fscal
= _mm_unpacklo_pd(fscal
,_mm_setzero_pd());
2270 /* Calculate temporary vectorial force */
2271 tx
= _mm_mul_pd(fscal
,dx22
);
2272 ty
= _mm_mul_pd(fscal
,dy22
);
2273 tz
= _mm_mul_pd(fscal
,dz22
);
2275 /* Update vectorial force */
2276 fix2
= _mm_add_pd(fix2
,tx
);
2277 fiy2
= _mm_add_pd(fiy2
,ty
);
2278 fiz2
= _mm_add_pd(fiz2
,tz
);
2280 fjx2
= _mm_add_pd(fjx2
,tx
);
2281 fjy2
= _mm_add_pd(fjy2
,ty
);
2282 fjz2
= _mm_add_pd(fjz2
,tz
);
2284 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f
+j_coord_offsetA
,fjx0
,fjy0
,fjz0
,fjx1
,fjy1
,fjz1
,fjx2
,fjy2
,fjz2
);
2286 /* Inner loop uses 359 flops */
2289 /* End of innermost loop */
2291 gmx_mm_update_iforce_3atom_swizzle_pd(fix0
,fiy0
,fiz0
,fix1
,fiy1
,fiz1
,fix2
,fiy2
,fiz2
,
2292 f
+i_coord_offset
,fshift
+i_shift_offset
);
2294 /* Increment number of inner iterations */
2295 inneriter
+= j_index_end
- j_index_start
;
2297 /* Outer loop uses 18 flops */
2300 /* Increment number of outer iterations */
2303 /* Update outer/inner flops */
2305 inc_nrnb(nrnb
,eNR_NBKERNEL_ELEC_VDW_W3W3_F
,outeriter
*18 + inneriter
*359);